Python pandas.read_parquet() Examples

The following are 30 code examples of pandas.read_parquet(), drawn from open-source projects. The source file, project, and license are noted above each example.
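As a quick orientation before the project examples, here is a minimal sketch of the call itself; the file name data.parquet and the column names are hypothetical, while engine and columns are standard pandas.read_parquet parameters:

import pandas as pd

# Read an entire Parquet file into a DataFrame (hypothetical file name).
df = pd.read_parquet("data.parquet")

# Read only selected columns with an explicit engine.
subset = pd.read_parquet("data.parquet", engine="pyarrow", columns=["a", "b"])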
Example #1
Source File: validator_kfold.py    From mljar-supervised with MIT License
def get_split(self, k):

        train_index_file = os.path.join(
            self._results_path, "folds", f"fold_{k}_train_indices.npy"
        )
        validation_index_file = os.path.join(
            self._results_path, "folds", f"fold_{k}_validation_indices.npy"
        )

        train_index = np.load(train_index_file)
        validation_index = np.load(validation_index_file)

        X = pd.read_parquet(self._X_train_path)
        y = pd.read_parquet(self._y_train_path)
        y = y["target"]

        return (
            {"X": X.loc[train_index], "y": y.loc[train_index]},
            {"X": X.loc[validation_index], "y": y.loc[validation_index]},
        ) 
Example #2
Source File: test_dataframe.py    From sdc with BSD 2-Clause "Simplified" License
def test_sort_parallel(self):
        # create `kde.parquet` file
        ParquetGenerator.gen_kde_pq()

        # TODO: better parallel sort test
        def test_impl():
            df = pd.read_parquet('kde.parquet')
            df['A'] = df.points.astype(np.float64)
            df.sort_values('points', inplace=True)
            res = df.A.values
            return res

        hpat_func = self.jit(locals={'res:return': 'distributed'})(test_impl)

        save_min_samples = sdc.hiframes.sort.MIN_SAMPLES
        try:
            sdc.hiframes.sort.MIN_SAMPLES = 10
            res = hpat_func()
            self.assertTrue((np.diff(res) >= 0).all())
        finally:
            # restore global val
            sdc.hiframes.sort.MIN_SAMPLES = save_min_samples 
Example #3
Source File: parquet.py    From timeserio with MIT License
def make_subgen(self, chunk):
        filename = chunk
        subgen = single_sequence.SequenceForecastBatchGenerator(
            df=pd.read_parquet(filename).reset_index(),
            batch_size=self.batch_size,
            sequence_length=self.sequence_length,
            id_column=self.id_column,
            sequence_columns=self.sequence_columns,
            sequence_prefix=self.sequence_prefix,
            last_step_columns=self.last_step_columns,
            last_step_prefix=self.last_step_prefix,
            forecast_steps_min=self.forecast_steps_min,
            forecast_steps_max=self.forecast_steps_max,
            batch_offset=self.batch_offset,
            batch_offset_period=self.batch_offset_period,
            dt_column=self.dt_column,
            start_time=self.start_time,
        )
        return subgen 
Example #4
Source File: data.py    From lkpy with MIT License
def read_df_detect(path):
    """
    Read a Pandas data frame, auto-detecting the file format based on filename suffix.
    The following file types are supported:

    CSV
        File has suffix ``.csv``, read with :py:func:`pandas.read_csv`.
    Parquet
        File has suffix ``.parquet``, ``.parq``, or ``.pq``, read with
        :py:func:`pandas.read_parquet`.
    """
    import pandas as pd
    if not isinstance(path, pathlib.Path):
        path = pathlib.Path(path)

    if path.suffix == '.csv':
        return pd.read_csv(path)
    elif path.suffix in ('.parquet', '.parq', '.pq'):
        return pd.read_parquet(path) 
Example #5
Source File: io.py    From modin with Apache License 2.0
def read_parquet(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a Modin DataFrame.
           Modin only supports the pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            engine: Modin only supports the pyarrow reader.
                    This argument doesn't do anything for now.
            columns: The columns to read; passed through to pandas.read_parquet.
            kwargs: Passed through to parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """
        ErrorMessage.default_to_pandas("`read_parquet`")
        return cls.from_pandas(pandas.read_parquet(path, engine, columns, **kwargs)) 
Example #6
Source File: parsers.py    From modin with Apache License 2.0
def parse(fname, **kwargs):
        num_splits = kwargs.pop("num_splits", None)
        columns = kwargs.get("columns", None)
        if num_splits is None:
            return pandas.read_parquet(fname, **kwargs)
        kwargs["use_pandas_metadata"] = True
        df = pandas.read_parquet(fname, **kwargs)
        if isinstance(df.index, pandas.RangeIndex):
            idx = len(df.index)
        else:
            idx = df.index
        # Guard before filtering: `columns` may be None, in which case the
        # list comprehension would raise a TypeError.
        if columns is not None:
            columns = [c for c in columns if c not in df.index.names and c in df.columns]
            df = df[columns]
        # Append the length of the index here to build it externally
        return _split_result_for_readers(0, num_splits, df) + [idx, df.dtypes] 
Example #7
Source File: test_dataframe_spark_io.py    From koalas with Apache License 2.0
def test_parquet_write(self):
        with self.temp_dir() as tmp:
            pdf = self.test_pdf
            expected = ks.DataFrame(pdf)

            # Write out partitioned by one column
            expected.to_parquet(tmp, mode="overwrite", partition_cols="i32")
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ks.read_parquet(tmp)[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # Write out partitioned by two columns
            expected.to_parquet(tmp, mode="overwrite", partition_cols=["i32", "bhello"])
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ks.read_parquet(tmp)[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            ) 
Example #8
Source File: test_config_driven_df.py    From dagster with Apache License 2.0
def test_dataframe_parquet_materialization():
    check_parquet_support()

    @solid(output_defs=[OutputDefinition(DataFrame)])
    def return_df(_context):
        return pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})

    @pipeline
    def return_df_pipeline():
        return_df()

    with get_temp_file_name() as filename:
        result = execute_pipeline(
            return_df_pipeline,
            {'solids': {'return_df': {'outputs': [{'result': {'parquet': {'path': filename}}}]}}},
        )

        assert result.success

        df = pd.read_parquet(filename)
        assert df.to_dict('list') == {'num1': [1, 3], 'num2': [2, 4]} 
Example #9
Source File: automl.py    From mljar-supervised with MIT License
def _load_data_variables(self, X_train):
        if X_train.shape[1] == 0:
            X = pd.read_parquet(self._X_train_path)
            for c in X.columns:
                X_train.insert(loc=X_train.shape[1], column=c, value=X[c])

        os.remove(self._X_train_path)
        os.remove(self._y_train_path) 
Example #10
Source File: validator_split.py    From mljar-supervised with MIT License
def get_split(self, k=0):

        X = pd.read_parquet(self._X_train_path)
        y = pd.read_parquet(self._y_train_path)
        y = y["target"]

        stratify = None
        if self.stratify:
            stratify = y
        if not self.shuffle:
            stratify = None

        X_train, X_validation, y_train, y_validation = train_test_split(
            X,
            y,
            train_size=self.train_ratio,
            test_size=1.0 - self.train_ratio,
            shuffle=self.shuffle,
            stratify=stratify,
            random_state=self.random_seed,
        )
        return {"X": X_train, "y": y_train}, {"X": X_validation, "y": y_validation} 
Example #11
Source File: test_schema.py    From flytekit with Apache License 2.0
def test_datetime_coercion_explicitly():
    """
    Sanity check that we're using a version of pyarrow that allows us to
    truncate timestamps
    """
    dt = _datetime.datetime(day=1, month=1, year=2017, hour=1, minute=1, second=1, microsecond=1)
    values = [(dt,)]
    df = _pd.DataFrame.from_records(values, columns=['testname'])
    assert df['testname'][0] == dt

    with _utils.AutoDeletingTempDir('test') as tmpdir:
        tmpfile = tmpdir.get_named_tempfile('repro.parquet')
        df.to_parquet(tmpfile, coerce_timestamps='ms', allow_truncated_timestamps=True)
        df2 = _pd.read_parquet(tmpfile)

    dt2 = _datetime.datetime(day=1, month=1, year=2017, hour=1, minute=1, second=1)
    assert df2['testname'][0] == dt2 
Example #12
Source File: invoker.py    From anomalydetector with MIT License
def read_as_dataframe(input_path: str):
    if os.path.isfile(input_path):
        if input_path.endswith(".csv"):
            return pd.read_csv(input_path)
        elif input_path.endswith(".parquet"):
            return pd.read_parquet(input_path)
    else:
        dir_path = pathlib.Path(input_path)

        csv_files = list(dir_path.glob("**/*.csv"))
        if csv_files:
            df_from_csv_files = (pd.read_csv(f) for f in csv_files)
            return pd.concat(df_from_csv_files, ignore_index=True)

        parquet_files = list(dir_path.glob("**/*.parquet"))
        if parquet_files:
            df_from_parquet_files = (pd.read_parquet(f) for f in parquet_files)
            return pd.concat(df_from_parquet_files, ignore_index=True)

    raise ValueError(f"Failed to read path: {input_path}") 
Example #13
Source File: file_type.py    From gordo with GNU Affero General Public License v3.0
def read_df(self, f: BinaryIO) -> pd.DataFrame:
        columns = self.time_series_columns.columns
        datetime_column = self.time_series_columns.datetime_column
        df = pd.read_parquet(f, engine="pyarrow", columns=columns).set_index(
            datetime_column
        )
        df.index = pd.to_datetime(df.index, utc=True)
        return df 
Example #14
Source File: test_io.py    From modin with Apache License 2.0
def test_from_parquet_partition(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, directory=True)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df) 
Example #15
Source File: message_collector.py    From aki with GNU Affero General Public License v3.0
def load(dt_: datetime) -> Optional[DataFrame]:
    filename = make_filename(dt_)
    if path.isfile(filename):
        return read_parquet(filename) 
Example #16
Source File: test_io.py    From modin with Apache License 2.0
def test_from_parquet_hdfs():
    path = "modin/pandas/test/data/hdfs.parquet"
    pandas_df = pandas.read_parquet(path)
    modin_df = pd.read_parquet(path)
    df_equals(modin_df, pandas_df) 
Example #17
Source File: core.py    From tensorqtl with BSD 3-Clause "New" or "Revised" License
def read_phenotype_bed(phenotype_bed):
    """Load phenotype BED file as phenotype and TSS DataFrames"""
    if phenotype_bed.endswith('.bed.gz'):
        phenotype_df = pd.read_csv(phenotype_bed, sep='\t', index_col=3, dtype={'#chr':str, '#Chr':str})
    elif phenotype_bed.endswith('.parquet'):
        phenotype_df = pd.read_parquet(phenotype_bed)
        phenotype_df.set_index(phenotype_df.columns[3], inplace=True)
    else:
        raise ValueError('Unsupported file type.')
    phenotype_df.rename(columns={i:i.lower().replace('#chr','chr') for i in phenotype_df.columns[:3]}, inplace=True)
    phenotype_pos_df = phenotype_df[['chr', 'end']].rename(columns={'end':'tss'})
    phenotype_df.drop(['chr', 'start', 'end'], axis=1, inplace=True)
    return phenotype_df, phenotype_pos_df 
Example #18
Source File: test_io.py    From modin with Apache License 2.0
def test_from_parquet_partitioned_columns(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df) 
Example #19
Source File: io.py    From mlcomp with Apache License 2.0
def read_pandas(file):
    if file.endswith('.csv'):
        df = pd.read_csv(file)
    elif file.endswith('.parquet'):
        df = pd.read_parquet(file)
    else:
        raise Exception('Unknown file type')
    return df 
Example #20
Source File: eqtl_prepare_expression.py    From gtex-pipeline with BSD 3-Clause "New" or "Revised" License
def read_gct(gct_file, sample_ids=None, dtype=None):
    """
    Load GCT as DataFrame. The first two columns must be 'Name' and 'Description'.
    """
    if sample_ids is not None:
        sample_ids = ['Name']+list(sample_ids)

    if gct_file.endswith('.gct.gz') or gct_file.endswith('.gct'):
        if dtype is not None:
            with gzip.open(gct_file, 'rt') as gct:
                gct.readline()
                gct.readline()
                sample_ids = gct.readline().strip().split()
            dtypes = {i:dtype for i in sample_ids[2:]}
            dtypes['Name'] = str
            dtypes['Description'] = str
            df = pd.read_csv(gct_file, sep='\t', skiprows=2, usecols=sample_ids, index_col=0, dtype=dtypes)
        else:
            df = pd.read_csv(gct_file, sep='\t', skiprows=2, usecols=sample_ids, index_col=0)
    elif gct_file.endswith('.parquet'):
        df = pd.read_parquet(gct_file, columns=sample_ids)
    elif gct_file.endswith('.ft'):  # feather format
        df = feather.read_dataframe(gct_file, columns=sample_ids)
        df = df.set_index('Name')
    else:
        raise ValueError('Unsupported input format.')
    df.index.name = 'gene_id'
    if 'Description' in df.columns:
        df = df.drop('Description', axis=1)
    return df 
Example #21
Source File: test_io.py    From modin with Apache License 2.0
def test_from_parquet_partitioned_columns_with_columns(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    df_equals(modin_df, pandas_df) 
Example #22
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pq_read_freevar_str1(self):
        kde_file2 = 'kde.parquet'

        def test_impl():
            df = pd.read_parquet(kde_file2)
            X = df['points']
            return X.sum()

        hpat_func = self.jit(test_impl)
        np.testing.assert_almost_equal(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #23
Source File: parquet.py    From timeserio with MIT License
def make_subgen(self, chunk):
        filename = chunk
        subgen = single_row.RowBatchGenerator(
            df=pd.read_parquet(filename).reset_index(),
            batch_size=self.batch_size,
            columns=self.columns
        )
        return subgen 
Example #24
Source File: test_groupby.py    From sdc with BSD 2-Clause "Simplified" License
def test_crosstab_parallel1(self):
        def test_impl():
            df = pd.read_parquet("pivot2.pq")
            pt = pd.crosstab(df.A, df.C)
            res = pt.small.values.sum()
            return res

        hpat_func = self.jit(
            pivots={'pt': ['small', 'large']})(test_impl)
        self.assertEqual(hpat_func(), test_impl()) 
Example #25
Source File: test_groupby.py    From sdc with BSD 2-Clause "Simplified" License
def test_pivot_parallel(self):
        def test_impl():
            df = pd.read_parquet("pivot2.pq")
            pt = df.pivot_table(index='A', columns='C', values='D', aggfunc='sum')
            res = pt.small.values.sum()
            return res

        hpat_func = self.jit(
            pivots={'pt': ['small', 'large']})(test_impl)
        self.assertEqual(hpat_func(), test_impl()) 
Example #26
Source File: test_join.py    From sdc with BSD 2-Clause "Simplified" License
def test_merge_asof_parallel1(self):
        def test_impl():
            df1 = pd.read_parquet('asof1.pq')
            df2 = pd.read_parquet('asof2.pq')
            df3 = pd.merge_asof(df1, df2, on='time')
            return (df3.A.sum(), df3.time.max(), df3.B.sum())

        hpat_func = self.jit(test_impl)
        self.assertEqual(hpat_func(), test_impl()) 
Example #27
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pq_spark_date(self):
        def test_impl():
            df = pd.read_parquet('sdf_dt.pq')
            return pd.DataFrame({'DT64': df.DT64, 'col2': df.DATE})

        hpat_func = self.jit(test_impl)
        pd.testing.assert_frame_equal(hpat_func(), test_impl()) 
Example #28
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pq_pandas_date(self):
        def test_impl():
            df = pd.read_parquet('pandas_dt.pq')
            return pd.DataFrame({'DT64': df.DT64, 'col2': df.DATE})

        hpat_func = self.jit(test_impl)
        pd.testing.assert_frame_equal(hpat_func(), test_impl()) 
Example #29
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pd_read_parquet(self):
        def test_impl():
            df = pd.read_parquet('kde.parquet')
            X = df['points']
            return X.sum()

        hpat_func = self.jit(test_impl)
        np.testing.assert_almost_equal(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #30
Source File: brute_force_plotter.py    From brute-force-plotter with MIT License
def plot_numeric_numeric(input_file, col1, col2, path):
    df = pd.read_parquet(input_file, columns=[col1, col2])
    file_name = os.path.join(path, f"{col1}-{col2}-scatter-plot.png")
    scatter_plot(df, col1, col2, file_name=file_name)