Python pyarrow.parquet.read_table() Examples

The following are 25 code examples of pyarrow.parquet.read_table(), drawn from open-source projects; the source file, project, and license are noted above each example. You may also want to check out all available functions and classes of the pyarrow.parquet module.
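To set the stage, here is a minimal round trip showing the API these examples build on (the file name is invented for illustration): pq.write_table() serializes a pyarrow.Table to Parquet, and pq.read_table() loads it back as a Table that converts to pandas via to_pandas().

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Build a small table and write it to a Parquet file.
df = pd.DataFrame({"one": [1.0, 2.0], "two": ["foo", "bar"]})
pq.write_table(pa.Table.from_pandas(df), "roundtrip_example.parquet")

# read_table accepts a path or file-like object and returns a pyarrow.Table.
table = pq.read_table("roundtrip_example.parquet")
print(table.to_pandas())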
Example #1
Source File: dataframe_bytes_storage.py    From pyABC with BSD 3-Clause "New" or "Revised" License
def df_from_bytes_parquet_(bytes_: bytes) -> pd.DataFrame:
    """
    Since pyabc 0.9.14, pandas DataFrames are converted using
    pyarrow parquet. If the conversion to a DataFrame fails,
    `df_from_bytes_msgpack_` is tried, which was the method used
    previously. This is particularly useful for databases that
    still employ the old format. If errors occur here, it may
    be necessary to use a pandas version prior to 0.25.0.
    """
    try:
        b = BytesIO(bytes_)
        table = parquet.read_table(b)
        df = table.to_pandas()
    except pyarrow.lib.ArrowIOError:
        df = df_from_bytes_msgpack_(bytes_)
    return df 
Example #2
Source File: utils.py    From gordo with GNU Affero General Public License v3.0
def dataframe_from_parquet_bytes(buf: bytes) -> pd.DataFrame:
    """
    Convert bytes representing a parquet table into a pandas dataframe.

    Parameters
    ----------
    buf: bytes
        Bytes representing a parquet table. Can be the direct result from
        :func:`gordo.server.utils.dataframe_into_parquet_bytes`

    Returns
    -------
    pandas.DataFrame
    """
    table = pq.read_table(io.BytesIO(buf))
    return table.to_pandas() 
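The docstring refers to gordo.server.utils.dataframe_into_parquet_bytes. For illustration only, here is a minimal sketch of what such an inverse could look like; this is an assumption, not gordo's actual implementation.

import io

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def dataframe_into_parquet_bytes_sketch(df: pd.DataFrame) -> bytes:
    # Sketch only: gordo's real dataframe_into_parquet_bytes may differ.
    buf = io.BytesIO()
    pq.write_table(pa.Table.from_pandas(df), buf)
    return buf.getvalue()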
Example #3
Source File: index.py    From kartothek with MIT License
def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    reader = pa.BufferReader(index_buffer)
    # This could be done much more efficiently, but that would take a lot
    # more time to implement, so it will only be done on request.
    table = pq.read_table(reader)
    if ARROW_LARGER_EQ_0150:
        column_type = table.schema.field(column).type
    else:
        column_type = table.schema.field_by_name(column).type

    # `datetime.datetime` objects have a precision of up to microseconds only, so arrow
    # parses the type to `pa.timestamp("us")`. Since the
    # values are normalized to `numpy.datetime64[ns]` anyways, we do not care about this
    # and load the column type as `pa.timestamp("ns")`
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")

    df = _fix_pyarrow_07992_table(table).to_pandas()  # Could eventually be phased out

    index_dct = dict(
        zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values))
    )
    return index_dct, column_type 
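A quick, file-free illustration of the precision point in the comment above, assuming a pyarrow/pandas combination from the era this code targets, where to_pandas normalizes timestamps to nanoseconds:

import datetime
import pyarrow as pa

# datetime.datetime carries at most microsecond precision, so pyarrow
# infers timestamp[us] for plain Python datetimes.
arr = pa.array([datetime.datetime(2020, 1, 1, 12, 30)])
print(arr.type)  # timestamp[us]
# Older pyarrow/pandas versions convert such columns to numpy
# datetime64[ns] anyway, which is why the function above reports the
# column type as pa.timestamp("ns") instead.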
Example #4
Source File: io.py    From fletcher with MIT License
def read_parquet(
    path, columns: Optional[List[str]] = None, continuous: bool = False
) -> pd.DataFrame:
    """
    Load a parquet object from the file path, returning a DataFrame with fletcher columns.

    Parameters
    ----------
    path : str or file-like
    continuous : bool
        Use FletcherContinuousArray instead of FletcherChunkedArray

    Returns
    -------
    pd.DataFrame
    """
    table = pq.read_table(path, columns=columns)
    return pandas_from_arrow(table, continuous=continuous) 
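A hypothetical call (path and column name invented): because read_table forwards the columns argument, only the requested columns are read from the file.

# Read a single column, backed by FletcherContinuousArray.
df = read_parquet("data.parquet", columns=["sensor_a"], continuous=True)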
Example #5
Source File: data_utils.py    From sagemaker-xgboost-container with Apache License 2.0
def _get_parquet_dmatrix_file_mode(files_path):
    """Get Data Matrix from parquet data in file mode.

    :param files_path: File path where parquet formatted training data resides, either directory or file
    :return: xgb.DMatrix
    """
    try:
        table = pq.read_table(files_path)

        data = table.to_pandas()
        del table

        if type(data) is pd.DataFrame:
            # pyarrow.Table.to_pandas may produce NumPy array or pandas DataFrame
            data = data.to_numpy()

        dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])
        del data

        return dmatrix

    except Exception as e:
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(e)) 
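As the docstring notes, files_path may also be a directory: pq.read_table treats a directory of Parquet files as a single dataset. A small illustration (paths and values invented):

import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

os.makedirs("train_data", exist_ok=True)
for i in range(2):
    part = pd.DataFrame({"label": [float(i)], "feature": [10.0 * i]})
    pq.write_table(pa.Table.from_pandas(part), "train_data/part-{}.parquet".format(i))

# All files in the directory are read and concatenated into one table.
table = pq.read_table("train_data")
print(table.num_rows)  # 2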
Example #6
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pq_str_with_nan_par_multigroup(self):
        def test_impl():
            df = pq.read_table('example2.parquet').to_pandas()
            A = df.five.values == 'foo'
            return A.sum()

        hpat_func = self.jit(test_impl)
        np.testing.assert_almost_equal(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #7
Source File: protocols.py    From bionic with Apache License 2.0
def read(self, path):
        with path.open("rb") as file_:
            return parquet.read_table(file_).to_pandas() 
Example #8
Source File: test_groupby.py    From sdc with BSD 2-Clause "Simplified" License
def test_agg_parallel_str(self):
        def test_impl():
            df = pq.read_table("groupby3.pq").to_pandas()
            A = df.groupby('A')['B'].agg(lambda x: x.max() - x.min())
            return A.sum()

        hpat_func = self.jit(test_impl)
        self.assertEqual(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #9
Source File: test_hiframes.py    From sdc with BSD 2-Clause "Simplified" License
def test_concat_series_str(self):
        def test_impl():
            df1 = pq.read_table('example.parquet').to_pandas()
            df2 = pq.read_table('example.parquet').to_pandas()
            A3 = pd.concat([df1.two, df2.two])
            return (A3 == 'foo').sum()

        hpat_func = self.jit(test_impl)
        self.assertEqual(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #10
Source File: test_hiframes.py    From sdc with BSD 2-Clause "Simplified" License
def test_concat_str(self):
        def test_impl():
            df1 = pq.read_table('example.parquet').to_pandas()
            df2 = pq.read_table('example.parquet').to_pandas()
            A3 = pd.concat([df1, df2])
            return (A3.two == 'foo').sum()

        hpat_func = self.jit(test_impl)
        self.assertEqual(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #11
Source File: test_hiframes.py    From sdc with BSD 2-Clause "Simplified" License
def test_unique_str_parallel(self):
        # TODO: test without file
        def test_impl():
            df = pq.read_table('example.parquet').to_pandas()
            return (df.two.unique() == 'foo').sum()

        hpat_func = self.jit(test_impl)
        self.assertEqual(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0) 
Example #12
Source File: test_hiframes.py    From sdc with BSD 2-Clause "Simplified" License
def test_nunique_str_parallel(self):
        # TODO: test without file
        def test_impl():
            df = pq.read_table('example.parquet').to_pandas()
            return df.two.nunique()

        hpat_func = self.jit(test_impl)
        self.assertEqual(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        # test compile again for overload related issues
        hpat_func = self.jit(test_impl)
        self.assertEqual(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0) 
Example #13
Source File: test_hiframes.py    From sdc with BSD 2-Clause "Simplified" License
def test_nunique_parallel(self):
        # TODO: test without file
        def test_impl():
            df = pq.read_table('example.parquet').to_pandas()
            return df.four.nunique()

        hpat_func = self.jit(test_impl)
        self.assertEqual(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        # test compile again for overload related issues
        hpat_func = self.jit(test_impl)
        self.assertEqual(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0) 
Example #14
Source File: test_strings.py    From sdc with BSD 2-Clause "Simplified" License
def test_string_NA_box(self):
        # create `example.parquet` file
        ParquetGenerator.gen_pq_test()

        def test_impl():
            df = pq.read_table('example.parquet').to_pandas()
            return df.five
        hpat_func = self.jit(test_impl)

        # XXX just checking isna() since Pandas uses None in this case
        # instead of nan for some reason
        np.testing.assert_array_equal(hpat_func().isna(), test_impl().isna())
Example #15
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pq_float_no_nan(self):
        def test_impl():
            df = pq.read_table('example.parquet').to_pandas()
            return df.four.sum()

        hpat_func = self.jit(test_impl)
        np.testing.assert_almost_equal(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #16
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pq_nan(self):
        def test_impl():
            df = pq.read_table('example.parquet').to_pandas()
            return df.one.sum()

        hpat_func = self.jit(test_impl)
        np.testing.assert_almost_equal(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #17
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pq_str_with_nan_par(self):
        def test_impl():
            df = pq.read_table('example.parquet').to_pandas()
            A = df.five.values == 'foo'
            return A.sum()

        hpat_func = self.jit(test_impl)
        np.testing.assert_almost_equal(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #18
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pq_str_with_nan_seq(self):
        def test_impl():
            df = pq.read_table('example.parquet').to_pandas()
            A = df.five.values == 'foo'
            return A

        hpat_func = self.jit(test_impl)
        np.testing.assert_almost_equal(hpat_func(), test_impl()) 
Example #19
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pq_str(self):
        def test_impl():
            df = pq.read_table('example.parquet').to_pandas()
            A = df.two.values == 'foo'
            return A.sum()

        hpat_func = self.jit(test_impl)
        np.testing.assert_almost_equal(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #20
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License
def test_pq_read(self):
        def test_impl():
            t = pq.read_table('kde.parquet')
            df = t.to_pandas()
            X = df['points']
            return X.sum()

        hpat_func = self.jit(test_impl)
        np.testing.assert_almost_equal(hpat_func(), test_impl())
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0) 
Example #21
Source File: test_pyarrow_roundtrip.py    From fletcher with MIT License
def test_parquet_roundtrip(array_type):
    df = pd.DataFrame({"col": array_type(["A", "B"])})
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    reader = pa.BufferReader(buf.getvalue().to_pybytes())
    table = pq.read_table(reader)
    pdt.assert_frame_equal(df, table.to_pandas()) 
Example #22
Source File: hydrofunctions.py    From hydrofunctions with MIT License
def read_parquet(filename):
    pa_table = pq.read_table(filename)
    dataframe = pa_table.to_pandas()
    meta_dict = pa_table.schema.metadata
    if b"hydrofunctions_meta" in meta_dict:
        meta_string = meta_dict[b"hydrofunctions_meta"].decode()
        meta = json.loads(meta_string)  # the `encoding` kwarg was a no-op and was removed in Python 3.9
    else:
        meta = None
    return dataframe, meta 
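For the metadata lookup above to succeed, the writing side has to attach the JSON blob under the same key. A minimal sketch of that write path (the key name comes from the reader above; the data is invented):

import json

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"discharge": [1.0, 2.0]})
table = pa.Table.from_pandas(df)
# Schema metadata is a bytes-to-bytes mapping. Note that
# replace_schema_metadata drops any existing metadata (including the
# 'pandas' entry); merge dictionaries if you need to keep it.
meta = {b"hydrofunctions_meta": json.dumps({"site": "01585200"}).encode()}
pq.write_table(table.replace_schema_metadata(meta), "site.parquet")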
Example #23
Source File: test_parquet.py    From spectrify with MIT License
def test_write(self):
        # Write out test file
        with UncloseableBytesIO() as write_buffer:
            with Writer(write_buffer, self.table) as writer:
                writer.write_row_group(self.data)
            file_bytes = write_buffer.getvalue()

        # Read in test file
        read_buffer = BytesIO(file_bytes)
        with pa.PythonFile(read_buffer, mode='r') as infile:

            # Verify data
            parq_table = pq.read_table(infile)
            written_data = list(parq_table.to_pydict().values())

            tuples_by_data_type = zip(self.data, written_data)
            for i in tuples_by_data_type:
                tuples_by_order = zip(i[0], i[1])
                for j in tuples_by_order:
                    self.assertAlmostEqual(j[0], j[1], places=5)

            # Verify parquet file schema
            for i, field in enumerate(parq_table.schema):
                self.assertEqual(field.type.id, self.expected_datatypes[i].id)

            # Ensure timestamp column was written with int96; right now
            # there is no way to see except to check that the unit on
            # the timestamp type is 'ns'
            ts_col = parq_table.schema.field_by_name('timestamp_col')
            self.assertEqual(ts_col.type.unit, 'ns') 
Example #24
Source File: parquet.py    From ibis with Apache License 2.0
def parquet_read_table(op, client, scope, **kwargs):
    path = client.dictionary[op.name]
    table = pq.read_table(str(path))
    df = table.to_pandas()
    return df 
Example #25
Source File: data_utils.py    From sagemaker-xgboost-container with Apache License 2.0
def _get_parquet_dmatrix_pipe_mode(pipe_path):
    """Get Data Matrix from parquet data in pipe mode.

    :param pipe_path: SageMaker pipe path where parquet formatted training data is piped
    :return: xgb.DMatrix or None
    """
    try:
        f = mlio.SageMakerPipe(pipe_path)
        examples = []

        with f.open_read() as strm:
            reader = mlio.ParquetRecordReader(strm)

            for record in reader:
                table = pq.read_table(as_arrow_file(record))
                array = table.to_pandas()
                if type(array) is pd.DataFrame:
                    array = array.to_numpy()
                examples.append(array)

        if examples:
            data = np.vstack(examples)
            del examples

            dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])
            return dmatrix
        else:
            return None

    except Exception as e:
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(e))