Python pyarrow.parquet.ParquetFile() Examples

The following are 19 code examples of pyarrow.parquet.ParquetFile(), taken from open-source projects. The original project and source file for each example are noted in the header above it. You may also want to check out the other available functions and classes of the pyarrow.parquet module.
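Before the project-specific examples, here is a minimal, self-contained sketch of the basic ParquetFile workflow: write a small file, open it lazily, inspect its metadata and schema, and read a single row group. The file name example.parquet is just a placeholder for this sketch.

import pyarrow as pa
import pyarrow.parquet as pq

# Write a tiny Parquet file so the sketch is self-contained.
table = pa.Table.from_pydict({"a": [1, 2, 3], "b": ["x", "y", "z"]})
pq.write_table(table, "example.parquet")

# Opening a ParquetFile reads only metadata; no row data is loaded yet.
pf = pq.ParquetFile("example.parquet")
print(pf.metadata.num_rows)        # total number of rows in the file
print(pf.metadata.num_row_groups)  # number of row groups
print(pf.schema)                   # the file's Parquet schema

# Read a single row group as a pyarrow.Table.
first_group = pf.read_row_group(0)
print(first_group.to_pydict())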
Example #1
Source File: test_client.py    From json2parquet with MIT License
def test_convert_json():
    """
    Test converting a JSON file to Parquet
    """
    schema = pa.schema([
        pa.field("foo", pa.int32()),
        pa.field("bar", pa.int64())
    ])

    input_path = "{}/tests/fixtures/simple_json.txt".format(os.getcwd())
    expected_file = "{}/tests/fixtures/simple.parquet".format(os.getcwd())
    with tempfile.NamedTemporaryFile() as f:
        output_file = f.name
        client.convert_json(input_path, output_file, schema)
        output = pq.ParquetFile(output_file)
        expected = pq.ParquetFile(expected_file)
        assert output.metadata.num_columns == expected.metadata.num_columns
        assert output.metadata.num_rows == expected.metadata.num_rows
        assert output.schema.equals(expected.schema)
        assert output.read_row_group(0).to_pydict() == expected.read_row_group(0).to_pydict() 
Example #2
Source File: test_parquet.py    From kartothek with MIT License
def test_predicate_accept_in(store, predicate_value, expected):
    df = pd.DataFrame({"A": [0, 4, 13, 29]})  # min = 0, max = 29
    predicate = ("A", "in", predicate_value)
    serialiser = ParquetSerializer(chunk_size=None)
    key = serialiser.store(store, "prefix", df)

    parquet_file = ParquetFile(store.open(key))
    row_meta = parquet_file.metadata.row_group(0)
    arrow_schema = parquet_file.schema.to_arrow_schema()
    parquet_reader = parquet_file.reader
    assert (
        _predicate_accepts(
            predicate,
            row_meta=row_meta,
            arrow_schema=arrow_schema,
            parquet_reader=parquet_reader,
        )
        == expected
    ) 
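The kartothek test above checks whether a predicate can accept a row group based on its metadata. As a rough illustration of the underlying idea (not kartothek's actual implementation), the sketch below uses pyarrow's per-column row-group statistics to find the row groups that could possibly contain a value; the helper name and the path data.parquet are made up for this example.

import pyarrow.parquet as pq

def candidate_row_groups(path, column_index, value):
    """Return indices of row groups whose min/max statistics do not rule out `value`."""
    pf = pq.ParquetFile(path)
    candidates = []
    for i in range(pf.metadata.num_row_groups):
        stats = pf.metadata.row_group(i).column(column_index).statistics
        # Statistics can be absent, in which case the row group has to be read anyway.
        if stats is None or not stats.has_min_max:
            candidates.append(i)
        elif stats.min <= value <= stats.max:
            candidates.append(i)
    return candidates

candidate_row_groups("data.parquet", 0, 13)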
Example #3
Source File: parquet.py    From ibis with Apache License 2.0
def table(self, name: str, path: Optional[str] = None) -> ir.TableExpr:
        if name not in self.list_tables(path):
            raise AttributeError(name)

        if path is None:
            path = self.root

        # get the schema
        f = path / "{}.parquet".format(name)

        parquet_file = pq.ParquetFile(str(f))
        schema = sch.infer(parquet_file.schema)

        table = self.table_class(name, schema, self).to_expr()
        self.dictionary[name] = f

        return table 
Example #4
Source File: test_write.py    From csv2parquet with Apache License 2.0
def test_write_from_csv():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    assert pqf.num_row_groups == 1
    schema = pqf.schema
    assert schema.names == ['a', 'b']
    assert schema.column(0).logical_type.type == 'STRING'
    assert schema.column(1).logical_type.type == 'STRING'
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 3
    col_a = row_group.column(0).to_pylist()
    assert col_a == ['1', '2', '3']
    col_b = row_group.column(1).to_pylist()
    assert col_b == ['a', 'b', 'c'] 
Example #5
Source File: test_write.py    From csv2parquet with Apache License 2.0
def test_opt_invalid_types():
    csv2parquet.main_with_args(csv2parquet.convert,
                               ['csvs/invalid-types.csv', '--type',
                                'bool=bool?', 'float32=float32?', 'float64=float64?', 'int8=int8?',
                                'int16=int16?', 'int32=int32?', 'int64=int64?', 'string=string?',
                                'timestamp=timestamp?'])
    pqf = pq.ParquetFile('csvs/invalid-types.parquet')
    schema = pqf.schema
    assert schema.names == ['bool', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64',
                            'string', 'timestamp']
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 2
    bools = row_group.column(0).to_pylist()
    assert bools == [True, None]
    float32 = row_group.column(1).to_pylist()
    assert len(float32) == 2
    assert float32[0] == pytest.approx(0.5)
    assert float32[1] is None
    float64 = row_group.column(2).to_pylist()
    assert float64 == [0.75, None]
    int8 = row_group.column(3).to_pylist()
    assert int8 == [12, None]
    int16 = row_group.column(4).to_pylist()
    assert int16 == [400, None]
    int32 = row_group.column(5).to_pylist()
    assert int32 == [132000, None]
    int64 = row_group.column(6).to_pylist()
    assert int64 == [6000000000, None]
    string = row_group.column(7).to_pylist()
    assert string == ['string', 'blah']
    timestamp = row_group.column(8).to_pylist()
    assert timestamp == [datetime(2018, 7, 9, 0, 0), None] 
Example #6
Source File: test_parquet.py    From kartothek with MIT License
def test_rowgroup_writing(store, use_categorical, chunk_size):
    df = pd.DataFrame({"string": ["abc", "affe", "banane", "buchstabe"]})
    serialiser = ParquetSerializer(chunk_size=2)
    # Arrow 0.9.0 has a bug in writing categorical columns to more than a single
    # RowGroup: "ArrowIOError: Column 2 had 2 while previous column had 4".
    # We have special handling for that in pandas-serialiser that should be
    # removed once we switch to 0.10.0
    if use_categorical:
        df_write = df.astype({"string": "category"})
    else:
        df_write = df
    key = serialiser.store(store, "prefix", df_write)

    parquet_file = ParquetFile(store.open(key))
    assert parquet_file.num_row_groups == 2 
Example #7
Source File: parquet.py    From gcr-catalogs with BSD 3-Clause "New" or "Revised" License
def handle(self):
        if self._handle is None:
            self._handle = pq.ParquetFile(self.path)
        return self._handle 
Example #8
Source File: test_write.py    From csv2parquet with Apache License 2.0
def test_required_types():
    csv2parquet.main_with_args(csv2parquet.convert,
                               ['csvs/types.csv', '--type',
                                'bool=bool', 'float32=float32', 'float64=float64', 'int8=int8',
                                'int16=int16', 'int32=int32', 'int64=int64', 'string=string',
                                'timestamp=timestamp'])
    pqf = pq.ParquetFile('csvs/types.parquet')
    schema = pqf.schema
    assert schema.names == ['bool', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64',
                            'string', 'timestamp']
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 2
    bools = row_group.column(0).to_pylist()
    assert bools == [True, False]
    float32 = row_group.column(1).to_pylist()
    assert float32 == pytest.approx([0.5, 0.6])
    float64 = row_group.column(2).to_pylist()
    assert float64 == [0.75, 1.75]
    int8 = row_group.column(3).to_pylist()
    assert int8 == [12, 13]
    int16 = row_group.column(4).to_pylist()
    assert int16 == [400, 401]
    int32 = row_group.column(5).to_pylist()
    assert int32 == [132000, 132001]
    int64 = row_group.column(6).to_pylist()
    assert int64 == [6000000000, 6000000001]
    string = row_group.column(7).to_pylist()
    assert string == ['string', 'string']
    timestamp = row_group.column(8).to_pylist()
    assert timestamp == [datetime(2018, 7, 9, 0, 0), datetime(2018, 7, 10, 0, 0)] 
Example #9
Source File: test_write.py    From csv2parquet with Apache License 2.0
def test_write_exclude_by_index():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv', '--exclude', '0'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    schema = pqf.schema
    assert schema.names == ['b']
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 3
    col_b = row_group.column(0).to_pylist()
    assert col_b == ['a', 'b', 'c'] 
Example #10
Source File: test_write.py    From csv2parquet with Apache License 2.0
def test_write_exclude_by_name():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv', '--exclude', 'a'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    schema = pqf.schema
    assert schema.names == ['b']
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 3
    col_b = row_group.column(0).to_pylist()
    assert col_b == ['a', 'b', 'c'] 
Example #11
Source File: test_write.py    From csv2parquet with Apache License 2.0
def test_write_include_by_index():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv', '--include', '0'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    schema = pqf.schema
    assert schema.names == ['a']
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 3
    col_a = row_group.column(0).to_pylist()
    assert col_a == ['1', '2', '3'] 
Example #12
Source File: test_write.py    From csv2parquet with Apache License 2.0
def test_write_limit():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv', '--rows', '1'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 1 
Example #13
Source File: test_write.py    From csv2parquet with Apache License 2.0
def test_write_row_group_size():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv', '--row-group-size', '1'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    assert pqf.num_row_groups == 3 
Example #14
Source File: test_write.py    From csv2parquet with Apache License 2.0
def test_write_rename():
    csv2parquet.main_with_args(csv2parquet.convert,
                               ['csvs/simple.csv', '--rename', '0=alpha', 'b=bee'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    schema = pqf.schema
    assert schema.names == ['alpha', 'bee'] 
Example #15
Source File: test_write.py    From csv2parquet with Apache License 2.0
def test_write_from_tsv():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple2.tsv'])
    pqf = pq.ParquetFile('csvs/simple2.parquet')
    assert pqf.num_row_groups == 1
    schema = pqf.schema
    assert schema.names == ['a', 'b']
    assert schema.column(0).logical_type.type == 'STRING'
    assert schema.column(1).logical_type.type == 'STRING'
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 1
    col_a = row_group.column(0).to_pylist()
    assert col_a == ['1']
    col_b = row_group.column(1).to_pylist()
    assert col_b == ['b'] 
Example #16
Source File: test_schema.py    From ibis with Apache License 2.0
def parquet_schema():
    np.random.seed(0)
    size = 100
    df = pd.DataFrame(
        {
            'uint8': np.arange(size, dtype=np.uint8),
            'uint16': np.arange(size, dtype=np.uint16),
            'uint32': np.arange(size, dtype=np.uint32),
            'uint64': np.arange(size, dtype=np.uint64),
            'int8': np.arange(size, dtype=np.int16),
            'int16': np.arange(size, dtype=np.int16),
            'int32': np.arange(size, dtype=np.int32),
            'int64': np.arange(size, dtype=np.int64),
            'float32': np.arange(size, dtype=np.float32),
            'float64': np.arange(size, dtype=np.float64),
            'bool': np.random.randn(size) > 0,
            # TODO(wesm): Test other timestamp resolutions now that arrow
            # supports them
            'datetime': np.arange(
                "2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'
            ),
            'str': [str(x) for x in range(size)],
            'str_with_nulls': [None]
            + [str(x) for x in range(size - 2)]
            + [None],
            'empty_str': [''] * size,
            'bytes': [b'foo'] * size,
        },
        columns=[
            'uint8',
            'uint16',
            'uint32',
            'uint64',
            'int8',
            'int16',
            'int32',
            'int64',
            'float32',
            'float64',
            'bool',
            'datetime',
            'str',
            'str_with_nulls',
            'empty_str',
            'bytes',
        ],
    )

    with tempfile.TemporaryFile() as path:
        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)
        parquet_file = pq.ParquetFile(path)
        return parquet_file.schema 
Example #17
Source File: arrow_reader_worker.py    From petastorm with Apache License 2.0
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
        columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate criteria,
        the rest of the columns are not loaded.

        :param piece_index:
        :param shuffle_row_drop_partition: A 2-tuple of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            self._dataset = pq.ParquetDataset(
                self._dataset_path_or_paths,
                filesystem=self._filesystem,
                validate_schema=False)

        if self._dataset.partitions is None:
            # When reading from a list of parquet files, `dataset.partitions` will be None,
            # but other petastorm code requires at least an empty `ParquetPartitions` object.
            self._dataset.partitions = pq.ParquetPartitions()

        piece = self._split_pieces[piece_index]

        # Open the piece's file through the dataset's filesystem
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError('Local cache is not supported together with predicates, '
                                   'unless the dataset is partitioned by the column the predicate operates on.')
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

        if worker_predicate:
            all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate, shuffle_row_drop_partition)
        else:
            # Using hash of the dataset path with the relative path in order to:
            #  1. Make sure that if a common cache serves multiple processes (e.g. redis), we don't have conflicts
            #  2. The dataset path is hashed so that we don't create overly long keys, which may be incompatible
            #     with some cache implementations
            #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
            if isinstance(self._dataset_path_or_paths, list):
                path_str = ','.join(self._dataset_path_or_paths)
            else:
                path_str = self._dataset_path_or_paths
            cache_key = '{}:{}:{}'.format(hashlib.md5(path_str.encode('utf-8')).hexdigest(),
                                          piece.path, piece_index)
            all_cols = self._local_cache.get(cache_key,
                                             lambda: self._load_rows(parquet_file, piece, shuffle_row_drop_partition))

        if all_cols:
            self.publish_func(all_cols) 
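The petastorm worker above opens one ParquetFile per row-group piece and, when a predicate is given, loads the predicate columns before deciding whether to read the rest. A stripped-down sketch of that pattern with plain pyarrow looks like the following; the path and column names are placeholders.

import pyarrow.parquet as pq

pf = pq.ParquetFile("dataset/part-00000.parquet")

# Load only the predicate column from the first row group ...
ids = pf.read_row_group(0, columns=["id"]).column("id").to_pylist()

# ... and read the remaining columns only if some row actually matches.
if any(value > 100 for value in ids):
    full_group = pf.read_row_group(0, columns=["id", "features", "label"])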
Example #18
Source File: py_dict_reader_worker.py    From petastorm with Apache License 2.0
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
        columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate criteria,
        the rest of the columns are not loaded.

        :param piece_index:
        :param shuffle_row_drop_partition: A 2-tuple of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            self._dataset = pq.ParquetDataset(
                self._dataset_path,
                filesystem=self._filesystem,
                validate_schema=False)

        piece = self._split_pieces[piece_index]

        # Open the piece's file through the dataset's filesystem
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError('Local cache is not supported together with predicates, '
                                   'unless the dataset is partitioned by the column the predicate operates on.')
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

        if worker_predicate:
            all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate, shuffle_row_drop_partition)
        else:
            # Using hash of the dataset path with the relative path in order to:
            #  1. Make sure that if a common cache serves multiple processes (e.g. redis), we don't have conflicts
            #  2. The dataset path is hashed so that we don't create overly long keys, which may be incompatible
            #     with some cache implementations
            #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
            cache_key = '{}:{}:{}'.format(hashlib.md5(self._dataset_path.encode('utf-8')).hexdigest(),
                                          piece.path, piece_index)
            all_cols = self._local_cache.get(cache_key,
                                             lambda: self._load_rows(parquet_file, piece, shuffle_row_drop_partition))

        if self._ngram:
            all_cols = self._ngram.form_ngram(data=all_cols, schema=self._schema)

        if all_cols:
            self.publish_func(all_cols) 
Example #19
Source File: test_common_metadata.py    From kartothek with MIT License
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
        pa.field("int64", pa.int64()),
        pa.field("int8", pa.int64()),
        pa.field("null", pa.null()),
        pa.field("uint16", pa.uint64()),
        pa.field("uint32", pa.uint64()),
        pa.field("uint64", pa.uint64()),
        pa.field("uint8", pa.uint64()),
        pa.field("unicode", pa.string()),
    ]
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema
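Finally, beyond whole-row-group reads like the ones above, newer pyarrow releases also allow a ParquetFile to be streamed in record batches of bounded size, which helps with files that do not fit in memory. A short sketch, assuming a recent pyarrow version and a placeholder path big.parquet:

import pyarrow.parquet as pq

pf = pq.ParquetFile("big.parquet")

# Stream the file in batches of at most 64k rows instead of loading it all at once.
for batch in pf.iter_batches(batch_size=65536):
    print(batch.num_rows)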