Java Code Examples for org.apache.parquet.hadoop.ParquetReader

The following examples show how to use org.apache.parquet.hadoop.ParquetReader. They are extracted from open source projects; the source project, source file, and license are noted above each example where available.
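Before the project-specific examples, here is a minimal sketch of the canonical read loop: build a reader, call read() until it returns null, and let try-with-resources close it. The method name and file path below are placeholders, and the file is assumed to have been written with parquet-avro.

import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public static void printAllRecords(String pathString) throws IOException {
  // Placeholder path; any Parquet file readable as Avro GenericRecords works.
  HadoopInputFile in = HadoopInputFile.fromPath(new Path(pathString), new Configuration());
  try (ParquetReader<GenericRecord> reader = AvroParquetReader.<GenericRecord>builder(in).build()) {
    GenericRecord record;
    while ((record = reader.read()) != null) { // read() returns null at end of input
      System.out.println(record);
    }
  }
}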
Example 1
Source Project: beam   Source File: ParquetIO.java    License: Apache License 2.0
@ProcessElement
public void processElement(ProcessContext processContext) throws Exception {
  FileIO.ReadableFile file = processContext.element();

  if (!file.getMetadata().isReadSeekEfficient()) {
    ResourceId filename = file.getMetadata().resourceId();
    throw new RuntimeException(String.format("File has to be seekable: %s", filename));
  }

  SeekableByteChannel seekableByteChannel = file.openSeekable();

  AvroParquetReader.Builder builder =
      AvroParquetReader.<GenericRecord>builder(new BeamParquetInputFile(seekableByteChannel));
  if (modelClass != null) {
    // all GenericData implementations have a static get method
    builder = builder.withDataModel((GenericData) modelClass.getMethod("get").invoke(null));
  }

  try (ParquetReader<GenericRecord> reader = builder.build()) {
    GenericRecord read;
    while ((read = reader.read()) != null) {
      processContext.output(read);
    }
  }
}
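As the comment above notes, every GenericData implementation (GenericData, SpecificData, ReflectData) exposes a static get() accessor. When modelClass is, say, ReflectData, the reflective call resolves to the equivalent of this sketch:

import org.apache.avro.reflect.ReflectData;

builder = builder.withDataModel(ReflectData.get()); // same effect as getMethod("get").invoke(null) above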
 
Example 2
Source Project: datacollector   Source File: LargeInputFileIT.java    License: Apache License 2.0
public void validateParquetFile(Path parquetFile, long recordCount) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
    .build();

  for(long i = 0; i < recordCount; i++) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + i, actualRow);

    Assert.assertEquals("Value different in row " + i + " for key b", actualRow.get("b"), i % 2 == 0);
    Assert.assertEquals("Value different in row " + i + " for key s", actualRow.get("s"), new Utf8(String.valueOf(i)));
    Assert.assertEquals("Value different in row " + i + " for key l", actualRow.get("l"), i);
    Assert.assertEquals("Value different in row " + i + " for key l100", actualRow.get("l100"), i%100);
    Assert.assertEquals("Value different in row " + i + " for key s100", actualRow.get("s100"), new Utf8(String.valueOf(i % 100)));
  }

  Assert.assertNull("Parquet file contains more than expected rows", reader.read());
}
 
Example 3
Source Project: datacollector   Source File: BaseAvroParquetConvertIT.java    License: Apache License 2.0
public void validateParquetFile(Path parquetFile, List<Map<String, Object>> data) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
    .build();

  int position = 0;
  for(Map<String, Object> expectedRow : data) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + position, actualRow);

    for(Map.Entry<String, Object> entry : expectedRow.entrySet()) {
      Object value = actualRow.get(entry.getKey());
      Assert.assertEquals("Different value on row " + position + " for key " + entry.getKey(), entry.getValue(), value);
    }
  }

  Assert.assertNull("Parquet file contains more than expected rows", reader.read());
}
 
Example 4
private List<TestRecord> readParquetFilesAvro(File outputFile)
    throws IOException {
  ParquetReader<org.apache.gobblin.test.avro.TestRecord> reader = null;
  List<TestRecord> records = new ArrayList<>();
  try {
    reader = new AvroParquetReader<>(new Path(outputFile.toString()));
    for (org.apache.gobblin.test.avro.TestRecord value = reader.read(); value != null; value = reader.read()) {
      records.add(new TestRecord(value.getPartition(),
          value.getSequence(),
          value.getPayload()));
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records;

}
 
Example 5
protected List<TestRecord> readParquetFilesProto(File outputFile)
    throws IOException {
  ParquetReader<TestRecordProtos.TestRecordOrBuilder> reader = null;
  List<TestRecord> records = new ArrayList<>();
  try {
    reader = new ProtoParquetReader<>(new Path(outputFile.toString()));
    for (TestRecordProtos.TestRecordOrBuilder value = reader.read(); value != null; value = reader.read()) {
      records.add(new TestRecord(value.getPartition(),
          value.getSequence(),
          value.getPayload()));
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records;
}
 
Example 6
protected List<TestRecord> readParquetFilesGroup(File outputFile)
    throws IOException {
  ParquetReader<Group> reader = null;
  List<Group> records = new ArrayList<>();
  try {
    reader = new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport());
    for (Group value = reader.read(); value != null; value = reader.read()) {
      records.add(value);
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records.stream().map(value -> new TestRecord(
      value.getInteger(TestConstants.PARTITION_FIELD_NAME, 0),
      value.getLong(TestConstants.SEQUENCE_FIELD_NAME, 0),
      value.getString(TestConstants.PAYLOAD_FIELD_NAME, 0)
  )).collect(Collectors.toList());
}
 
Example 7
Source Project: parquet-mr   Source File: TestPruneColumnsCommand.java    License: Apache License 2.0
private void validateColumns(String inputFile, List<String> prunePaths) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(inputFile)).withConf(conf).build();
  for (int i = 0; i < numRecord; i++) {
    Group group = reader.read();
    if (!prunePaths.contains("DocId")) {
      assertEquals(1L, group.getLong("DocId", 0));
    }
    if (!prunePaths.contains("Name")) {
      assertEquals("foo", group.getBinary("Name", 0).toStringUsingUTF8());
    }
    if (!prunePaths.contains("Gender")) {
      assertEquals("male", group.getBinary("Gender", 0).toStringUsingUTF8());
    }
    if (!prunePaths.contains("Links")) {
      Group subGroup = group.getGroup("Links", 0);
      if (!prunePaths.contains("Links.Backward")) {
        assertEquals(2L, subGroup.getLong("Backward", 0));
      }
      if (!prunePaths.contains("Links.Forward")) {
        assertEquals(3L, subGroup.getLong("Forward", 0));
      }
    }
  }
  reader.close();
}
 
Example 8
Source Project: parquet-mr   Source File: PageChecksumReadBenchmarks.java    License: Apache License 2.0
private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole blackhole)
  throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(configuration)
      .usePageChecksumVerification(verifyChecksums)
      .build()) {
    for (int i = 0; i < nRows; i++) {
      Group group = reader.read();
      blackhole.consume(group.getLong("long_field", 0));
      blackhole.consume(group.getBinary("binary_field", 0));
      Group subgroup = group.getGroup("group", 0);
      blackhole.consume(subgroup.getInteger("int_field", 0));
      blackhole.consume(subgroup.getInteger("int_field", 1));
      blackhole.consume(subgroup.getInteger("int_field", 2));
      blackhole.consume(subgroup.getInteger("int_field", 3));
    }
  }
}
 
Example 9
Source Project: parquet-mr   Source File: ReadBenchmarks.java    License: Apache License 2.0
private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException
{
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), parquetFile).withConf(configuration).build();
  for (int i = 0; i < nRows; i++) {
    Group group = reader.read();
    blackhole.consume(group.getBinary("binary_field", 0));
    blackhole.consume(group.getInteger("int32_field", 0));
    blackhole.consume(group.getLong("int64_field", 0));
    blackhole.consume(group.getBoolean("boolean_field", 0));
    blackhole.consume(group.getFloat("float_field", 0));
    blackhole.consume(group.getDouble("double_field", 0));
    blackhole.consume(group.getBinary("flba_field", 0));
    blackhole.consume(group.getInt96("int96_field", 0));
  }
  reader.close();
}
 
Example 10
Source Project: parquet-mr   Source File: TestFiltersWithMissingColumns.java    License: Apache License 2.0
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException {
  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();

  long count = 0;
  try {
    while (reader.read() != null) {
      count += 1;
    }
  } finally {
    reader.close();
  }
  return count;
}
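countFilteredRecords receives the predicate ready-made; for completeness, a sketch of building one with the filter2 API (the column names here are hypothetical):

import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.gtEq;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.notEq;
import org.apache.parquet.filter2.predicate.FilterPredicate;

// Keep rows where id >= 100 and flags != 0; both column names are made up for illustration.
FilterPredicate pred = and(
    gtEq(longColumn("id"), 100L),
    notEq(longColumn("flags"), 0L));
long matching = countFilteredRecords(path, pred);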
 
Example 11
Source Project: parquet-mr   Source File: TestThriftToParquetFileWriter.java    License: Apache License 2.0
@Test
public void testWriteFile() throws IOException, InterruptedException, TException {
  final AddressBook a = new AddressBook(
      Arrays.asList(
          new Person(
              new Name("Bob", "Roberts"),
              0,
              "[email protected]",
              Arrays.asList(new PhoneNumber("1234567890")))));

  final Path fileToCreate = createFile(a);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  int i = 0;
  while((g = reader.read()) != null) {
    assertEquals(a.persons.size(), g.getFieldRepetitionCount("persons"));
    assertEquals(a.persons.get(0).email, g.getGroup("persons", 0).getGroup(0, 0).getString("email", 0));
    // just some sanity check, we're testing the various layers somewhere else
    ++i;
  }
  assertEquals("read 1 record", 1, i);

}
 
Example 12
Source Project: parquet-mr   Source File: TestThriftToParquetFileWriter.java    License: Apache License 2.0
@Test
public void testWriteFileListOfMap() throws IOException, InterruptedException, TException {
  Map<String, String> map1 = new HashMap<String,String>();
  map1.put("key11", "value11");
  map1.put("key12", "value12");
  Map<String, String> map2 = new HashMap<String,String>();
  map2.put("key21", "value21");
  final TestMapInList listMap = new TestMapInList("listmap",
      Arrays.asList(map1, map2));

  final Path fileToCreate = createFile(listMap);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  while((g = reader.read()) != null) {
    assertEquals(listMap.names.size(),
        g.getGroup("names", 0).getFieldRepetitionCount("names_tuple"));
    assertEquals(listMap.names.get(0).size(),
        g.getGroup("names", 0).getGroup("names_tuple", 0).getFieldRepetitionCount("map"));
    assertEquals(listMap.names.get(1).size(),
        g.getGroup("names", 0).getGroup("names_tuple", 1).getFieldRepetitionCount("map"));
  }
}
 
Example 13
Source Project: nifi   Source File: PutParquetTest.java    License: Apache License 2.0
private void verifyAvroParquetUsers(final Path avroParquetUsers, final int numExpectedUsers) throws IOException {
    final ParquetReader.Builder<GenericRecord> readerBuilder = AvroParquetReader
            .<GenericRecord>builder(avroParquetUsers)
            .withConf(testConf);

    int currUser = 0;

    try (final ParquetReader<GenericRecord> reader = readerBuilder.build()) {
        GenericRecord nextRecord;
        while((nextRecord = reader.read()) != null) {
            Assert.assertNotNull(nextRecord);
            Assert.assertEquals("name" + currUser, nextRecord.get("name").toString());
            Assert.assertEquals(currUser, nextRecord.get("favorite_number"));
            Assert.assertEquals("blue" + currUser, nextRecord.get("favorite_color").toString());
            currUser++;
        }
    }

    Assert.assertEquals(numExpectedUsers, currUser);
}
 
Example 14
Source Project: parquet-mr   Source File: TestThriftToParquetFileWriter.java    License: Apache License 2.0
@Test
public void testWriteFileMapOfLists() throws IOException, InterruptedException, TException {
  Map<List<String>, List<String>> map = new HashMap<List<String>,List<String>>();
  map.put(Arrays.asList("key1","key2"), Arrays.asList("val1","val2"));
  final TestListsInMap mapList = new TestListsInMap("maplists", map);
  final Path fileToCreate = createFile(mapList);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  while((g = reader.read()) != null) {
    assertEquals("key1",
        g.getGroup("names", 0).getGroup("map",0).getGroup("key", 0).getBinary("key_tuple", 0).toStringUsingUTF8());
    assertEquals("key2",
        g.getGroup("names", 0).getGroup("map",0).getGroup("key", 0).getBinary("key_tuple", 1).toStringUsingUTF8());
    assertEquals("val1",
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getBinary("value_tuple", 0).toStringUsingUTF8());
    assertEquals("val2",
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getBinary("value_tuple", 1).toStringUsingUTF8());
  }
}
 
Example 15
Source Project: parquet-mr   Source File: AvroTestUtil.java    License: Apache License 2.0
public static <D> List<D> read(Configuration conf, GenericData model, Schema schema, File file) throws IOException {
  List<D> data = new ArrayList<D>();
  AvroReadSupport.setRequestedProjection(conf, schema);
  AvroReadSupport.setAvroReadSchema(conf, schema);

  try (ParquetReader<D> fileReader = AvroParquetReader
    .<D>builder(HadoopInputFile.fromPath(new Path(file.toString()), conf))
    .withDataModel(model) // reflect disables compatibility
    .build()) {
    D datum;
    while ((datum = fileReader.read()) != null) {
      data.add(datum);
    }
  }

  return data;
}
 
Example 16
Source Project: parquet-mr   Source File: TestSpecificReadWrite.java    License: Apache License 2.0
@Test
public void testFilterOnSubAttribute() throws IOException {
  Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);

  ParquetReader<Car> reader = new AvroParquetReader<Car>(testConf, path, column("engine.type", equalTo(EngineType.DIESEL)));
  assertEquals(reader.read().toString(), getVwPassat().toString());
  assertNull(reader.read());

  reader = new AvroParquetReader<Car>(testConf, path, column("engine.capacity", equalTo(1.4f)));
  assertEquals(getVwPolo().toString(), reader.read().toString());
  assertNull(reader.read());

  reader = new AvroParquetReader<Car>(testConf, path, column("engine.hasTurboCharger", equalTo(true)));
  assertEquals(getBmwMini().toString(), reader.read().toString());
  assertNull(reader.read());
}
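This test uses the older org.apache.parquet.filter API (an UnboundRecordFilter built from column/equalTo). A hedged equivalent of the first reader using the newer filter2 API, assuming the enum is stored as a UTF-8 binary column, would look like:

import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.io.api.Binary;

ParquetReader<Car> reader = AvroParquetReader.<Car>builder(path)
    .withConf(testConf)
    .withFilter(FilterCompat.get(eq(binaryColumn("engine.type"), Binary.fromString("DIESEL"))))
    .build();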
 
Example 17
Source Project: parquet-mr   Source File: TestSpecificReadWrite.java    License: Apache License 2.0
@Test
public void testAvroReadSchema() throws IOException {
  Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);
  Configuration conf = new Configuration(testConf);
  AvroReadSupport.setAvroReadSchema(conf, NewCar.SCHEMA$);

  try(ParquetReader<NewCar> reader = new AvroParquetReader<>(conf, path)) {
    for (NewCar car = reader.read(); car != null; car = reader.read()) {
      assertNotNull(car.getEngine());
      assertNotNull(car.getBrand());
      assertEquals(2010, car.getYear());
      assertNotNull(car.getVin());
      assertNull(car.getDescription());
      assertEquals(5, car.getOpt());
    }
  }
}
 
Example 18
Source Project: parquet-mr   Source File: TestBackwardCompatibility.java    License: Apache License 2.0
@Test
public void testCompatStringCompatibility() throws IOException {
  // some older versions of Parquet used avro.schema instead of
  // parquet.avro.schema and didn't annotate binary with UTF8 when the type
  // was converted from an Avro string. this validates that the old read
  // schema is recognized and used to read the file as expected.
  Path testFile = new Path(Resources.getResource("strings-2.parquet").getFile());
  Configuration conf = new Configuration();
  ParquetReader<GenericRecord> reader = AvroParquetReader
      .builder(new AvroReadSupport<GenericRecord>(), testFile)
      .withConf(conf)
      .build();
  GenericRecord r;
  while ((r = reader.read()) != null) {
    Assert.assertTrue("Should read value into a String",
        r.get("text") instanceof String);
  }
}
 
Example 19
private GenericRecord readActualRecord(String parquetPath) throws IOException
{
    try (ParquetReader<GenericRecord> reader = AvroParquetReader
            .<GenericRecord>builder(
                    HadoopInputFile.fromPath(new Path(new File(parquetPath).toURI()), new Configuration()))
            .build())
    {
        return reader.read();
    }
}
 
Example 20
Source Project: flink   Source File: ParquetStreamingFileSinkITCase.java    License: Apache License 2.0
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
	InputFile inFile = HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(file.toURI()), new Configuration());

	ArrayList<T> results = new ArrayList<>();
	try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
		T next;
		while ((next = reader.read()) != null) {
			results.add(next);
		}
	}

	return results;
}
 
Example 21
Source Project: garmadon   Source File: ProtoParquetWriterWithOffsetTest.java    License: Apache License 2.0
private List<EventHeaderProtos.Header> checkSingleFileWithFileSystem(
    Collection<EventHeaderProtos.Header> inputHeaders) throws IOException {
    final List<EventHeaderProtos.Header> headers = new LinkedList<>();

    Path newTmpFile = new Path(tmpPath, "file");
    final ProtoParquetWriter<Message> writer = new ProtoParquetWriter<>(newTmpFile,
        EventHeaderProtos.Header.class);
    long offset = 1;
    final BiConsumer<String, String> protoMetadataWriter = mock(BiConsumer.class);

    final ProtoParquetWriterWithOffset consumer = new ProtoParquetWriterWithOffset<>(writer, newTmpFile, finalPath,
        localFs, new FixedOffsetComputer(FINAL_FILE_NAME, 123), UTC_EPOCH, "ignored",
        protoMetadataWriter, 1);

    for (EventHeaderProtos.Header header : inputHeaders) {
        consumer.write(1234567890L, header, new TopicPartitionOffset(TOPIC, 1, offset++));
    }
    consumer.close();

    final RemoteIterator<LocatedFileStatus> filesIterator = localFs.listFiles(finalPath, false);
    final LocatedFileStatus fileStatus = filesIterator.next();
    Assert.assertEquals(FINAL_FILE_NAME, fileStatus.getPath().getName());
    Assert.assertFalse("There should be only one output file", filesIterator.hasNext());

    final ParquetReader<EventHeaderProtos.Header.Builder> reader;
    reader = ProtoParquetReader.<EventHeaderProtos.Header.Builder>builder(fileStatus.getPath()).build();

    EventHeaderProtos.Header.Builder current = reader.read();
    while (current != null) {
        headers.add(current.build());
        current = reader.read();
    }

    return headers;
}
 
Example 22
Source Project: iceberg   Source File: ParquetIterable.java    License: Apache License 2.0
@Override
public CloseableIterator<T> iterator() {
  try {
    ParquetReader<T> reader = builder.build();
    addCloseable(reader);
    return new ParquetIterator<>(reader);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to create Parquet reader");
  }
}
 
Example 23
Source Project: pxf   Source File: ParquetFileAccessor.java    License: Apache License 2.0
/**
 * Opens the resource for read.
 *
 * @throws IOException if opening the resource failed
 */
@Override
public boolean openForRead() throws IOException {
    file = new Path(context.getDataSource());
    FileSplit fileSplit = HdfsUtilities.parseFileSplit(context);

    // Read the original schema from the parquet file
    MessageType originalSchema = getSchema(file, fileSplit);
    // Get a map of the column name to Types for the given schema
    Map<String, Type> originalFieldsMap = getOriginalFieldsMap(originalSchema);
    // Get the read schema. This is either the full set or a subset (in
    // case of column projection) of the greenplum schema.
    MessageType readSchema = buildReadSchema(originalFieldsMap, originalSchema);
    // Get the record filter in case of predicate push-down
    FilterCompat.Filter recordFilter = getRecordFilter(context.getFilterString(), originalFieldsMap, readSchema);

    // add column projection
    configuration.set(PARQUET_READ_SCHEMA, readSchema.toString());

    fileReader = ParquetReader.builder(new GroupReadSupport(), file)
            .withConf(configuration)
            // Create reader for a given split, read a range in file
            .withFileRange(fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength())
            .withFilter(recordFilter)
            .build();
    context.setMetadata(readSchema);
    return true;
}
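The column projection above works by narrowing the read schema through the parquet.read.schema property. A minimal standalone sketch of the same mechanism, with a hypothetical file path and column names:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Read only two of the file's columns; the schema and path are illustrative.
MessageType projection = MessageTypeParser.parseMessageType(
    "message projected { required int64 id; optional binary name (UTF8); }");
Configuration conf = new Configuration();
conf.set(ReadSupport.PARQUET_READ_SCHEMA, projection.toString());
try (ParquetReader<Group> reader =
    ParquetReader.builder(new GroupReadSupport(), new Path("/tmp/data.parquet"))
        .withConf(conf)
        .build()) {
  Group g;
  while ((g = reader.read()) != null) {
    // Only id and name are materialized for each record.
  }
}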
 
Example 24
Source Project: jstarcraft-ai   Source File: ParquetConverter.java    License: Apache License 2.0
@Override
public int convert(DataModule module, ParquetReader iterator) {
    try {
        return parseData(module, iterator);
    } catch (Exception exception) {
        // TODO: handle logging.
        throw new DataException(exception);
    }
}
 
Example 25
Source Project: arvo2parquet   Source File: DataLoad.java    License: MIT License
private static void readFromParquet(@Nonnull final Path filePathToRead) throws IOException {
  try (final ParquetReader<GenericData.Record> reader = AvroParquetReader
          .<GenericData.Record>builder(nioPathToInputFile(filePathToRead))
          .withConf(new Configuration())
          .build())
  {
    GenericData.Record record;
    while ((record = reader.read()) != null) {
      System.out.println(record);
    }
  }
}
 
Example 26
Source Project: kafka-connect-fs   Source File: ParquetFileReader.java    License: Apache License 2.0
private ParquetReader<GenericRecord> initReader() throws IOException {
    Configuration configuration = getFs().getConf();
    if (this.schema != null) {
        AvroReadSupport.setAvroReadSchema(configuration, this.schema);
    }
    if (this.projection != null) {
        AvroReadSupport.setRequestedProjection(configuration, this.projection);
    }
    return AvroParquetReader
            .<GenericRecord>builder(HadoopInputFile.fromPath(getFilePath(), configuration))
            .build();
}
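Usage note: the two Configuration hooks here serve different purposes. setRequestedProjection prunes which Parquet columns are read at all, while setAvroReadSchema sets the Avro schema that records are resolved against (for example, to apply defaults during schema evolution).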
 
Example 27
Source Project: iceberg   Source File: ParquetIterable.java    License: Apache License 2.0
@Override
public Iterator<T> iterator() {
  try {
    ParquetReader<T> reader = builder.build();
    addCloseable(reader);
    return new ParquetIterator<>(reader);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to create Parquet reader");
  }
}
 
Example 28
Source Project: streamx   Source File: ParquetFileReader.java    License: Apache License 2.0
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<>();
  ParquetReader.Builder<GenericRecord> builder = ParquetReader.builder(readSupport, path);
  ParquetReader<GenericRecord> parquetReader = builder.withConf(conf).build();
  GenericRecord record;
  Schema schema = null;
  while ((record = parquetReader.read()) != null) {
    schema = avroData.toConnectSchema(record.getSchema());
  }
  parquetReader.close();
  return schema;
}
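Scanning every record just to recover the schema reads the whole file. A hedged alternative sketch reads the schema from the footer metadata instead (this uses the long-standing, though now deprecated, readFooter API; the converted schema may differ cosmetically from the writer's original Avro schema):

import org.apache.avro.Schema;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

ParquetMetadata footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
MessageType parquetSchema = footer.getFileMetaData().getSchema();
Schema avroSchema = new AvroSchemaConverter(conf).convert(parquetSchema);
// avroData.toConnectSchema(avroSchema) would then yield the Connect schema without a full scan.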
 
Example 29
Source Project: streamx   Source File: ParquetFileReader.java    License: Apache License 2.0
@Override
public Collection<Object> readData(Configuration conf, Path path) throws IOException {
  Collection<Object> result = new ArrayList<>();
  AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<>();
  ParquetReader.Builder<GenericRecord> builder = ParquetReader.builder(readSupport, path);
  ParquetReader<GenericRecord> parquetReader = builder.withConf(conf).build();
  GenericRecord record;
  while ((record = parquetReader.read()) != null) {
    result.add(record);
  }
  parquetReader.close();
  return result;
}