org.apache.parquet.avro.AvroParquetReader Java Examples

The following examples show how to use org.apache.parquet.avro.AvroParquetReader. Each example is drawn from an open-source project; the source file, originating project, and license are noted above each snippet.
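Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them share: open a reader, call read() until it returns null, and let try-with-resources close it. The path users.parquet and the class name are placeholders.

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class ReadParquetSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("users.parquet"); // placeholder input file
        try (ParquetReader<GenericRecord> reader = AvroParquetReader
                .<GenericRecord>builder(HadoopInputFile.fromPath(path, conf))
                .withConf(conf)
                .build()) {
            GenericRecord record;
            while ((record = reader.read()) != null) { // read() returns null at end of file
                System.out.println(record);
            }
        }
    }
}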
Example #1
Source File: ParquetRecordReader.java    From nifi with Apache License 2.0
public ParquetRecordReader(final InputStream inputStream, final long inputLength, final Configuration configuration) throws IOException {
    if (inputLength < 0) {
        throw new IllegalArgumentException("Invalid input length of '" + inputLength + "'. This record reader requires knowing " +
                "the length of the InputStream and cannot be used in some cases where the length may not be known.");
    }

    this.inputStream = inputStream;

    inputFile = new NifiParquetInputFile(inputStream, inputLength);
    parquetReader = AvroParquetReader.<GenericRecord>builder(inputFile).withConf(configuration).build();

    // Read the first record so that we can extract the schema
    lastParquetRecord = parquetReader.read();
    if (lastParquetRecord == null) {
        throw new EOFException("Unable to obtain schema because no records were available");
    }

    // Convert Avro schema to RecordSchema
    recordSchema = AvroTypeUtil.createSchema(lastParquetRecord.getSchema());
}
 
Example #2
Source File: ParquetReader.java    From reef with Apache License 2.0
/**
 * Serialize Avro data to an in-memory ByteBuffer.
 * @return a ByteBuffer that contains the Avro data.
 * @throws IOException if the parquet file couldn't be parsed correctly.
 */
public ByteBuffer serializeToByteBuffer() throws IOException {
  final ByteArrayOutputStream stream = new ByteArrayOutputStream();
  final Encoder encoder = EncoderFactory.get().binaryEncoder(stream, null);
  final DatumWriter<GenericRecord> writer = new GenericDatumWriter<>();
  writer.setSchema(createAvroSchema());
  final AvroParquetReader<GenericRecord> reader = createAvroReader();

  GenericRecord record = reader.read();
  while (record != null) {
    writer.write(record, encoder);
    record = reader.read();
  }

  try {
    reader.close();
  } catch (IOException ex) {
    LOG.log(Level.SEVERE, ex.getMessage());
    throw ex;
  }

  encoder.flush();
  final ByteBuffer buf = ByteBuffer.wrap(stream.toByteArray());
  buf.order(ByteOrder.LITTLE_ENDIAN);
  return buf;
}
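For completeness, a minimal sketch of reading the records back out of the returned buffer. Avro's binary encoding carries no record count, so the loop drains the decoder until it reports end of input; the schema variable is assumed to be the same schema passed to writer.setSchema() above, and the snippet requires org.apache.avro.io.BinaryDecoder, DecoderFactory, DatumReader and org.apache.avro.generic.GenericDatumReader.

// Sketch: decode the buffer produced by serializeToByteBuffer().
// `schema` is assumed to match the schema used when writing.
BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(buf.array(), null);
DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(schema);
GenericRecord record = null;
while (!decoder.isEnd()) {
  record = datumReader.read(record, decoder); // reuse `record` to reduce allocations
  // process `record` here
}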
 
Example #3
Source File: ParquetFileSystemDatasetReader.java    From kite with Apache License 2.0
@Override
public void initialize() {
  Preconditions.checkState(state.equals(ReaderWriterState.NEW),
    "A reader may not be opened more than once - current state:%s", state);

  LOG.debug("Opening reader on path:{}", path);

  try {
    final Configuration conf = fileSystem.getConf();
    AvroReadSupport.setAvroReadSchema(conf, readerSchema);
    reader = new AvroParquetReader<E>(
        conf, fileSystem.makeQualified(path));
  } catch (IOException e) {
    throw new DatasetIOException("Unable to create reader path:" + path, e);
  }

  advance();

  state = ReaderWriterState.OPEN;
}
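The two-argument AvroParquetReader constructor used above is deprecated in recent parquet-avro releases. A rough builder-based equivalent, assuming the reader field is typed as ParquetReader<E> and HadoopInputFile is imported, might look like this sketch (not Kite's actual code):

// Sketch of a builder-based replacement for the deprecated constructor.
AvroReadSupport.setAvroReadSchema(conf, readerSchema);
reader = AvroParquetReader.<E>builder(
        HadoopInputFile.fromPath(fileSystem.makeQualified(path), conf))
    .withConf(conf)
    .build();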
 
Example #4
Source File: PutParquetTest.java    From nifi with Apache License 2.0
private void verifyAvroParquetUsers(final Path avroParquetUsers, final int numExpectedUsers) throws IOException {
    final ParquetReader.Builder<GenericRecord> readerBuilder = AvroParquetReader
            .<GenericRecord>builder(avroParquetUsers)
            .withConf(testConf);

    int currUser = 0;

    try (final ParquetReader<GenericRecord> reader = readerBuilder.build()) {
        GenericRecord nextRecord;
        while ((nextRecord = reader.read()) != null) {
            Assert.assertNotNull(nextRecord);
            Assert.assertEquals("name" + currUser, nextRecord.get("name").toString());
            Assert.assertEquals(currUser, nextRecord.get("favorite_number"));
            Assert.assertEquals("blue" + currUser, nextRecord.get("favorite_color").toString());
            currUser++;
        }
    }

    Assert.assertEquals(numExpectedUsers, currUser);
}
 
Example #5
Source File: ParquetHdfsDataWriterTest.java    From incubator-gobblin with Apache License 2.0
private List<TestRecord> readParquetFilesAvro(File outputFile)
    throws IOException {
  ParquetReader<org.apache.gobblin.test.avro.TestRecord> reader = null;
  List<TestRecord> records = new ArrayList<>();
  try {
    reader = new AvroParquetReader<>(new Path(outputFile.toString()));
    for (org.apache.gobblin.test.avro.TestRecord value = reader.read(); value != null; value = reader.read()) {
      records.add(new TestRecord(value.getPartition(),
          value.getSequence(),
          value.getPayload()));
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records;
}
 
Example #6
Source File: ParquetIO.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext processContext) throws Exception {
  FileIO.ReadableFile file = processContext.element();

  if (!file.getMetadata().isReadSeekEfficient()) {
    ResourceId filename = file.getMetadata().resourceId();
    throw new RuntimeException(String.format("File has to be seekable: %s", filename));
  }

  SeekableByteChannel seekableByteChannel = file.openSeekable();

  AvroParquetReader.Builder builder =
      AvroParquetReader.<GenericRecord>builder(new BeamParquetInputFile(seekableByteChannel));
  if (modelClass != null) {
    // all GenericData implementations have a static get method
    builder = builder.withDataModel((GenericData) modelClass.getMethod("get").invoke(null));
  }

  try (ParquetReader<GenericRecord> reader = builder.build()) {
    GenericRecord read;
    while ((read = reader.read()) != null) {
      processContext.output(read);
    }
  }
}
 
Example #7
Source File: LargeInputFileIT.java    From datacollector with Apache License 2.0
public void validateParquetFile(Path parquetFile, long recordCount) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
    .build();

  for (long i = 0; i < recordCount; i++) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + i, actualRow);

    Assert.assertEquals("Value different in row " + i + " for key b", actualRow.get("b"), i % 2 == 0);
    Assert.assertEquals("Value different in row " + i + " for key s", actualRow.get("s"), new Utf8(String.valueOf(i)));
    Assert.assertEquals("Value different in row " + i + " for key l", actualRow.get("l"), i);
    Assert.assertEquals("Value different in row " + i + " for key l100", actualRow.get("l100"), i % 100);
    Assert.assertEquals("Value different in row " + i + " for key s100", actualRow.get("s100"), new Utf8(String.valueOf(i % 100)));
  }

  Assert.assertNull("Parquet file contains more rows than expected", reader.read());
}
 
Example #8
Source File: BaseAvroParquetConvertIT.java    From datacollector with Apache License 2.0
public void validateParquetFile(Path parquetFile, List<Map<String, Object>> data) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
    .build();

  int position = 0;
  for (Map<String, Object> expectedRow : data) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + position, actualRow);

    for (Map.Entry<String, Object> entry : expectedRow.entrySet()) {
      Object value = actualRow.get(entry.getKey());
      Assert.assertEquals("Different value on row " + position + " for key " + entry.getKey(), entry.getValue(), value);
    }

    // Advance the row counter so assertion messages point at the right row.
    position++;
  }

  Assert.assertNull("Parquet file contains more rows than expected", reader.read());
}
 
Example #9
Source File: MiniDfsResource.java    From components with Apache License 2.0
/**
 * Tests that a file on the HDFS cluster contains the given Avro records.
 *
 * @param fs the file system on which to look up the path
 * @param path the name of the file on the HDFS cluster
 * @param expected the Avro records expected in the file
 * @param part whether this call validates a single part file of a directory (defers the final assertion)
 */
public static void assertReadParquetFile(FileSystem fs, String path, Set<IndexedRecord> expected, boolean part) throws IOException {
    Path p = new Path(path);
    if (fs.isFile(p)) {
        try (AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(fs.getConf(), new Path(path))) {
            IndexedRecord record = null;
            while (null != (record = reader.read())) {
                IndexedRecord eqRecord = null;
                for (IndexedRecord indexedRecord : expected) {
                    if (indexedRecord.equals(record)) {
                        eqRecord = indexedRecord;
                        break;
                    }
                }
                expected.remove(eqRecord);
            }
        }
        // Check before asserting for the message.
        if (!part && expected.size() != 0)
            assertThat("Not all avro records found: " + expected.iterator().next(), expected, hasSize(0));
    } else if (fs.isDirectory(p)) {
        for (FileStatus fstatus : FileSystemUtil.listSubFiles(fs, p)) {
            assertReadParquetFile(fs, fstatus.getPath().toString(), expected, true);
        }
        // Check before asserting for the message.
        if (expected.size() != 0)
            assertThat("Not all avro records found: " + expected.iterator().next(), expected, hasSize(0));
    } else {
        fail("No such path: " + path);
    }
}
 
Example #10
Source File: TestParquetRecordSetWriter.java    From nifi with Apache License 2.0
private void verifyParquetRecords(final File parquetFile, final int expectedRecordCount) throws IOException {
    final Configuration conf = new Configuration();
    final Path path = new Path(parquetFile.getPath());
    final InputFile inputFile = HadoopInputFile.fromPath(path, conf);

    try (final ParquetReader<GenericRecord> reader =
            AvroParquetReader.<GenericRecord>builder(inputFile).withConf(conf).build()) {

        int recordCount = 0;
        while (reader.read() != null) {
            recordCount++;
        }
        assertEquals(expectedRecordCount, recordCount);
    }
}
 
Example #11
Source File: ParquetUtils.java    From incubator-pinot with Apache License 2.0
/**
 * Returns a ParquetReader with the given path.
 */
public static ParquetReader<GenericRecord> getParquetReader(Path path)
    throws IOException {
  //noinspection unchecked
  return AvroParquetReader.<GenericRecord>builder(path).disableCompatibility().withDataModel(GenericData.get())
      .withConf(getConfiguration()).build();
}
 
Example #12
Source File: AvroParquetFileReaderWriterFactory.java    From secor with Apache License 2.0
public AvroParquetFileReader(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
    Path path = new Path(logFilePath.getLogFilePath());
    topic = logFilePath.getTopic();
    Schema schema = schemaRegistry.getSchema(topic);
    reader = AvroParquetReader.<GenericRecord>builder(path).build();
    writer = new SpecificDatumWriter<>(schema);
    offset = logFilePath.getOffset();
}
 
Example #13
Source File: ParquetStreamingFileSinkITCase.java    From flink with Apache License 2.0
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
	InputFile inFile = HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(file.toURI()), new Configuration());

	ArrayList<T> results = new ArrayList<>();
	try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
		T next;
		while ((next = reader.read()) != null) {
			results.add(next);
		}
	}

	return results;
}
 
Example #14
Source File: ConvertCsvToParquetFileExpressionProcessorTests.java    From vividus with Apache License 2.0
private GenericRecord readActualRecord(String parquetPath) throws IOException
{
    try (ParquetReader<GenericRecord> reader = AvroParquetReader
            .<GenericRecord>builder(
                    HadoopInputFile.fromPath(new Path(new File(parquetPath).toURI()), new Configuration()))
            .build())
    {
        return reader.read();
    }
}
 
Example #15
Source File: TestParquetInLining.java    From hudi with Apache License 2.0
@Test
public void testSimpleInlineFileSystem() throws IOException {
  Path outerInMemFSPath = getRandomOuterInMemPath();
  Path outerPath = new Path(FILE_SCHEME + outerInMemFSPath.toString().substring(outerInMemFSPath.toString().indexOf(':')));
  generatedPath = outerPath;
  ParquetWriter<GenericRecord> inlineWriter = new AvroParquetWriter<>(outerInMemFSPath, HoodieTestDataGenerator.AVRO_SCHEMA,
      CompressionCodecName.GZIP, 100 * 1024 * 1024, 1024 * 1024, true, inMemoryConf);
  // write few records
  List<GenericRecord> recordsToWrite = getParquetHoodieRecords();
  for (GenericRecord rec : recordsToWrite) {
    inlineWriter.write(rec);
  }
  inlineWriter.close();
  byte[] inlineBytes = getBytesToInline(outerInMemFSPath);
  long startOffset = generateOuterFile(outerPath, inlineBytes);

  long inlineLength = inlineBytes.length;

  // Generate phantom inline file
  Path inlinePath = getPhantomFile(outerPath, startOffset, inlineLength);

  // instantiate Parquet reader
  ParquetReader inLineReader = AvroParquetReader.builder(inlinePath).withConf(inlineConf).build();
  List<GenericRecord> records = readParquetGenericRecords(inLineReader);
  assertArrayEquals(recordsToWrite.toArray(), records.toArray());
  inLineReader.close();
}
 
Example #16
Source File: ParquetUtils.java    From hudi with Apache License 2.0
/**
 * Fetch {@link HoodieKey}s from the given parquet file.
 *
 * @param filePath      The parquet file path.
 * @param configuration configuration to build fs object
 * @return {@link List} of {@link HoodieKey}s fetched from the parquet file
 */
public static List<HoodieKey> fetchRecordKeyPartitionPathFromParquet(Configuration configuration, Path filePath) {
  List<HoodieKey> hoodieKeys = new ArrayList<>();
  try {
    if (!filePath.getFileSystem(configuration).exists(filePath)) {
      return new ArrayList<>();
    }

    Configuration conf = new Configuration(configuration);
    conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
    Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
    AvroReadSupport.setAvroReadSchema(conf, readSchema);
    AvroReadSupport.setRequestedProjection(conf, readSchema);
    // Use try-with-resources so the reader is closed, and advance the cursor outside
    // the instanceof check so an unexpected value cannot cause an infinite loop.
    try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build()) {
      Object obj = reader.read();
      while (obj != null) {
        if (obj instanceof GenericRecord) {
          String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
          String partitionPath = ((GenericRecord) obj).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
          hoodieKeys.add(new HoodieKey(recordKey, partitionPath));
        }
        obj = reader.read();
      }
    }
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read from Parquet file " + filePath, e);
  }
  return hoodieKeys;
}
 
Example #17
Source File: ParquetUtils.java    From hudi with Apache License 2.0
/**
 * Read the row keys matching the given filter from the given parquet file. If the filter is empty, this
 * returns all row keys.
 *
 * @param filePath      The parquet file path.
 * @param configuration configuration to build fs object
 * @param filter        record keys filter
 * @param readSchema    schema of columns to be read
 * @return set of row keys matching the given filter
 */
private static Set<String> filterParquetRowKeys(Configuration configuration, Path filePath, Set<String> filter,
                                                Schema readSchema) {
  Option<RecordKeysFilterFunction> filterFunction = Option.empty();
  if (filter != null && !filter.isEmpty()) {
    filterFunction = Option.of(new RecordKeysFilterFunction(filter));
  }
  Configuration conf = new Configuration(configuration);
  conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
  AvroReadSupport.setAvroReadSchema(conf, readSchema);
  AvroReadSupport.setRequestedProjection(conf, readSchema);
  Set<String> rowKeys = new HashSet<>();
  try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build()) {
    Object obj = reader.read();
    while (obj != null) {
      if (obj instanceof GenericRecord) {
        String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
        if (!filterFunction.isPresent() || filterFunction.get().apply(recordKey)) {
          rowKeys.add(recordKey);
        }
      }
      obj = reader.read();
    }
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read row keys from Parquet " + filePath, e);
  }
  return rowKeys;
}
 
Example #18
Source File: ParquetFileReader.java    From kafka-connect-fs with Apache License 2.0
private ParquetReader<GenericRecord> initReader() throws IOException {
    Configuration configuration = getFs().getConf();
    if (this.schema != null) {
        AvroReadSupport.setAvroReadSchema(configuration, this.schema);
    }
    if (this.projection != null) {
        AvroReadSupport.setRequestedProjection(configuration, this.projection);
    }
    return AvroParquetReader
            .<GenericRecord>builder(HadoopInputFile.fromPath(getFilePath(), configuration))
            .build();
}
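The setRequestedProjection call above is what keeps this reader cheap on wide files: parquet-avro then materializes only the projected columns. As an illustration, a projection schema for a single field could be built with Avro's SchemaBuilder; the record and field names here are hypothetical:

// Hypothetical projection: read only the "name" column of a "User" record.
Schema projection = SchemaBuilder.record("User").fields()
    .optionalString("name")
    .endRecord();
AvroReadSupport.setRequestedProjection(configuration, projection);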
 
Example #19
Source File: DataLoad.java    From arvo2parquet with MIT License
private static void readFromParquet(@Nonnull final Path filePathToRead) throws IOException {
  try (final ParquetReader<GenericData.Record> reader = AvroParquetReader
          .<GenericData.Record>builder(nioPathToInputFile(filePathToRead))
          .withConf(new Configuration())
          .build())
  {
    GenericData.Record record;
    while ((record = reader.read()) != null) {
      System.out.println(record);
    }
  }
}
 
Example #20
Source File: ParquetStreamingFileSinkITCase.java    From Flink-CEPplus with Apache License 2.0
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
	InputFile inFile = HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(file.toURI()), new Configuration());

	ArrayList<T> results = new ArrayList<>();
	try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
		T next;
		while ((next = reader.read()) != null) {
			results.add(next);
		}
	}

	return results;
}
 
Example #21
Source File: HoodieParquetReader.java    From hudi with Apache License 2.0
@Override
public Iterator<R> getRecordIterator(Schema schema) throws IOException {
  AvroReadSupport.setAvroReadSchema(conf, schema);
  ParquetReader<IndexedRecord> reader = AvroParquetReader.<IndexedRecord>builder(path).withConf(conf).build();
  return new ParquetReaderIterator(reader);
}
 
Example #22
Source File: FetchParquet.java    From nifi with Apache License 2.0
@Override
public HDFSRecordReader createHDFSRecordReader(final ProcessContext context, final FlowFile flowFile, final Configuration conf, final Path path)
        throws IOException {
    final ParquetReader.Builder<GenericRecord> readerBuilder = AvroParquetReader.<GenericRecord>builder(path).withConf(conf);
    return new AvroParquetHDFSRecordReader(readerBuilder.build());
}
 
Example #23
Source File: ParquetReader.java    From reef with Apache License 2.0
/**
 * Construct an Avro reader from a Parquet file.
 * @return an Avro reader based on the provided Parquet file.
 * @throws IOException if the parquet file couldn't be parsed correctly.
 */
private AvroParquetReader<GenericRecord> createAvroReader() throws IOException {
  return new AvroParquetReader<GenericRecord>(parquetFilePath);
}