org.apache.parquet.hadoop.util.HadoopInputFile Java Examples

The following examples show how to use org.apache.parquet.hadoop.util.HadoopInputFile. Each example is taken from an open-source project; the source file and license are noted above the code.
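Most of the examples below follow the same basic pattern: wrap a Hadoop Path in a HadoopInputFile via HadoopInputFile.fromPath(path, conf) and hand the resulting InputFile to a reader or reader builder. The class below is a minimal sketch of that pattern, not taken from any of the listed projects; the file location is a placeholder.

import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class HadoopInputFileSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // HadoopInputFile adapts a Hadoop FileSystem path to Parquet's InputFile abstraction.
    HadoopInputFile inputFile = HadoopInputFile.fromPath(new Path("/tmp/example.parquet"), conf);

    // Any InputFile-based builder accepts it; AvroParquetReader is used here, as in several examples below.
    try (ParquetReader<GenericRecord> reader =
             AvroParquetReader.<GenericRecord>builder(inputFile).build()) {
      GenericRecord record;
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    }
  }
}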
Example #1
Source File: TransCompressionCommand.java    From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
    "Both input and output parquet file paths are required.");

  Preconditions.checkArgument(codec != null,
    "The codec cannot be null");

  Path inPath = new Path(input);
  Path outPath = new Path(output);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
 
Example #2
Source File: AvroTestUtil.java    From parquet-mr with Apache License 2.0
public static <D> List<D> read(Configuration conf, GenericData model, Schema schema, File file) throws IOException {
  List<D> data = new ArrayList<D>();
  AvroReadSupport.setRequestedProjection(conf, schema);
  AvroReadSupport.setAvroReadSchema(conf, schema);

  try (ParquetReader<D> fileReader = AvroParquetReader
    .<D>builder(HadoopInputFile.fromPath(new Path(file.toString()), conf))
    .withDataModel(model) // reflect disables compatibility
    .build()) {
    D datum;
    while ((datum = fileReader.read()) != null) {
      data.add(datum);
    }
  }

  return data;
}
 
Example #3
Source File: TestColumnIndexes.java    From parquet-mr with Apache License 2.0
@Test
public void testColumnIndexes() throws IOException {
  LOGGER.info("Starting test with context: {}", context);

  Path file = null;
  try {
    file = context.write(new Path(tmp.getRoot().getAbsolutePath()));
    LOGGER.info("Parquet file \"{}\" is successfully created for the context: {}", file, context);

    List<ContractViolation> violations = ColumnIndexValidator
        .checkContractViolations(HadoopInputFile.fromPath(file, new Configuration()));
    assertTrue(violations.toString(), violations.isEmpty());
  } finally {
    if (file != null) {
      file.getFileSystem(new Configuration()).delete(file, false);
    }
  }
}
 
Example #4
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param footer a {@link ParquetMetadata} footer already read from the file
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
  this.converter = new ParquetMetadataConverter(conf);
  this.file = HadoopInputFile.fromPath(file, conf);
  this.f = this.file.newStream();
  this.options = HadoopReadOptions.builder(conf).build();
  this.footer = footer;
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
 
Example #5
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
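Both constructors above are deprecated. The InputFile-based constructor used in the Flink and garmadon examples further down is the current entry point; the method below is a minimal, hypothetical sketch of the equivalent call (imports mirror the surrounding examples).

public static ParquetFileReader openReader(Path path, Configuration conf) throws IOException {
  // Wrap the path in a HadoopInputFile instead of passing Configuration and Path separately.
  ParquetReadOptions options = HadoopReadOptions.builder(conf).build();
  return new ParquetFileReader(HadoopInputFile.fromPath(path, conf), options);
}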
 
Example #6
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
public ParquetReader<T> build() throws IOException {
  ParquetReadOptions options = optionsBuilder.build();

  if (path != null) {
    FileSystem fs = path.getFileSystem(conf);
    FileStatus stat = fs.getFileStatus(path);

    if (stat.isFile()) {
      return new ParquetReader<>(
          Collections.singletonList((InputFile) HadoopInputFile.fromStatus(stat, conf)),
          options,
          getReadSupport());

    } else {
      List<InputFile> files = new ArrayList<>();
      for (FileStatus fileStatus : fs.listStatus(path, HiddenFileFilter.INSTANCE)) {
        files.add(HadoopInputFile.fromStatus(fileStatus, conf));
      }
      return new ParquetReader<T>(files, options, getReadSupport());
    }

  } else {
    return new ParquetReader<>(Collections.singletonList(file), options, getReadSupport());
  }
}
 
Example #7
Source File: TransCompressionCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example #8
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
protected Builder(InputFile file) {
  this.readSupport = null;
  this.file = Objects.requireNonNull(file, "file cannot be null");
  this.path = null;
  if (file instanceof HadoopInputFile) {
    this.conf = ((HadoopInputFile) file).getConfiguration();
  } else {
    this.conf = new Configuration();
  }
  optionsBuilder = HadoopReadOptions.builder(conf);
}
 
Example #9
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testReadSimpleGroup() throws IOException {
	Long[] array = {1L};
	GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
		.set("bar", "test")
		.set("foo", 32L)
		.set("arr", array).build();

	Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(3, row.getArity());
	assertEquals(32L, row.getField(0));
	assertEquals("test", row.getField(1));
	assertArrayEquals(array, (Long[]) row.getField(2));
	assertTrue(rowReader.reachEnd());
}
 
Example #10
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testReadMultipleSimpleGroup() throws IOException {
	Long[] array = {1L};

	List<IndexedRecord> records = new ArrayList<>();
	for (int i = 0; i < 100; i++) {
		GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
			.set("bar", "test")
			.set("foo", i)
			.set("arr", array).build();
		records.add(record);
	}

	Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, records);
	MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	for (long i = 0; i < 100; i++) {
		assertFalse(rowReader.reachEnd());
		Row row = rowReader.nextRecord();
		assertEquals(3, row.getArity());
		assertEquals(i, row.getField(0));
		assertEquals("test", row.getField(1));
		assertArrayEquals(array, (Long[]) row.getField(2));
	}

	assertTrue(rowReader.reachEnd());
}
 
Example #11
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testReadNestedGroup() throws IOException {
	Schema schema = unWrapSchema(NESTED_SCHEMA.getField("bar").schema());
	GenericData.Record barRecord = new GenericRecordBuilder(schema)
		.set("spam", 31L).build();

	GenericData.Record record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("bar", barRecord)
		.build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());
	assertEquals(32L, row.getField(0));
	assertEquals(31L, ((Row) row.getField(2)).getField(0));
	assertTrue(rowReader.reachEnd());
}
 
Example #12
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testMapGroup() throws IOException {
	Preconditions.checkState(unWrapSchema(NESTED_SCHEMA.getField("spamMap").schema())
		.getType().equals(Schema.Type.MAP));
	ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
	map.put("testKey", "testValue");

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("spamMap", map.build())
		.build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(32L, row.getField(0));
	Map<?, ?> result = (Map<?, ?>) row.getField(1);
	assertEquals("testValue", result.get("testKey").toString());
	assertTrue(rowReader.reachEnd());
}
 
Example #13
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testNestedMapGroup() throws IOException {
	Schema nestedMapSchema = unWrapSchema(NESTED_SCHEMA.getField("nestedMap").schema());
	Preconditions.checkState(nestedMapSchema.getType().equals(Schema.Type.MAP));

	Schema mapValueSchema = nestedMapSchema.getValueType();
	GenericRecord mapValue = new GenericRecordBuilder(mapValueSchema)
		.set("type", "nested")
		.set("value", "nested_value").build();

	ImmutableMap.Builder<String, GenericRecord> map = ImmutableMap.builder();
	map.put("testKey", mapValue);

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("nestedMap", map.build())
		.set("foo", 34L).build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(34L, row.getField(0));
	Map result = (Map) row.getField(5);

	Row nestedRow = (Row) result.get("testKey");
	assertEquals("nested", nestedRow.getField(0));
	assertEquals("nested_value", nestedRow.getField(1));
}
 
Example #14
Source File: PruneColumnsCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  List<String> args = options.getArgList();
  Path inputFile = new Path(args.get(0));
  Path outputFile = new Path(args.get(1));
  List<String> cols = args.subList(2, args.size());

  Set<ColumnPath> prunePaths = convertToColumnPaths(cols);

  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, inputFile, ParquetMetadataConverter.NO_FILTER);
  FileMetaData metaData = pmd.getFileMetaData();
  MessageType schema = metaData.getSchema();
  List<String> paths = new ArrayList<>();
  getPaths(schema, paths, null);

  for (String col : cols) {
    if (!paths.contains(col)) {
      LOG.warn("Input column name {} doesn't show up in the schema of file {}", col, inputFile.getName());
    }
  }

  ParquetFileWriter writer = new ParquetFileWriter(conf,
    pruneColumnsInSchema(schema, prunePaths), outputFile, ParquetFileWriter.Mode.CREATE);

  writer.start();
  writer.appendFile(HadoopInputFile.fromPath(inputFile, conf));
  writer.end(metaData.getKeyValueMetaData());
}
 
Example #15
Source File: MergeCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  // Prepare arguments
  List<String> args = options.getArgList();
  List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
  Path outputFile = new Path(args.get(args.size() - 1));

  // Merge schema and extraMeta
  FileMetaData mergedMeta = mergedMetadata(inputFiles);
  PrintWriter out = new PrintWriter(Main.out, true);

  // Merge data
  ParquetFileWriter writer = new ParquetFileWriter(conf,
          mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  boolean tooSmallFilesMerged = false;
  for (Path input: inputFiles) {
    if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
      out.format("Warning: file %s is too small, length: %d\n",
        input,
        input.getFileSystem(conf).getFileStatus(input).getLen());
      tooSmallFilesMerged = true;
    }

    writer.appendFile(HadoopInputFile.fromPath(input, conf));
  }

  if (tooSmallFilesMerged) {
    out.println("Warning: you merged too small files. " +
      "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, " +
      "which usually leads to bad query performance!");
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
 
Example #16
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0
public ParquetReader.Builder<Group> createReaderBuilder() throws IOException {
  ReadConfigurator readConfigurator = getReadConfigurator();
  return readConfigurator.configureBuilder(
      new ParquetReader.Builder<Group>(HadoopInputFile.fromPath(file, new Configuration())) {
        @Override
        protected ReadSupport<Group> getReadSupport() {
          return new GroupReadSupport();
        }
      }.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString()));
}
 
Example #17
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      FilterCompat.Filter filter) throws IOException {
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
 
Example #18
Source File: ConvertCsvToParquetFileExpressionProcessorTests.java    From vividus with Apache License 2.0
private GenericRecord readActualRecord(String parquetPath) throws IOException
{
    try (ParquetReader<GenericRecord> reader = AvroParquetReader
            .<GenericRecord>builder(
                    HadoopInputFile.fromPath(new Path(new File(parquetPath).toURI()), new Configuration()))
            .build())
    {
        return reader.read();
    }
}
 
Example #19
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads the meta data block in the footer of the file using provided input stream
 * @param file a {@link InputFile} to read
 * @param filter the filter to apply to row groups
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 * @deprecated will be removed in 2.0.0;
 *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
 */
@Deprecated
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter) throws IOException {
  ParquetReadOptions options;
  if (file instanceof HadoopInputFile) {
    options = HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
        .withMetadataFilter(filter).build();
  } else {
    options = ParquetReadOptions.builder().withMetadataFilter(filter).build();
  }

  try (SeekableInputStream in = file.newStream()) {
    return readFooter(file, options, in);
  }
}
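As the deprecation note above suggests, ParquetFileReader.open(InputFile, ParquetReadOptions) is the replacement for this static readFooter. The method below is a minimal, hypothetical sketch of that route; reader.getFooter() returns the same ParquetMetadata, as Examples #20 and #21 also show (imports mirror the surrounding examples).

public static ParquetMetadata readFooterViaOpen(Path path, Configuration conf, MetadataFilter filter) throws IOException {
  ParquetReadOptions options = HadoopReadOptions.builder(conf)
      .withMetadataFilter(filter)
      .build();
  // ParquetFileReader is Closeable, so try-with-resources releases the underlying stream.
  try (ParquetFileReader reader =
           ParquetFileReader.open(HadoopInputFile.fromPath(path, conf), options)) {
    return reader.getFooter();
  }
}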
 
Example #20
Source File: ParquetRecordReader.java    From parquet-mr with Apache License 2.0
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration) throws IOException {
  Path path = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();

  // if task.side.metadata is set, rowGroupOffsets is null
  ParquetReadOptions.Builder optionsBuilder = HadoopReadOptions.builder(configuration);
  if (rowGroupOffsets != null) {
    optionsBuilder.withOffsets(rowGroupOffsets);
  } else {
    optionsBuilder.withRange(split.getStart(), split.getEnd());
  }

  // open a reader with the metadata filter
  ParquetFileReader reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(path, configuration), optionsBuilder.build());

  if (rowGroupOffsets != null) {
    // verify a row group was found for each offset
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    if (blocks.size() != rowGroupOffsets.length) {
      throw new IllegalStateException(
          "All of the offsets in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks);
    }
  }

  if (!reader.getRowGroups().isEmpty()) {
    checkDeltaByteArrayProblem(
        reader.getFooter().getFileMetaData(), configuration,
        reader.getRowGroups().get(0));
  }

  internalReader.initialize(reader, configuration);
}
 
Example #21
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testParquetFileWithBloomFilter() throws IOException {
  MessageType schema = Types.buildMessage().
    required(BINARY).as(stringType()).named("name").named("msg");

  String[] testNames = {"hello", "parquet", "bloom", "filter"};
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);

  GroupFactory factory = new SimpleGroupFactory(schema);
  File file = temp.newFile();
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
    .withPageRowCountLimit(10)
    .withConf(conf)
    .withDictionaryEncoding(false)
    .withBloomFilterEnabled("name", true)
    .build()) {
    for (String testName : testNames) {
      writer.write(factory.newGroup().append("name", testName));
    }
  }

  ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()));
  BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
  BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData)
    .readBloomFilter(blockMetaData.getColumns().get(0));

  for (String name: testNames) {
    assertTrue(bloomFilter.findHash(
      LongHashFunction.xx(0).hashBytes(Binary.fromString(name).toByteBuffer())));
  }
}
 
Example #22
Source File: TestParquetRecordSetWriter.java    From nifi with Apache License 2.0
private void verifyParquetRecords(final File parquetFile, final int expectedRecordCount) throws IOException {
    final Configuration conf = new Configuration();
    final Path path = new Path(parquetFile.getPath());
    final InputFile inputFile = HadoopInputFile.fromPath(path, conf);

    try (final ParquetReader<GenericRecord> reader =
            AvroParquetReader.<GenericRecord>builder(inputFile).withConf(conf).build()){

        int recordCount = 0;
        while(reader.read() != null) {
            recordCount++;
        }
        assertEquals(expectedRecordCount, recordCount);
    }
}
 
Example #23
Source File: ParquetFileLineFetcher.java    From hugegraph-loader with Apache License 2.0
@Override
public void openReader(Readable readable) {
    Path path = new Path(this.source().path());
    try {
        HadoopInputFile file = HadoopInputFile.fromPath(path, this.conf);
        this.reader = ParquetFileReader.open(file);
        this.schema = this.reader.getFooter().getFileMetaData().getSchema();
        this.columnIO = new ColumnIOFactory().getColumnIO(this.schema);
    } catch (IOException e) {
        throw new LoadException("Failed to open parquet reader for '%s'",
                                e, readable);
    }
    this.resetOffset();
}
 
Example #24
Source File: ParquetStreamingFileSinkITCase.java    From flink with Apache License 2.0
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
	InputFile inFile = HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(file.toURI()), new Configuration());

	ArrayList<T> results = new ArrayList<>();
	try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
		T next;
		while ((next = reader.read()) != null) {
			results.add(next);
		}
	}

	return results;
}
 
Example #25
Source File: ProtoParquetWriterWithOffsetTest.java    From garmadon with Apache License 2.0
private void checkFileLatestCommittedTimestamp(Path p, long timestamp) throws IOException {
    ParquetFileReader reader = new ParquetFileReader(
        HadoopInputFile.fromPath(p, new Configuration()),
        ParquetReadOptions.builder().build()
    );
    String actualTimestamp = reader.getFooter().getFileMetaData().getKeyValueMetaData().get(ProtoParquetWriterWithOffset.LATEST_TIMESTAMP_META_KEY);
    assertThat(actualTimestamp, is(String.valueOf(timestamp)));
}