org.apache.parquet.avro.AvroSchemaConverter Java Examples

The following examples show how to use org.apache.parquet.avro.AvroSchemaConverter. Each example notes the source file and the open-source project it comes from.
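
As a quick orientation before the examples, here is a minimal sketch of the converter's two directions (the record schema below is made up for illustration): convert(Schema) turns an Avro schema into a Parquet MessageType, and convert(MessageType) maps a Parquet schema back to Avro.

import org.apache.avro.Schema;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.schema.MessageType;

public class AvroSchemaConverterSketch {
  public static void main(String[] args) {
    // Illustrative Avro schema; any record schema works the same way.
    Schema avroSchema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
            + "{\"name\":\"id\",\"type\":\"long\"},"
            + "{\"name\":\"name\",\"type\":\"string\"}]}");

    AvroSchemaConverter converter = new AvroSchemaConverter();

    // Avro -> Parquet: the direction used by the write-support examples below.
    MessageType parquetSchema = converter.convert(avroSchema);

    // Parquet -> Avro: the direction used when recovering a schema from a file footer.
    Schema roundTripped = converter.convert(parquetSchema);

    System.out.println(parquetSchema);
    System.out.println(roundTripped.toString(true));
  }
}
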
Example #1
Source File: HoodieFileWriterFactory.java    From hudi with Apache License 2.0
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
    String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
    SparkTaskContextSupplier sparkTaskContextSupplier) throws IOException {
  BloomFilter filter = BloomFilterFactory
      .createBloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP(),
          config.getDynamicBloomFilterMaxNumEntries(),
          config.getBloomFilterType());
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);

  HoodieParquetConfig parquetConfig = new HoodieParquetConfig(writeSupport, config.getParquetCompressionCodec(),
      config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
      hoodieTable.getHadoopConf(), config.getParquetCompressionRatio());

  return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, sparkTaskContextSupplier);
}
 
Example #2
Source File: Schemas.java    From kite with Apache License 2.0
public static Schema fromParquet(FileSystem fs, Path location) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), location);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
 
Example #3
Source File: ParquetUtils.java    From nifi with Apache License 2.0
public static void applyCommonConfig(Configuration conf, ParquetConfig parquetConfig) {
    if (parquetConfig.getAvroReadCompatibility() != null) {
        conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY,
                parquetConfig.getAvroReadCompatibility().booleanValue());
    }

    if (parquetConfig.getAvroAddListElementRecords() != null) {
        conf.setBoolean(AvroSchemaConverter.ADD_LIST_ELEMENT_RECORDS,
                parquetConfig.getAvroAddListElementRecords().booleanValue());
    }

    if (parquetConfig.getAvroWriteOldListStructure() != null) {
        conf.setBoolean(AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE,
                parquetConfig.getAvroWriteOldListStructure().booleanValue());
    }
}
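
The three keys above are ordinary Hadoop Configuration properties, so the configured Configuration is simply handed to the Avro Parquet reader/writer builders. The snippet below is a usage sketch under that assumption (the path, schema variable, and chosen flag values are placeholders, not NiFi code):

Configuration conf = new Configuration();
conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, true);
conf.setBoolean(AvroSchemaConverter.ADD_LIST_ELEMENT_RECORDS, false);
conf.setBoolean(AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE, false);

// Placeholder path and schema; the flags take effect through the Configuration.
try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
    .<GenericRecord>builder(new Path("/tmp/example.parquet"))
    .withSchema(schema)
    .withConf(conf)
    .build()) {
  // writer.write(record);
}
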
 
Example #4
Source File: ParquetUtils.java    From incubator-pinot with Apache License 2.0
/**
 * Returns the schema for the given Parquet file path.
 */
public static Schema getParquetSchema(Path path)
    throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(getConfiguration(), path, ParquetMetadataConverter.NO_FILTER);
  Map<String, String> metaData = footer.getFileMetaData().getKeyValueMetaData();
  String schemaString = metaData.get("parquet.avro.schema");
  if (schemaString == null) {
    // Try the older property
    schemaString = metaData.get("avro.schema");
  }
  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
}
 
Example #5
Source File: Schemas.java    From parquet-mr with Apache License 2.0
public static Schema fromParquet(Configuration conf, URI location) throws IOException {
  Path path = new Path(location);
  FileSystem fs = path.getFileSystem(conf);

  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
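
ParquetFileReader.readFooter, used in Examples #2, #4, and #5, is deprecated in more recent parquet-mr releases. Below is a sketch of the same schema lookup through ParquetFileReader.open (assuming a Configuration conf and a Path path are in scope); the key-value metadata keys are the same ones checked above.

try (ParquetFileReader reader =
         ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
  FileMetaData meta = reader.getFooter().getFileMetaData();
  String schemaString = meta.getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = meta.getKeyValueMetaData().get("avro.schema");
  }
  Schema avroSchema = (schemaString != null)
      ? new Schema.Parser().parse(schemaString)
      : new AvroSchemaConverter().convert(meta.getSchema());
  // use avroSchema ...
}
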
 
Example #6
Source File: TestParquetUtils.java    From hudi with Apache License 2.0
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema,
    boolean addPartitionPathField, String partitionPath) throws Exception {
  // Write out a parquet file
  BloomFilter filter = BloomFilterFactory
      .createBloomFilter(1000, 0.0001, 10000, typeCode);
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    if (addPartitionPathField) {
      rec.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath);
    }
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
 
Example #7
Source File: TestHoodieAvroWriteSupport.java    From hudi with Apache License 2.0
@Test
public void testAddKey(@TempDir java.nio.file.Path tempDir) throws IOException {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }
  String filePath = tempDir.resolve("test.parquet").toAbsolutePath().toString();
  Schema schema = HoodieAvroUtils.getRecordKeySchema();
  BloomFilter filter = BloomFilterFactory.createBloomFilter(
      1000, 0.0001, 10000,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
      new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
 
Example #8
Source File: TableSchemaResolver.java    From hudi with Apache License 2.0
/**
 * Reads the Parquet schema from the last data block of the log file at the given path.
 *
 * @return the schema of the last data block, or null if the log file contains no data blocks
 */
public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws IOException {
  Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null);
  HoodieDataBlock lastBlock = null;
  while (reader.hasNext()) {
    HoodieLogBlock block = reader.next();
    if (block instanceof HoodieDataBlock) {
      lastBlock = (HoodieDataBlock) block;
    }
  }
  reader.close();
  if (lastBlock != null) {
    return new AvroSchemaConverter().convert(lastBlock.getSchema());
  }
  return null;
}
 
Example #9
Source File: HiveTestUtil.java    From hudi with Apache License 2.0
@SuppressWarnings({"unchecked", "deprecation"})
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
    throws IOException, URISyntaxException {
  Schema schema = getTestDataSchema(isParquetSchemaSimple);
  org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
  BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
  ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024,
      ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
      ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());

  List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
      : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
  testRecords.forEach(s -> {
    try {
      writer.write(s);
    } catch (IOException e) {
      fail("IOException while writing test records as parquet" + e.toString());
    }
  });
  writer.close();
}
 
Example #10
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testReadNestedGroup() throws IOException {
	Schema schema = unWrapSchema(NESTED_SCHEMA.getField("bar").schema());
	GenericData.Record barRecord = new GenericRecordBuilder(schema)
		.set("spam", 31L).build();

	GenericData.Record record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("bar", barRecord)
		.build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());
	assertEquals(32L, row.getField(0));
	assertEquals(31L, ((Row) row.getField(2)).getField(0));
	assertTrue(rowReader.reachEnd());
}
 
Example #11
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testReadMultipleSimpleGroup() throws IOException {
	Long[] array = {1L};

	List<IndexedRecord> records = new ArrayList<>();
	for (int i = 0; i < 100; i++) {
		GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
			.set("bar", "test")
			.set("foo", i)
			.set("arr", array).build();
		records.add(record);
	}

	Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, records);
	MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	for (long i = 0; i < 100; i++) {
		assertFalse(rowReader.reachEnd());
		Row row = rowReader.nextRecord();
		assertEquals(3, row.getArity());
		assertEquals(i, row.getField(0));
		assertEquals("test", row.getField(1));
		assertArrayEquals(array, (Long[]) row.getField(2));
	}

	assertTrue(rowReader.reachEnd());
}
 
Example #12
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testMapGroup() throws IOException {
	Preconditions.checkState(unWrapSchema(NESTED_SCHEMA.getField("spamMap").schema())
		.getType().equals(Schema.Type.MAP));
	ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
	map.put("testKey", "testValue");

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("spamMap", map.build())
		.build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(32L, row.getField(0));
	Map<?, ?> result = (Map<?, ?>) row.getField(1);
	assertEquals(result.get("testKey").toString(), "testValue");
	assertTrue(rowReader.reachEnd());
}
 
Example #13
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testNestedMapGroup() throws IOException {
	Schema nestedMapSchema = unWrapSchema(NESTED_SCHEMA.getField("nestedMap").schema());
	Preconditions.checkState(nestedMapSchema.getType().equals(Schema.Type.MAP));

	Schema mapValueSchema = nestedMapSchema.getValueType();
	GenericRecord mapValue = new GenericRecordBuilder(mapValueSchema)
		.set("type", "nested")
		.set("value", "nested_value").build();

	ImmutableMap.Builder<String, GenericRecord> map = ImmutableMap.builder();
	map.put("testKey", mapValue);

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("nestedMap", map.build())
		.set("foo", 34L).build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(34L, row.getField(0));
	Map<?, ?> result = (Map<?, ?>) row.getField(5);

	Row nestedRow = (Row) result.get("testKey");
	assertEquals("nested", nestedRow.getField(0));
	assertEquals("nested_value", nestedRow.getField(1));
}
 
Example #14
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testReadSimpleGroup() throws IOException {
	Long[] array = {1L};
	GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
		.set("bar", "test")
		.set("foo", 32L)
		.set("arr", array).build();

	Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(3, row.getArity());
	assertEquals(32L, row.getField(0));
	assertEquals("test", row.getField(1));
	assertArrayEquals(array, (Long[]) row.getField(2));
	assertTrue(rowReader.reachEnd());
}
 
Example #15
Source File: HoodieClientTestUtils.java    From hudi with Apache License 2.0
public static String writeParquetFile(String basePath, String partitionPath, String filename,
                                      List<HoodieRecord> records, Schema schema, BloomFilter filter, boolean createCommitTime) throws IOException {

  if (filter == null) {
    filter = BloomFilterFactory
        .createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
  }
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  String instantTime = FSUtils.getCommitTime(filename);
  HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
      ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
      HoodieTestUtils.getDefaultHadoopConf(), Double.valueOf(HoodieStorageConfig.DEFAULT_STREAM_COMPRESSION_RATIO));
  HoodieParquetWriter writer =
      new HoodieParquetWriter(instantTime, new Path(basePath + "/" + partitionPath + "/" + filename), config,
              schema, new SparkTaskContextSupplier());
  int seqId = 1;
  for (HoodieRecord record : records) {
    GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
    HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, instantTime, "" + seqId++);
    HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename);
    writer.writeAvro(record.getRecordKey(), avroRecord);
    filter.add(record.getRecordKey());
  }
  writer.close();

  if (createCommitTime) {
    HoodieTestUtils.createMetadataFolder(basePath);
    HoodieTestUtils.createCommitFiles(basePath, instantTime);
  }
  return filename;
}
 
Example #16
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testArrayGroup() throws IOException {
	Schema arraySchema = unWrapSchema(NESTED_SCHEMA.getField("arr").schema());
	Preconditions.checkState(arraySchema.getType().equals(Schema.Type.ARRAY));

	List<Long> arrayData = new ArrayList<>();
	arrayData.add(1L);
	arrayData.add(1000L);

	List<String> arrayString = new ArrayList<>();
	arrayString.add("abcd");

	@SuppressWarnings("unchecked")
	GenericData.Array array = new GenericData.Array(arraySchema, arrayData);

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("arr", array)
		.set("strArray", arrayString)
		.build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(32L, row.getField(0));
	Long[] result = (Long[]) row.getField(3);
	assertEquals(1L, result[0].longValue());
	assertEquals(1000L, result[1].longValue());

	String[] strResult = (String[]) row.getField(4);
	assertEquals("abcd", strResult[0]);
}
 
Example #17
Source File: ParquetRecordReaderTest.java    From flink with Apache License 2.0
@Test
public void testNestedArrayGroup() throws IOException {
	Schema nestedArraySchema = unWrapSchema(NESTED_SCHEMA.getField("nestedArray").schema());
	Preconditions.checkState(nestedArraySchema.getType().equals(Schema.Type.ARRAY));

	Schema arrayItemSchema = nestedArraySchema.getElementType();
	GenericRecord item = new GenericRecordBuilder(arrayItemSchema)
		.set("type", "nested")
		.set("value", 1L).build();

	ImmutableList.Builder<GenericRecord> list = ImmutableList.builder();
	list.add(item);

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("nestedArray", list.build())
		.set("foo", 34L).build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(34L, row.getField(0));
	Object[] result = (Object[]) row.getField(6);

	assertEquals(1, result.length);

	Row nestedRow = (Row) result[0];
	assertEquals("nested", nestedRow.getField(0));
	assertEquals(1L, nestedRow.getField(1));
}
 
Example #18
Source File: ParquetUtils.java    From hudi with Apache License 2.0
public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) {
  return new AvroSchemaConverter().convert(readSchema(configuration, parquetFilePath));
}
 
Example #19
Source File: ParquetReader.java    From reef with Apache License 2.0
/**
 * Retrieves the Avro schema from a Parquet file.
 * @param configuration Hadoop configuration.
 * @param filter Metadata filter applied when reading the Parquet footer.
 * @return the Avro schema converted from the Parquet file's schema.
 * @throws IOException if the Avro schema couldn't be parsed from the Parquet file.
 */
private Schema createAvroSchema(final Configuration configuration, final MetadataFilter filter) throws IOException {
  final ParquetMetadata footer = ParquetFileReader.readFooter(configuration, parquetFilePath, filter);
  final AvroSchemaConverter converter = new AvroSchemaConverter();
  final MessageType schema = footer.getFileMetaData().getSchema();
  return converter.convert(schema);
}
 
Example #20
Source File: TableSchemaResolver.java    From hudi with Apache License 2.0
/**
 * Convert a Parquet schema to the Avro format.
 *
 * @param parquetSchema The Parquet schema to convert
 * @return The converted Avro schema
 */
public Schema convertParquetSchemaToAvro(MessageType parquetSchema) {
  AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(metaClient.getHadoopConf());
  return avroSchemaConverter.convert(parquetSchema);
}
 
Example #21
Source File: TableSchemaResolver.java    From hudi with Apache License 2.0
/**
 * Convert an Avro schema to the Parquet format.
 *
 * @param schema The Avro schema to convert
 * @return The converted Parquet schema
 */
public MessageType convertAvroSchemaToParquet(Schema schema) {
  AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(metaClient.getHadoopConf());
  return avroSchemaConverter.convert(schema);
}