org.apache.parquet.avro.AvroParquetWriter Java Examples

The following examples show how to use org.apache.parquet.avro.AvroParquetWriter. They are drawn from open source projects; the source file and originating project are noted above each example.
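All of the examples below follow the same basic pattern: obtain a builder from AvroParquetWriter, set the Avro schema (plus any Hadoop configuration or tuning options), build the writer, write GenericRecord instances, and close it. Here is a minimal, self-contained sketch of that pattern; the schema and output path are placeholders rather than code from any project below.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;

public class AvroParquetWriterSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder schema: a record with one string and one int field.
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
                + "{\"name\":\"name\",\"type\":\"string\"},"
                + "{\"name\":\"favorite_number\",\"type\":\"int\"}]}");

        // Build the writer, write a record, and let try-with-resources close it.
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
                .<GenericRecord>builder(new Path("/tmp/users.parquet"))
                .withSchema(schema)
                .withConf(new Configuration())
                .build()) {
            GenericRecord user = new GenericData.Record(schema);
            user.put("name", "Alice");
            user.put("favorite_number", 7);
            writer.write(user);
        }
    }
}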
Example #1
Source File: IntegrationTestHelper.java    From circus-train with Apache License 2.0
URI createData(
    URI tableUri,
    Schema schema,
    String hour,
    int id,
    String fieldName,
    Object data) throws IOException {
  GenericData.Record record = new GenericData.Record(schema);
  record.put("id", id);

  if (fieldName != null) {
    Schema.Field field = schema.getField(fieldName);
    Schema fieldSchema = field.schema();
    if (data instanceof Map) {
      GenericData.Record schemaRecord = new GenericData.Record(fieldSchema);
      ((Map<String, String>) data).forEach(schemaRecord::put);
      record.put(fieldName, schemaRecord);
    } else if (data != null) {
      record.put(fieldName, data);
    }
  }

  URI partition = URI.create(tableUri + "/hour=" + hour);
  String path = partition.getPath();
  File parentFolder = new File(path);
  parentFolder.mkdirs();
  File partitionFile = new File(parentFolder, "parquet0000");
  Path filePath = new Path(partitionFile.toURI());
  ParquetWriter<GenericData.Record> writer = AvroParquetWriter.<GenericData.Record>builder(filePath)
      .withSchema(schema)
      .withConf(new Configuration())
      .build();

  try {
    writer.write(record);
  } finally {
    writer.close();
  }
  return partition;
}
 
Example #2
Source File: ConvertAvroToParquet.java    From nifi with Apache License 2.0
private ParquetWriter<GenericRecord> createParquetWriter(final ProcessContext context, final FlowFile flowFile, final OutputStream out, final Schema schema)
        throws IOException {

    NifiParquetOutputFile nifiParquetOutputFile = new NifiParquetOutputFile(out);

    final AvroParquetWriter.Builder<GenericRecord> parquetWriter = AvroParquetWriter
            .<GenericRecord>builder(nifiParquetOutputFile)
            .withSchema(schema);

    final ParquetConfig parquetConfig = createParquetConfig(context, flowFile.getAttributes());
    parquetConfig.setAvroReadCompatibility(true);
    parquetConfig.setAvroAddListElementRecords(false);
    parquetConfig.setAvroWriteOldListStructure(false);

    final Configuration conf = new Configuration();
    applyCommonConfig(parquetWriter, conf, parquetConfig);

    return parquetWriter.build();
}
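The NifiParquetOutputFile used above is not shown on this page; it adapts a plain java.io.OutputStream to Parquet's org.apache.parquet.io.OutputFile abstraction, which AvroParquetWriter.builder(...) accepts as an alternative to a Hadoop Path (parquet 1.10+). Below is a minimal sketch of such an adapter, written against the published OutputFile and PositionOutputStream contracts; the class name and details are illustrative, not NiFi's actual implementation.

import java.io.IOException;
import java.io.OutputStream;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.PositionOutputStream;

// Hypothetical stand-in for NifiParquetOutputFile: wraps an OutputStream
// and tracks the byte position, which Parquet needs for its footer offsets.
public class StreamOutputFile implements OutputFile {
    private final OutputStream out;

    public StreamOutputFile(OutputStream out) {
        this.out = out;
    }

    @Override
    public PositionOutputStream create(long blockSizeHint) {
        return new PositionOutputStream() {
            private long pos = 0;

            @Override
            public long getPos() {
                return pos;
            }

            @Override
            public void write(int b) throws IOException {
                out.write(b);
                pos++;
            }

            @Override
            public void write(byte[] b, int off, int len) throws IOException {
                out.write(b, off, len);
                pos += len;
            }

            @Override
            public void flush() throws IOException {
                out.flush();
            }

            @Override
            public void close() throws IOException {
                out.close();
            }
        };
    }

    @Override
    public PositionOutputStream createOrOverwrite(long blockSizeHint) {
        return create(blockSizeHint);
    }

    @Override
    public boolean supportsBlockSize() {
        return false; // stream-backed, no real block size
    }

    @Override
    public long defaultBlockSize() {
        return 0;
    }
}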
 
Example #3
Source File: FetchParquetTest.java    From nifi with Apache License 2.0
private void writeParquetUsersWithDecimal(final File parquetFile, int numUsers) throws IOException {
    if (parquetFile.exists()) {
        Assert.assertTrue(parquetFile.delete());
    }

    final BigDecimal initialAmount = new BigDecimal("1234567.0123456789");
    final AvroParquetWriter.Builder<GenericRecord> writerBuilder = createAvroParquetWriter(parquetFile, schemaWithDecimal);

    final List<Schema> amountSchemaUnion = schemaWithDecimal.getField("amount").schema().getTypes();
    final Schema amountSchema = amountSchemaUnion.stream().filter(s -> s.getType() == Schema.Type.FIXED).findFirst().orElse(null);
    Assert.assertNotNull(amountSchema);

    final Conversions.DecimalConversion decimalConversion = new Conversions.DecimalConversion();

    try (final ParquetWriter<GenericRecord> writer = writerBuilder.build()) {
        for (int i=0; i < numUsers; i++) {
            final BigDecimal incrementedAmount = initialAmount.add(new BigDecimal("1"));
            final GenericRecord user = new GenericData.Record(schemaWithDecimal);
            user.put("name", "Bob" + i);
            user.put("amount", decimalConversion.toFixed(incrementedAmount, amountSchema, amountSchema.getLogicalType()));

            writer.write(user);
        }
    }
}
 
Example #4
Source File: ParquetFileReaderTest.java    From kafka-connect-fs with Apache License 2.0
@Override
protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException {
    FileSystem fs = fsConfig.getFs();
    File parquetFile = File.createTempFile("test-", "." + getFileExtension());

    try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(new Path(parquetFile.toURI()))
            .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) {
        IntStream.range(0, NUM_RECORDS).forEach(index -> {
            GenericRecord datum = new GenericData.Record(readerSchema);
            datum.put(FIELD_INDEX, index);
            String uuid = UUID.randomUUID().toString();
            datum.put(FIELD_NAME, String.format("%d_name_%s", index, uuid));
            datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, uuid));
            try {
                fsConfig.offsetsByIndex().put(index, (long) index);
                writer.write(datum);
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }
        });
    }
    Path path = new Path(new Path(fsConfig.getFsUri()), parquetFile.getName());
    fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path);
    return path;
}
 
Example #5
Source File: InputFormatTestUtil.java    From hudi with Apache License 2.0
public static void simulateParquetUpdates(File directory, Schema schema, String originalCommit,
    int totalNumberOfRecords, int numberOfRecordsToUpdate, String newCommit) throws IOException {
  File fileToUpdate = Objects.requireNonNull(directory.listFiles((dir, name) -> name.endsWith("parquet")))[0];
  String fileId = FSUtils.getFileId(fileToUpdate.getName());
  File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId));
  try (AvroParquetWriter<GenericRecord> parquetWriter = new AvroParquetWriter<>(new Path(dataFile.getAbsolutePath()), schema)) {
    for (GenericRecord record : generateAvroRecords(schema, totalNumberOfRecords, originalCommit, fileId)) {
      if (numberOfRecordsToUpdate > 0) {
        // update this record
        record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, newCommit);
        String oldSeqNo = (String) record.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD);
        record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, oldSeqNo.replace(originalCommit, newCommit));
        numberOfRecordsToUpdate--;
      }
      parquetWriter.write(record);
    }
  }
}
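Note that the Path-and-Schema AvroParquetWriter constructor used here has been deprecated in favor of the builder API in recent parquet-avro releases. A roughly equivalent builder-based version of the same write loop (a sketch, not the Hudi code) would be:

try (ParquetWriter<GenericRecord> parquetWriter = AvroParquetWriter
    .<GenericRecord>builder(new Path(dataFile.getAbsolutePath()))
    .withSchema(schema)
    .build()) {
  // ... same update-and-write loop as above
}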
 
Example #6
Source File: TestHDFSParquetImporter.java    From hudi with Apache License 2.0
public List<GenericRecord> createInsertRecords(Path srcFolder) throws ParseException, IOException {
  Path srcFile = new Path(srcFolder.toString(), "file1.parquet");
  long startTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000;
  List<GenericRecord> records = new ArrayList<GenericRecord>();
  for (long recordNum = 0; recordNum < 96; recordNum++) {
    records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "rider-" + recordNum,
        "driver-" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum)));
  }
  try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(srcFile)
      .withSchema(HoodieTestDataGenerator.AVRO_SCHEMA).withConf(HoodieTestUtils.getDefaultHadoopConf()).build()) {
    for (GenericRecord record : records) {
      writer.write(record);
    }
  }
  return records;
}
 
Example #7
Source File: TestHDFSParquetImporter.java    From hudi with Apache License 2.0
public List<GenericRecord> createUpsertRecords(Path srcFolder) throws ParseException, IOException {
  Path srcFile = new Path(srcFolder.toString(), "file1.parquet");
  long startTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000;
  List<GenericRecord> records = new ArrayList<GenericRecord>();
  // records 0 through 10 (11 in total) update existing rows
  for (long recordNum = 0; recordNum < 11; recordNum++) {
    records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "rider-upsert-" + recordNum,
        "driver-upsert" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum)));
  }
  // 4 for insert
  for (long recordNum = 96; recordNum < 100; recordNum++) {
    records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "rider-upsert-" + recordNum,
        "driver-upsert" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum)));
  }
  try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(srcFile)
      .withSchema(HoodieTestDataGenerator.AVRO_SCHEMA).withConf(HoodieTestUtils.getDefaultHadoopConf()).build()) {
    for (GenericRecord record : records) {
      writer.write(record);
    }
  }
  return records;
}
 
Example #8
Source File: ConvertCsvToParquetFileExpressionProcessor.java    From vividus with Apache License 2.0
private void write(File file, String avroSchemaPath, List<Map<String, String>> data) throws IOException
{
    Schema schema = new Parser().parse(ResourceUtils.loadResource(avroSchemaPath));
    try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
            .<GenericRecord>builder(new Path(file.toURI()))
            .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
            .withDataModel(GenericData.get())
            .withSchema(schema)
            .build())
    {
        for (Map<String, String> map : data)
        {
            GenericRecord record = new GenericData.Record(schema);
            map.forEach(record::put);
            writer.write(record);
        }
    }
}
 
Example #9
Source File: FetchParquetTest.java    From nifi with Apache License 2.0
private void writeParquetUsersWithArray(final File parquetFile, int numUsers) throws IOException {
    if (parquetFile.exists()) {
        Assert.assertTrue(parquetFile.delete());
    }

    final AvroParquetWriter.Builder<GenericRecord> writerBuilder = createAvroParquetWriter(parquetFile, schemaWithArray);

    final Schema favoriteColorsSchema = schemaWithArray.getField("favorite_colors").schema();

    try (final ParquetWriter<GenericRecord> writer = writerBuilder.build()) {
        for (int i=0; i < numUsers; i++) {
            final GenericRecord user = new GenericData.Record(schemaWithArray);
            user.put("name", "Bob" + i);
            user.put("favorite_number", i);

            final GenericData.Array<String> colors = new GenericData.Array<>(1, favoriteColorsSchema);
            colors.add("blue" + i);

            user.put("favorite_colors", colors);

            writer.write(user);
        }
    }
}
 
Example #10
Source File: ParquetIO.java    From beam with Apache License 2.0
@Override
public void open(WritableByteChannel channel) throws IOException {
  checkNotNull(getJsonSchema(), "Schema cannot be null");

  Schema schema = new Schema.Parser().parse(getJsonSchema());

  BeamParquetOutputFile beamParquetOutputFile =
      new BeamParquetOutputFile(Channels.newOutputStream(channel));

  AvroParquetWriter.Builder<GenericRecord> builder =
      AvroParquetWriter.<GenericRecord>builder(beamParquetOutputFile)
          .withSchema(schema)
          .withCompressionCodec(getCompressionCodec())
          .withWriteMode(OVERWRITE);

  if (getConfiguration() != null) {
    builder = builder.withConf(getConfiguration().get());
  }

  this.writer = builder.build();
}
 
Example #11
Source File: ParquetPartition.java    From entrada with GNU General Public License v3.0
public ParquetPartition(String partition, Schema schema) {

    Configuration conf = new Configuration();
    Path file =
        new Path(partition + System.getProperty("file.separator") + UUID.randomUUID() + ".parquet");
    filename = file.toString();

    log.info("Create new parquet file: {}", filename);

    try {
      Files.createDirectories(Paths.get(partition));

      writer = AvroParquetWriter
          .<T>builder(file)
          .enableDictionaryEncoding()
          .withCompressionCodec(CompressionCodecName.SNAPPY)
          .withConf(conf)
          .withWriterVersion(WriterVersion.PARQUET_1_0)
          .withSchema(schema)
          .withRowGroupSize(ROWGROUP_SIZE)
          .build();
    } catch (IOException e) {
      throw new ApplicationException("Cannot create a Parquet partition", e);
    }
  }
 
Example #12
Source File: AvroParquetConvertCreator.java    From datacollector with Apache License 2.0
@Override
protected void addNecessaryJarsToJob(Configuration conf) {
  MapreduceUtils.addJarsToJob(conf,
      SemanticVersion.class,
      ParquetWriter.class,
      AvroParquetWriter.class,
      AvroParquetWriterBuilder190Int96.class,
      AvroSchemaConverter190Int96Avro18.class,
      FsInput.class,
      CompressionCodec.class,
      ParquetProperties.class,
      BytesInput.class,
      AvroToParquetConverterUtil.class,
      AvroLogicalTypeSupport.class
  );
}
 
Example #13
Source File: FetchParquetTest.java    From nifi with Apache License 2.0
private void writeParquetUsers(final File parquetFile, int numUsers) throws IOException {
    if (parquetFile.exists()) {
        Assert.assertTrue(parquetFile.delete());
    }

    final AvroParquetWriter.Builder<GenericRecord> writerBuilder = createAvroParquetWriter(parquetFile, schema);

    try (final ParquetWriter<GenericRecord> writer = writerBuilder.build()) {
        for (int i=0; i < numUsers; i++) {
            final GenericRecord user = new GenericData.Record(schema);
            user.put("name", "Bob" + i);
            user.put("favorite_number", i);
            user.put("favorite_color", "blue" + i);

            writer.write(user);
        }
    }
}
 
Example #14
Source File: FetchParquetTest.java    From nifi with Apache License 2.0
private void writeParquetUsersWithNullableArray(final File parquetFile, int numUsers) throws IOException {
    if (parquetFile.exists()) {
        Assert.assertTrue(parquetFile.delete());
    }

    final AvroParquetWriter.Builder<GenericRecord> writerBuilder = createAvroParquetWriter(parquetFile, schemaWithNullableArray);

    // Use schemaWithArray here just to get the array schema for the favorite_colors field;
    // the overall schemaWithNullableArray wraps that array schema in a union with null.
    final Schema favoriteColorsSchema = schemaWithArray.getField("favorite_colors").schema();

    try (final ParquetWriter<GenericRecord> writer = writerBuilder.build()) {
        for (int i=0; i < numUsers; i++) {
            final GenericRecord user = new GenericData.Record(schemaWithNullableArray);
            user.put("name", "Bob" + i);
            user.put("favorite_number", i);

            final GenericData.Array<String> colors = new GenericData.Array<>(1, favoriteColorsSchema);
            colors.add("blue" + i);

            user.put("favorite_colors", colors);

            writer.write(user);
        }
    }
}
 
Example #15
Source File: PutParquet.java    From nifi with Apache License 2.0
@Override
public HDFSRecordWriter createHDFSRecordWriter(final ProcessContext context, final FlowFile flowFile, final Configuration conf, final Path path, final RecordSchema schema)
        throws IOException, SchemaNotFoundException {

    final Schema avroSchema = AvroTypeUtil.extractAvroSchema(schema);

    final AvroParquetWriter.Builder<GenericRecord> parquetWriter = AvroParquetWriter
            .<GenericRecord>builder(path)
            .withSchema(avroSchema);

    final ParquetConfig parquetConfig = createParquetConfig(context, flowFile.getAttributes());
    applyCommonConfig(parquetWriter, conf, parquetConfig);

    return new AvroParquetHDFSRecordWriter(parquetWriter.build(), avroSchema);
}
 
Example #16
Source File: WriteParquetResult.java    From nifi with Apache License 2.0
public WriteParquetResult(final Schema schema, final OutputStream out, final ParquetConfig parquetConfig, final ComponentLog componentLogger) throws IOException {
    super(out);
    this.schema = schema;
    this.componentLogger = componentLogger;

    final Configuration conf = new Configuration();
    final OutputFile outputFile = new NifiParquetOutputFile(out);

    final AvroParquetWriter.Builder<GenericRecord> writerBuilder =
            AvroParquetWriter.<GenericRecord>builder(outputFile).withSchema(schema);
    applyCommonConfig(writerBuilder, conf, parquetConfig);
    parquetWriter = writerBuilder.build();
}
 
Example #17
Source File: FetchParquetTest.java    From nifi with Apache License 2.0
private AvroParquetWriter.Builder<GenericRecord> createAvroParquetWriter(final File parquetFile, final Schema schema) {
    final Path parquetPath = new Path(parquetFile.getPath());

    return AvroParquetWriter
            .<GenericRecord>builder(parquetPath)
            .withSchema(schema)
            .withConf(testConf);
}
 
Example #18
Source File: TestUtil.java    From flink with Apache License 2.0
public static Path createTempParquetFile(File folder, Schema schema, List<IndexedRecord> records) throws IOException {
	Path path = new Path(folder.getPath(), UUID.randomUUID().toString());
	ParquetWriter<IndexedRecord> writer = AvroParquetWriter.<IndexedRecord>builder(
		new org.apache.hadoop.fs.Path(path.toUri())).withSchema(schema).withRowGroupSize(10).build();

	for (IndexedRecord record : records) {
		writer.write(record);
	}

	writer.close();
	return path;
}
 
Example #19
Source File: TestParquetReader.java    From nifi with Apache License 2.0
private ParquetWriter<GenericRecord> createParquetWriter(final Schema schema, final File parquetFile) throws IOException {
    final Configuration conf = new Configuration();
    final Path parquetPath = new Path(parquetFile.getPath());

    final ParquetWriter<GenericRecord> writer =
            AvroParquetWriter.<GenericRecord>builder(parquetPath)
                    .withSchema(schema)
                    .withConf(conf)
                    .build();

    return writer;
}
 
Example #20
Source File: AvroParquetMorphlineTest.java    From kite with Apache License 2.0
@Test
public void testMapWithUtf8Key() throws Exception {
  Schema schema = new Schema.Parser().parse(new File("src/test/resources/test-avro-schemas/map.avsc"));

  File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
  tmp.deleteOnExit();
  tmp.delete();
  Path file = new Path(tmp.getPath());

  AvroParquetWriter<GenericRecord> writer = 
      new AvroParquetWriter<GenericRecord>(file, schema);

  // Write a record with a map with Utf8 keys.
  GenericData.Record record = new GenericRecordBuilder(schema)
      .set("mymap", new HashMap(ImmutableMap.of(utf8("a"), 1, utf8("b"), 2)))
      .build();
  writer.write(record);
  writer.close();

  for (String configFile : Arrays.asList(
      "readAvroParquetFile", 
      "readAvroParquetFileWithProjectionSchema", 
      "readAvroParquetFileWithReaderSchema1",
      "readAvroParquetFileWithReaderSchemaExternal"
      )) {
    morphline = createMorphline("test-morphlines/" + configFile);
    
    Record morphlineRecord = new Record();
    morphlineRecord.put(ReadAvroParquetFileBuilder.FILE_UPLOAD_URL, file.toString());
    collector.reset();
    
    assertTrue(morphline.process(morphlineRecord));

    assertEquals(1, collector.getRecords().size());
    GenericData.Record actualRecord = (GenericData.Record) collector.getFirstRecord().getFirstValue(Fields.ATTACHMENT_BODY);
    assertEquals(record, actualRecord);      
  }
}
 
Example #21
Source File: ParquetAppender.java    From kite with Apache License 2.0
@Override
public void open() throws IOException {
  CompressionCodecName codecName = CompressionCodecName.UNCOMPRESSED;
  if (enableCompression) {
    codecName = getCompressionCodecName();
  }
  avroParquetWriter = new AvroParquetWriter<E>(fileSystem.makeQualified(path),
      schema, codecName, DEFAULT_ROW_GROUP_SIZE,
      ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, conf);
}
 
Example #22
Source File: AvroParquetFileReaderWriterFactory.java    From secor with Apache License 2.0
public AvroParquetFileWriter(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
    Path path = new Path(logFilePath.getLogFilePath());
    LOG.debug("Creating Brand new Writer for path {}", path);
    CompressionCodecName codecName = CompressionCodecName
            .fromCompressionCodec(codec != null ? codec.getClass() : null);
    topic = logFilePath.getTopic();
    // Not setting blockSize, pageSize, enableDictionary, and validating
    writer = AvroParquetWriter.builder(path)
            .withSchema(schemaRegistry.getSchema(topic))
            .withCompressionCodec(codecName)
            .build();
}
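The comment above points out which settings are left at their defaults. If they needed to be pinned explicitly, the builder exposes each one directly; the following sketch uses the library defaults as placeholder values (the option methods are the same ones used in other examples on this page):

writer = AvroParquetWriter.builder(path)
        .withSchema(schemaRegistry.getSchema(topic))
        .withCompressionCodec(codecName)
        .withRowGroupSize(ParquetWriter.DEFAULT_BLOCK_SIZE)                  // blockSize
        .withPageSize(ParquetWriter.DEFAULT_PAGE_SIZE)                       // pageSize
        .withDictionaryEncoding(ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED) // enableDictionary
        .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)         // validating
        .build();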
 
Example #23
Source File: ParquetAvroWriters.java    From flink with Apache License 2.0
private static <T> ParquetWriter<T> createAvroParquetWriter(
		String schemaString,
		GenericData dataModel,
		OutputFile out) throws IOException {

	final Schema schema = new Schema.Parser().parse(schemaString);

	return AvroParquetWriter.<T>builder(out)
			.withSchema(schema)
			.withDataModel(dataModel)
			.build();
}
 
Example #24
Source File: AvroToParquetConverterUtil.java    From datacollector with Apache License 2.0
private static ParquetWriter.Builder getParquetWriterBuilder(Path tempFile, Schema avroSchema, Configuration conf) {
  // Parquet Avro pre-1.9 doesn't work with logical types, so in that case we use custom Builder that injects our own
  // avro schema -> parquet schema generator class (which is a copy of the one that was provided in PARQUET-358).
  // Additionally, Parquet Avro 1.9.x does not support converting from Avro timestamps (logical types TIMESTAMP_MILLIS
  // and TIMESTAMP_MICROS) and so we have to extend Parquet Avro classes to support timestamps conversion.
  ParquetWriter.Builder builder = null;
  try {
    SemanticVersion parquetVersion = SemanticVersion.parse(Version.VERSION_NUMBER);
    if(parquetVersion.major > 1 || (parquetVersion.major == 1 && parquetVersion.minor >= 9)) {
      if (parquetVersion.major == 1 && parquetVersion.minor >= 9) {
        LOG.debug("Creating AvroParquetWriterBuilder190Int96");
        if (propertyDefined(conf, AvroParquetConstants.TIMEZONE)) {
          String timeZoneId = conf.get(AvroParquetConstants.TIMEZONE);
          builder = new AvroParquetWriterBuilder190Int96(tempFile, timeZoneId).withSchema(avroSchema);
        } else {
          builder = new AvroParquetWriterBuilder190Int96(tempFile).withSchema(avroSchema);
        }
      } else {
        LOG.debug("Creating AvroParquetWriter.builder");
        builder = AvroParquetWriter.builder(tempFile).withSchema(avroSchema);
      }
    } else {
      LOG.debug("Creating AvroParquetWriterBuilder");
      builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
    }
  } catch (SemanticVersion.SemanticVersionParseException e) {
    LOG.warn("Can't parse parquet version string: " + Version.VERSION_NUMBER, e);
    builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
  }
  return builder;
}
 
Example #25
Source File: UtilitiesTestBase.java    From hudi with Apache License 2.0
public static void saveParquetToDFS(List<GenericRecord> records, Path targetFile) throws IOException {
  try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(targetFile)
      .withSchema(HoodieTestDataGenerator.AVRO_SCHEMA)
      .withConf(HoodieTestUtils.getDefaultHadoopConf())
      .withWriteMode(Mode.OVERWRITE)
      .build()) {
    for (GenericRecord record : records) {
      writer.write(record);
    }
  }
}
 
Example #26
Source File: TestParquetInLining.java    From hudi with Apache License 2.0
@Test
public void testSimpleInlineFileSystem() throws IOException {
  Path outerInMemFSPath = getRandomOuterInMemPath();
  Path outerPath = new Path(FILE_SCHEME + outerInMemFSPath.toString().substring(outerInMemFSPath.toString().indexOf(':')));
  generatedPath = outerPath;
  ParquetWriter<GenericRecord> inlineWriter = new AvroParquetWriter<>(outerInMemFSPath, HoodieTestDataGenerator.AVRO_SCHEMA,
      CompressionCodecName.GZIP, 100 * 1024 * 1024, 1024 * 1024, true, inMemoryConf);
  // write few records
  List<GenericRecord> recordsToWrite = getParquetHoodieRecords();
  for (GenericRecord rec : recordsToWrite) {
    inlineWriter.write(rec);
  }
  inlineWriter.close();
  byte[] inlineBytes = getBytesToInline(outerInMemFSPath);
  long startOffset = generateOuterFile(outerPath, inlineBytes);

  long inlineLength = inlineBytes.length;

  // Generate phantom inline file
  Path inlinePath = getPhantomFile(outerPath, startOffset, inlineLength);

  // instantiate Parquet reader
  ParquetReader<GenericRecord> inLineReader = AvroParquetReader.<GenericRecord>builder(inlinePath).withConf(inlineConf).build();
  List<GenericRecord> records = readParquetGenericRecords(inLineReader);
  assertArrayEquals(recordsToWrite.toArray(), records.toArray());
  inLineReader.close();
}
 
Example #27
Source File: DataLoad.java    From arvo2parquet with MIT License
private static ParquetWriter<GenericData.Record> createParquetWriterInstance(@Nonnull final Schema schema,
                                                                             @Nonnull final Path fileToWrite)
        throws IOException
{
  return AvroParquetWriter
          .<GenericData.Record>builder(nioPathToOutputFile(fileToWrite))
          .withRowGroupSize(256 * 1024 * 1024)
          .withPageSize(128 * 1024)
          .withSchema(schema)
          .withConf(new Configuration())
          .withCompressionCodec(CompressionCodecName.GZIP)
          .withValidation(false)
          .withDictionaryEncoding(false)
          .build();
}
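The nioPathToOutputFile(fileToWrite) call is a project-local helper that is not shown here; it presumably adapts a java.nio.file.Path to Parquet's OutputFile interface, along the lines of the stream-backed adapter sketched after Example #2.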
 
Example #28
Source File: ParquetAvroWriters.java    From Flink-CEPplus with Apache License 2.0
private static <T> ParquetWriter<T> createAvroParquetWriter(
		String schemaString,
		GenericData dataModel,
		OutputFile out) throws IOException {

	final Schema schema = new Schema.Parser().parse(schemaString);

	return AvroParquetWriter.<T>builder(out)
			.withSchema(schema)
			.withDataModel(dataModel)
			.build();
}