org.apache.parquet.hadoop.ParquetWriter Java Examples

The following examples show how to use org.apache.parquet.hadoop.ParquetWriter. Each example is drawn from the open-source project named in its header.
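Before the project-specific examples, here is a minimal, self-contained sketch of the builder-based API that most of them rely on. It is not taken from any of the projects below; the schema, output path, and field names are illustrative assumptions.

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class MinimalAvroParquetWriteExample {

  public static void main(String[] args) throws IOException {
    // Illustrative Avro schema; any record schema works the same way.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
            + "{\"name\":\"id\",\"type\":\"int\"},"
            + "{\"name\":\"name\",\"type\":\"string\"}]}");

    // Hypothetical output location; replace with a real local or HDFS path.
    Path file = new Path("/tmp/users.parquet");

    // try-with-resources closes the writer, which flushes row groups and writes the footer.
    try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(file)
        .withSchema(schema)
        .withConf(new Configuration())
        .withCompressionCodec(CompressionCodecName.SNAPPY)
        .build()) {
      GenericRecord record = new GenericData.Record(schema);
      record.put("id", 1);
      record.put("name", "alice");
      writer.write(record);
    }
  }
}

Several examples below predate this builder and use the older multi-argument ParquetWriter constructors (since deprecated) instead; both approaches produce the same Parquet file format.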
Example #1
Source File: IntegrationTestHelper.java    From circus-train with Apache License 2.0
URI createData(
    URI tableUri,
    Schema schema,
    String hour,
    int id,
    String fieldName,
    Object data) throws IOException {
  GenericData.Record record = new GenericData.Record(schema);
  record.put("id", id);

  if (fieldName != null) {
    Schema.Field field = schema.getField(fieldName);
    Schema fieldSchema = field.schema();
    if (data instanceof Map) {
      GenericData.Record schemaRecord = new GenericData.Record(fieldSchema);
      ((Map<String, String>) data).forEach(schemaRecord::put);
      record.put(fieldName, schemaRecord);
    } else if (data != null) {
      record.put(fieldName, data);
    }
  }

  URI partition = URI.create(tableUri + "/hour=" + hour);
  String path = partition.getPath();
  File parentFolder = new File(path);
  parentFolder.mkdirs();
  File partitionFile = new File(parentFolder, "parquet0000");
  Path filePath = new Path(partitionFile.toURI());
  ParquetWriter<GenericData.Record> writer = AvroParquetWriter.<GenericData.Record>builder(filePath)
      .withSchema(schema)
      .withConf(new Configuration())
      .build();

  try {
    writer.write(record);
  } finally {
    writer.close();
  }
  return partition;
}
 
Example #2
Source File: TestReadWrite.java    From parquet-mr with Apache License 2.0
@Test(expected=RuntimeException.class)
public void testMapRequiredValueWithNull() throws Exception {
  Schema schema = Schema.createRecord("record1", null, null, false);
  schema.setFields(Lists.newArrayList(
      new Schema.Field("mymap", Schema.createMap(Schema.create(Schema.Type.INT)), null, null)));

  Path file = new Path(createTempFile().getPath());

  try(ParquetWriter<GenericRecord> writer = AvroParquetWriter
      .<GenericRecord>builder(file)
      .withSchema(schema)
      .withConf(testConf)
      .build()) {

    // Write a record with a null value
    Map<String, Integer> map = new HashMap<String, Integer>();
    map.put("thirty-four", 34);
    map.put("eleventy-one", null);
    map.put("one-hundred", 100);

    GenericData.Record record = new GenericRecordBuilder(schema)
      .set("mymap", map).build();
    writer.write(record);
  }
}
 
Example #3
Source File: AvroTestUtil.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("unchecked")
public static <D> File write(TemporaryFolder temp, Configuration conf, GenericData model, Schema schema, D... data)
    throws IOException {
  File file = temp.newFile();
  Assert.assertTrue(file.delete());

  try (ParquetWriter<D> writer = AvroParquetWriter
    .<D>builder(new Path(file.toString()))
    .withDataModel(model)
    .withSchema(schema)
    .build()) {
    for (D datum : data) {
      writer.write(datum);
    }
  }

  return file;
}
 
Example #4
Source File: HiveTestUtil.java    From hudi with Apache License 2.0
@SuppressWarnings({"unchecked", "deprecation"})
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
    throws IOException, URISyntaxException {
  Schema schema = getTestDataSchema(isParquetSchemaSimple);
  org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
  BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
  ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024,
      ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
      ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());

  List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
      : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
  testRecords.forEach(s -> {
    try {
      writer.write(s);
    } catch (IOException e) {
      fail("IOException while writing test records as parquet" + e.toString());
    }
  });
  writer.close();
}
 
Example #5
Source File: ExaParquetWriterImpl.java    From hadoop-etl-udfs with MIT License
private ExaParquetWriterImpl(final MessageType schema,
                             final int numColumns,
                             final Configuration conf,
                             final Path path,
                             final String compressionType,
                             final ExaIterator exa,
                             final int firstColumnIndex,
                             final List<Integer> dynamicPartitionExaColNums) throws Exception {
    super(path,
            new TupleWriteSupport(schema, conf),
            CompressionCodecName.fromConf(compressionType),
            ParquetWriter.DEFAULT_BLOCK_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
            ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
            PARQUET_WRITER_VERSION,
            conf);

    System.out.println("Path: " + path.toString());
    System.out.println("Parquet schema:\n" + schema);

    // Create Tuple object with ExaIterator reference.
    this.row = new Tuple(exa, numColumns, firstColumnIndex, dynamicPartitionExaColNums);
}
 
Example #6
Source File: TestHoodieAvroWriteSupport.java    From hudi with Apache License 2.0
@Test
public void testAddKey(@TempDir java.nio.file.Path tempDir) throws IOException {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }
  String filePath = tempDir.resolve("test.parquet").toAbsolutePath().toString();
  Schema schema = HoodieAvroUtils.getRecordKeySchema();
  BloomFilter filter = BloomFilterFactory.createBloomFilter(
      1000, 0.0001, 10000,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
      new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
 
Example #7
Source File: ParquetFileReaderTest.java    From kafka-connect-fs with Apache License 2.0
@Override
protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException {
    FileSystem fs = fsConfig.getFs();
    File parquetFile = File.createTempFile("test-", "." + getFileExtension());

    try (ParquetWriter writer = AvroParquetWriter.<GenericRecord>builder(new Path(parquetFile.toURI()))
            .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) {
        IntStream.range(0, NUM_RECORDS).forEach(index -> {
            GenericRecord datum = new GenericData.Record(readerSchema);
            datum.put(FIELD_INDEX, index);
            String uuid = UUID.randomUUID().toString();
            datum.put(FIELD_NAME, String.format("%d_name_%s", index, uuid));
            datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, uuid));
            try {
                fsConfig.offsetsByIndex().put(index, (long) index);
                writer.write(datum);
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }
        });
    }
    Path path = new Path(new Path(fsConfig.getFsUri()), parquetFile.getName());
    fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path);
    return path;
}
 
Example #8
Source File: TestParquetUtils.java    From hudi with Apache License 2.0
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema, boolean addPartitionPathField, String partitionPath) throws Exception {
  // Write out a parquet file
  BloomFilter filter = BloomFilterFactory
      .createBloomFilter(1000, 0.0001, 10000, typeCode);
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    if (addPartitionPathField) {
      rec.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath);
    }
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
 
Example #9
Source File: HoodieParquetWriter.java    From hudi with Apache License 2.0
public HoodieParquetWriter(String instantTime, Path file, HoodieParquetConfig parquetConfig,
    Schema schema, SparkTaskContextSupplier sparkTaskContextSupplier) throws IOException {
  super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
      ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
      parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      ParquetWriter.DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
  this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
  this.fs =
      (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
  // We cannot accurately measure the snappy compressed output file size. We are choosing a
  // conservative 10%
  // TODO - compute this compression ratio dynamically by looking at the bytes written to the
  // stream and the actual file size reported by HDFS
  this.maxFileSize = parquetConfig.getMaxFileSize()
      + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
  this.writeSupport = parquetConfig.getWriteSupport();
  this.instantTime = instantTime;
  this.sparkTaskContextSupplier = sparkTaskContextSupplier;
}
 
Example #10
Source File: TestHDFSParquetImporter.java    From hudi with Apache License 2.0
public List<GenericRecord> createInsertRecords(Path srcFolder) throws ParseException, IOException {
  Path srcFile = new Path(srcFolder.toString(), "file1.parquet");
  long startTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000;
  List<GenericRecord> records = new ArrayList<GenericRecord>();
  for (long recordNum = 0; recordNum < 96; recordNum++) {
    records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "rider-" + recordNum,
        "driver-" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum)));
  }
  try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(srcFile)
      .withSchema(HoodieTestDataGenerator.AVRO_SCHEMA).withConf(HoodieTestUtils.getDefaultHadoopConf()).build()) {
    for (GenericRecord record : records) {
      writer.write(record);
    }
  }
  return records;
}
 
Example #11
Source File: TestHDFSParquetImporter.java    From hudi with Apache License 2.0
public List<GenericRecord> createUpsertRecords(Path srcFolder) throws ParseException, IOException {
  Path srcFile = new Path(srcFolder.toString(), "file1.parquet");
  long startTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000;
  List<GenericRecord> records = new ArrayList<GenericRecord>();
  // 10 for update
  for (long recordNum = 0; recordNum < 11; recordNum++) {
    records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "rider-upsert-" + recordNum,
        "driver-upsert" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum)));
  }
  // 4 for insert
  for (long recordNum = 96; recordNum < 100; recordNum++) {
    records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "rider-upsert-" + recordNum,
        "driver-upsert" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum)));
  }
  try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(srcFile)
      .withSchema(HoodieTestDataGenerator.AVRO_SCHEMA).withConf(HoodieTestUtils.getDefaultHadoopConf()).build()) {
    for (GenericRecord record : records) {
      writer.write(record);
    }
  }
  return records;
}
 
Example #12
Source File: TestColumnSizeCommand.java    From parquet-mr with Apache License 2.0
private String createParquetFile() throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, INT32, "Num"));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = randomParquetFile().getAbsolutePath();
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  Random rnd = new Random();
  try (ParquetWriter writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", rnd.nextLong());
      g.add("Num", rnd.nextInt());
      writer.write(g);
    }
  }

  return file;
}
 
Example #13
Source File: ProtoParquetWriterWithOffset.java    From garmadon with Apache License 2.0
/**
 * @param writer            The actual Proto + Parquet writer
 * @param temporaryHdfsPath The path to which the writer will output events
 * @param finalHdfsDir      The directory to write the final output to (renamed from temporaryHdfsPath)
 * @param fs                The filesystem on which both the temporary and final files reside
 * @param fileNamer         File-naming logic for the final path
 * @param dayStartTime      The day partition the final file will go to
 * @param eventName         Event name used for logging and monitoring
 */
public ProtoParquetWriterWithOffset(ParquetWriter<MESSAGE_KIND> writer, Path temporaryHdfsPath,
                                    Path finalHdfsDir, FileSystem fs, OffsetComputer fileNamer,
                                    LocalDateTime dayStartTime, String eventName,
                                    BiConsumer<String, String> protoMetadataWriter, int partition) {
    this.writer = writer;
    this.temporaryHdfsPath = temporaryHdfsPath;
    this.finalHdfsDir = finalHdfsDir;
    this.fs = fs;
    this.fileNamer = fileNamer;
    this.dayStartTime = dayStartTime;
    this.eventName = eventName;
    this.fsBlockSize = fs.getDefaultBlockSize(finalHdfsDir);
    this.protoMetadataWriter = protoMetadataWriter;
    this.partition = partition;

    initializeLatestCommittedTimestampGauge();
}
 
Example #14
Source File: FetchParquetTest.java    From nifi with Apache License 2.0
private void writeParquetUsersWithArray(final File parquetFile, int numUsers) throws IOException {
    if (parquetFile.exists()) {
        Assert.assertTrue(parquetFile.delete());
    }

    final AvroParquetWriter.Builder<GenericRecord> writerBuilder = createAvroParquetWriter(parquetFile, schemaWithArray);

    final Schema favoriteColorsSchema = schemaWithArray.getField("favorite_colors").schema();

    try (final ParquetWriter<GenericRecord> writer = writerBuilder.build()) {
        for (int i=0; i < numUsers; i++) {
            final GenericRecord user = new GenericData.Record(schema);
            user.put("name", "Bob" + i);
            user.put("favorite_number", i);


            final GenericData.Array<String> colors = new GenericData.Array<>(1, favoriteColorsSchema);
            colors.add("blue" + i);

            user.put("favorite_color", colors);

            writer.write(user);
        }
    }
}
 
Example #15
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
public static void writeAndTest(WriteContext context) throws IOException {
  // Create the configuration, and then apply the schema to our configuration.
  Configuration configuration = new Configuration();
  GroupWriteSupport.setSchema(context.schema, configuration);
  GroupWriteSupport groupWriteSupport = new GroupWriteSupport();

  // Create the writer properties
  final int blockSize = context.blockSize;
  final int pageSize = context.pageSize;
  final int dictionaryPageSize = pageSize;
  final boolean enableDictionary = context.enableDictionary;
  final boolean enableValidation = context.enableValidation;
  ParquetProperties.WriterVersion writerVersion = context.version;
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  ParquetWriter<Group> writer = new ParquetWriter<Group>(context.fsPath,
      groupWriteSupport, codec, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, enableValidation, writerVersion, configuration);

  context.write(writer);
  writer.close();

  context.test();

  context.path.delete();
}
 
Example #16
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
  for (int i = 0; i < values.length; i++) {
    writer.write(f.newGroup()
            .append("int32_field", values[i]));
  }
  writer.close();
  return file;
}
 
Example #17
Source File: ColumnSizeCommandTest.java    From parquet-mr with Apache License 2.0
private String createParquetFile() throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, INT32, "Num"));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = randomParquetFile().getAbsolutePath();
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  Random rnd = new Random();
  try (ParquetWriter writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", rnd.nextLong());
      g.add("Num", rnd.nextInt());
      writer.write(g);
    }
  }

  return file;
}
 
Example #18
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0
public static void write(ParquetWriter.Builder<Group, ?> builder, List<User> users) throws IOException {
  builder.config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());
  try (ParquetWriter<Group> writer = builder.build()) {
    for (User u : users) {
      writer.write(groupFromUser(u));
    }
  }
}
 
Example #19
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
@Override
public void write(ParquetWriter<Group> writer) throws IOException {
  for (int index = 0; index < recordCount; index++) {
    Group group = new SimpleGroup(super.schema);

    for (int column = 0, columnCnt = schema.getFieldCount(); column < columnCnt; ++column) {
      Type type = schema.getType(column);
      RandomValueGenerator<?> generator = randomGenerators.get(column);
      if (type.isRepetition(OPTIONAL) && generator.shouldGenerateNull()) {
        continue;
      }
      switch (type.asPrimitiveType().getPrimitiveTypeName()) {
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
      case INT96:
        group.append(type.getName(), ((RandomBinaryBase<?>) generator).nextBinaryValue());
        break;
      case INT32:
        group.append(type.getName(), (Integer) generator.nextValue());
        break;
      case INT64:
        group.append(type.getName(), (Long) generator.nextValue());
        break;
      case FLOAT:
        group.append(type.getName(), (Float) generator.nextValue());
        break;
      case DOUBLE:
        group.append(type.getName(), (Double) generator.nextValue());
        break;
      case BOOLEAN:
        group.append(type.getName(), (Boolean) generator.nextValue());
        break;
      }
    }
    writer.write(group);
  }
}
 
Example #20
Source File: ScroogeBinaryTest.java    From parquet-mr with Apache License 2.0
@Test
public void testScroogeBinaryEncoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}));

  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();

  Path path = new Path(temp.getPath());

  ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class));
  writer.write(expected);
  writer.close();

  // read using the parquet-thrift version to isolate the write path
  ParquetReader<org.apache.parquet.thrift.test.binary.StringAndBinary> reader = ThriftParquetReader.<org.apache.parquet.thrift.test.binary.StringAndBinary>
      build(path)
      .withThriftClass(org.apache.parquet.thrift.test.binary.StringAndBinary.class)
      .build();
  org.apache.parquet.thrift.test.binary.StringAndBinary record = reader.read();
  reader.close();

  Assert.assertEquals("String should match after serialization round trip",
      "test", record.s);
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b);
}
 
Example #21
Source File: TestReadWrite.java    From parquet-mr with Apache License 2.0
@Test
public void testNestedLists() throws Exception {
  Schema schema = new Schema.Parser().parse(
    Resources.getResource("nested_array.avsc").openStream());
  Path file = new Path(createTempFile().getPath());

  // Parquet writer
  ParquetWriter parquetWriter = AvroParquetWriter.builder(file).withSchema(schema)
    .withConf(testConf)
    .build();

  Schema innerRecordSchema = schema.getField("l1").schema().getTypes()
    .get(1).getElementType().getTypes().get(1);

  GenericRecord record = new GenericRecordBuilder(schema)
    .set("l1", Collections.singletonList(
      new GenericRecordBuilder(innerRecordSchema).set("l2", Collections.singletonList("hello")).build()
    ))
    .build();

  parquetWriter.write(record);
  parquetWriter.close();

  AvroParquetReader<GenericRecord> reader = new AvroParquetReader(testConf, file);
  GenericRecord nextRecord = reader.read();

  assertNotNull(nextRecord);
  assertNotNull(nextRecord.get("l1"));
  List l1List = (List) nextRecord.get("l1");
  assertNotNull(l1List.get(0));
  List l2List = (List) ((GenericRecord) l1List.get(0)).get("l2");
  assertEquals(str("hello"), l2List.get(0));
}
 
Example #22
Source File: AvroToParquetConverterUtil.java    From datacollector with Apache License 2.0
private static ParquetWriter.Builder getParquetWriterBuilder(Path tempFile, Schema avroSchema, Configuration conf) {
  // Parquet Avro pre-1.9 doesn't work with logical types, so in that case we use custom Builder that injects our own
  // avro schema -> parquet schema generator class (which is a copy of the one that was provided in PARQUET-358).
  // Additionally, Parquet Avro 1.9.x does not support converting from Avro timestamps (logical types TIMESTAMP_MILLIS
  // and TIMESTAMP_MICROS) and so we have to extend Parquet Avro classes to support timestamps conversion.
  ParquetWriter.Builder builder = null;
  try {
    SemanticVersion parquetVersion = SemanticVersion.parse(Version.VERSION_NUMBER);
    if(parquetVersion.major > 1 || (parquetVersion.major == 1 && parquetVersion.minor >= 9)) {
      if (parquetVersion.major == 1 && parquetVersion.minor >= 9) {
        LOG.debug("Creating AvroParquetWriterBuilder190Int96");
        if (propertyDefined(conf, AvroParquetConstants.TIMEZONE)) {
          String timeZoneId = conf.get(AvroParquetConstants.TIMEZONE);
          builder = new AvroParquetWriterBuilder190Int96(tempFile, timeZoneId).withSchema(avroSchema);
        } else {
          builder = new AvroParquetWriterBuilder190Int96(tempFile).withSchema(avroSchema);
        }
      } else {
        LOG.debug("Creating AvroParquetWriter.builder");
        builder = AvroParquetWriter.builder(tempFile).withSchema(avroSchema);
      }
    } else {
      LOG.debug("Creating AvroParquetWriterBuilder");
      builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
    }
  } catch (SemanticVersion.SemanticVersionParseException e) {
    LOG.warn("Can't parse parquet version string: " + Version.VERSION_NUMBER, e);
    builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
  }
  return builder;
}
 
Example #23
Source File: ParquetFileTest.java    From parquet-mr with Apache License 2.0
private void createTestParquetFile() throws IOException {
  File file = parquetFile();
  Path fsPath = new Path(file.getPath());
  Configuration conf = new Configuration();

  MessageType schema = createSchema();
  SimpleGroupFactory fact = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, conf);

  try (
    ParquetWriter<Group> writer = new ParquetWriter<>(
      fsPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      conf)) {
    for (int i = 0; i < 10; i++) {
      final byte[] bytes = new byte[12];
      ThreadLocalRandom.current().nextBytes(bytes);

      writer.write(fact.newGroup()
       .append(INT32_FIELD, 32 + i)
       .append(INT64_FIELD, 64L + i)
       .append(FLOAT_FIELD, 1.0f + i)
       .append(DOUBLE_FIELD, 2.0d + i)
       .append(BINARY_FIELD, Binary.fromString(COLORS[i % COLORS.length]))
       .append(FIXED_LEN_BYTE_ARRAY_FIELD,
         Binary.fromConstantByteArray(bytes)));
    }
  }
}
 
Example #24
Source File: ScroogeBinaryTest.java    From parquet-mr with Apache License 2.0
@Test
@SuppressWarnings("unchecked")
public void testScroogeBinaryDecoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}));

  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();

  Path path = new Path(temp.getPath());

  ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class));
  writer.write(expected);
  writer.close();

  Configuration conf = new Configuration();
  conf.set("parquet.thrift.converter.class", ScroogeRecordConverter.class.getName());
  ParquetReader<StringAndBinary> reader = ParquetReader.<StringAndBinary>
      builder(new ScroogeReadSupport(), path)
      .withConf(conf)
      .build();
  StringAndBinary record = reader.read();
  reader.close();

  Assert.assertEquals("String should match after serialization round trip",
      "test", record.s());
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b());
}
 
Example #25
Source File: TestSimpleRecordConverter.java    From parquet-mr with Apache License 2.0
private void createTestParquetFile() throws IOException {
  Path fsPath = new Path(testFile().getPath());
  Configuration conf = new Configuration();

  MessageType schema = createSchema();
  SimpleGroupFactory fact = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, conf);

  try (
    ParquetWriter<Group> writer = new ParquetWriter<>(
      fsPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      conf)) {
    writer.write(fact.newGroup()
     .append(INT32_FIELD, 32)
     .append(INT64_FIELD, 64L)
     .append(FLOAT_FIELD, 1.0f)
     .append(DOUBLE_FIELD, 2.0d)
     .append(BINARY_FIELD, Binary.fromString("foobar"))
     .append(FIXED_LEN_BYTE_ARRAY_FIELD,
       Binary.fromConstantByteArray(new byte[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 })));
  }
}
 
Example #26
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
private String createParquetFile(String prefix) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, INT64, "Backward"),
      new PrimitiveType(REPEATED, INT64, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  try (ParquetWriter writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", 1l);
      g.add("Name", "foo");
      g.add("Gender", "male");
      Group links = g.addGroup("Links");
      links.add(0, 2L);
      links.add(1, 3L);
      writer.write(g);
    }
  }

  return file;
}
 
Example #27
Source File: TestUtil.java    From flink with Apache License 2.0
public static Path createTempParquetFile(File folder, Schema schema, List<IndexedRecord> records) throws IOException {
	Path path = new Path(folder.getPath(), UUID.randomUUID().toString());
	ParquetWriter<IndexedRecord> writer = AvroParquetWriter.<IndexedRecord>builder(
		new org.apache.hadoop.fs.Path(path.toUri())).withSchema(schema).withRowGroupSize(10).build();

	for (IndexedRecord record : records) {
		writer.write(record);
	}

	writer.close();
	return path;
}
 
Example #28
Source File: ParquetAvroWriters.java    From flink with Apache License 2.0
private static <T> ParquetWriter<T> createAvroParquetWriter(
		String schemaString,
		GenericData dataModel,
		OutputFile out) throws IOException {

	final Schema schema = new Schema.Parser().parse(schemaString);

	return AvroParquetWriter.<T>builder(out)
			.withSchema(schema)
			.withDataModel(dataModel)
			.build();
}
 
Example #29
Source File: ParquetRowDataBuilder.java    From flink with Apache License 2.0
@Override
public ParquetWriter<RowData> createWriter(OutputFile out) throws IOException {
	Configuration conf = configuration.conf();
	return new ParquetRowDataBuilder(out, rowType, utcTimestamp)
			.withCompressionCodec(getParquetCompressionCodec(conf))
			.withRowGroupSize(getBlockSize(conf))
			.withPageSize(getPageSize(conf))
			.withDictionaryPageSize(getDictionaryPageSize(conf))
			.withMaxPaddingSize(conf.getInt(
					MAX_PADDING_BYTES, ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
			.withDictionaryEncoding(getEnableDictionary(conf))
			.withValidation(getValidation(conf))
			.withWriterVersion(getWriterVersion(conf))
			.withConf(conf).build();
}
 
Example #30
Source File: AvroParquetConvertMapper.java    From datacollector with Apache License 2.0
@Override
protected void initializeWriter(
    Path tempFile,
    Schema avroSchema,
    Configuration conf,
    Context context
) throws IOException {
  ParquetWriter.Builder builder = AvroToParquetConverterUtil.initializeWriter(tempFile, avroSchema, conf);

  // Parquet writer
  parquetWriter = builder
      .withConf(context.getConfiguration())
      .build();
}