Java Code Examples for org.apache.parquet.hadoop.ParquetWriter#Builder

The following examples show how to use org.apache.parquet.hadoop.ParquetWriter#Builder. You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AvroParquetConvertMapper.java    From datacollector with Apache License 2.0 5 votes vote down vote up
@Override
protected void initializeWriter(
    Path tempFile,
    Schema avroSchema,
    Configuration conf,
    Context context
) throws IOException {
  // Obtain a version-appropriate Avro-to-Parquet builder, override its Hadoop
  // configuration with the one from the MapReduce context, and open the writer.
  parquetWriter = AvroToParquetConverterUtil.initializeWriter(tempFile, avroSchema, conf)
      .withConf(context.getConfiguration())
      .build();
}
 
Example 2
Source File: AvroToParquetConverterUtil.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Picks a {@link ParquetWriter.Builder} implementation appropriate for the Parquet
 * library version on the classpath and pre-configures it with the Avro schema.
 *
 * Parquet Avro pre-1.9 doesn't work with logical types, so in that case we use a custom
 * Builder that injects our own avro schema -> parquet schema generator class (which is a
 * copy of the one that was provided in PARQUET-358). Additionally, Parquet Avro 1.9.x does
 * not support converting from Avro timestamps (logical types TIMESTAMP_MILLIS and
 * TIMESTAMP_MICROS) and so we have to extend Parquet Avro classes to support timestamps
 * conversion.
 */
private static ParquetWriter.Builder getParquetWriterBuilder(Path tempFile, Schema avroSchema, Configuration conf) {
  ParquetWriter.Builder builder;
  try {
    SemanticVersion parquetVersion = SemanticVersion.parse(Version.VERSION_NUMBER);
    // Flattened version dispatch (was a redundant nested check):
    //   1.9+            -> custom INT96-aware builder (optionally time-zone aware)
    //   2.x and later   -> stock AvroParquetWriter builder
    //   pre-1.9         -> legacy custom builder (PARQUET-358 workaround)
    if (parquetVersion.major == 1 && parquetVersion.minor >= 9) {
      LOG.debug("Creating AvroParquetWriterBuilder190Int96");
      if (propertyDefined(conf, AvroParquetConstants.TIMEZONE)) {
        String timeZoneId = conf.get(AvroParquetConstants.TIMEZONE);
        builder = new AvroParquetWriterBuilder190Int96(tempFile, timeZoneId).withSchema(avroSchema);
      } else {
        builder = new AvroParquetWriterBuilder190Int96(tempFile).withSchema(avroSchema);
      }
    } else if (parquetVersion.major > 1) {
      LOG.debug("Creating AvroParquetWriter.builder");
      builder = AvroParquetWriter.builder(tempFile).withSchema(avroSchema);
    } else {
      LOG.debug("Creating AvroParquetWriterBuilder");
      builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
    }
  } catch (SemanticVersion.SemanticVersionParseException e) {
    // Unknown version string: fall back to the most conservative (legacy) builder.
    // Parameterized logging avoids eager string concatenation; the throwable is
    // still recorded as the last argument per SLF4J convention.
    LOG.warn("Can't parse parquet version string: {}", Version.VERSION_NUMBER, e);
    builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
  }
  return builder;
}
 
Example 3
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes every user to Parquet as an example {@code Group}, using the supplied
 * pre-configured builder. The writer is closed automatically via try-with-resources.
 */
public static void write(ParquetWriter.Builder<Group, ?> builder, List<User> users) throws IOException {
  builder.config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());
  try (ParquetWriter<Group> parquetWriter = builder.build()) {
    for (User user : users) {
      parquetWriter.write(groupFromUser(user));
    }
  }
}
 
Example 4
Source File: WholeFileTransformerProcessor.java    From datacollector with Apache License 2.0 4 votes vote down vote up
/**
 * Convert Avro record to Parquet
 * @param sourceFileName the source Avro file name
 * @param fileReader the {@link org.apache.avro.file.DataFileStream} Avro file reader
 * @param tempParquetFile the {@link java.nio.file.Path} temporary parquet file path
 * @throws StageException if reading or writing fails (wraps the underlying IOException
 *         together with the number of records written so far)
 */
private void writeParquet(String sourceFileName, DataFileStream<GenericRecord> fileReader, Path tempParquetFile) throws StageException {
  long recordCount = 0;
  Schema schema = fileReader.getSchema();

  LOG.debug("Start reading input file : {}", sourceFileName);
  try {
    // Propagate the stage's Parquet tuning options to the writer via the job configuration.
    Configuration jobConfiguration = new Configuration();
    String compressionCodecName = compressionElEval.eval(variables, jobConfig.avroParquetConfig.compressionCodec, String.class);
    jobConfiguration.set(AvroParquetConstants.COMPRESSION_CODEC_NAME, compressionCodecName);
    jobConfiguration.setInt(AvroParquetConstants.ROW_GROUP_SIZE, jobConfig.avroParquetConfig.rowGroupSize);
    jobConfiguration.setInt(AvroParquetConstants.PAGE_SIZE, jobConfig.avroParquetConfig.pageSize);
    jobConfiguration.setInt(AvroParquetConstants.DICTIONARY_PAGE_SIZE, jobConfig.avroParquetConfig.dictionaryPageSize);
    jobConfiguration.setInt(AvroParquetConstants.MAX_PADDING_SIZE, jobConfig.avroParquetConfig.maxPaddingSize);

    // Parquet writer
    ParquetWriter.Builder builder = AvroToParquetConverterUtil.initializeWriter(
        new org.apache.hadoop.fs.Path(tempParquetFile.toString()),
        schema,
        jobConfiguration
    );
    parquetWriter = builder.build();

    // FIX: close() previously ran only on the happy path, leaking the writer
    // (and its open file handle) whenever write() threw. try/finally guarantees
    // the writer is closed; any IOException from close() is still handled by
    // the catch below.
    try {
      while (fileReader.hasNext()) {
        parquetWriter.write(fileReader.next());
        recordCount++;
      }
    } finally {
      parquetWriter.close();
    }

  } catch (IOException ex) {
    throw new TransformerStageCheckedException(
        Errors.CONVERT_08,
        sourceFileName,
        recordCount,
        ex
    );
  }
  LOG.debug("Finished writing {} records to {}", recordCount, tempParquetFile.getFileName());
}
 
Example 5
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
@Override
public <T> ParquetWriter.Builder<T, ?> configureBuilder(ParquetWriter.Builder<T, ?> builder) {
  // Cap pages at 1,000 rows; an effectively unlimited page size ensures the
  // row-count limit is the only criterion that ends a page.
  return builder
      .withPageRowCountLimit(1_000)
      .withPageSize(Integer.MAX_VALUE);
}
 
Example 6
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
@Override
public <T> ParquetWriter.Builder<T, ?> configureBuilder(ParquetWriter.Builder<T, ?> builder) {
  // Cap pages at 10,000 rows; an effectively unlimited page size ensures the
  // row-count limit is the only criterion that ends a page.
  return builder
      .withPageRowCountLimit(10_000)
      .withPageSize(Integer.MAX_VALUE);
}
 
Example 7
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
@Override
public <T> ParquetWriter.Builder<T, ?> configureBuilder(ParquetWriter.Builder<T, ?> builder) {
  // Cap pages at 50,000 rows; an effectively unlimited page size ensures the
  // row-count limit is the only criterion that ends a page.
  return builder
      .withPageRowCountLimit(50_000)
      .withPageSize(Integer.MAX_VALUE);
}
 
Example 8
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
@Override
public <T> ParquetWriter.Builder<T, ?> configureBuilder(ParquetWriter.Builder<T, ?> builder) {
  // Cap pages at 100,000 rows; an effectively unlimited page size ensures the
  // row-count limit is the only criterion that ends a page.
  return builder
      .withPageRowCountLimit(100_000)
      .withPageSize(Integer.MAX_VALUE);
}
 
Example 9
Source File: ParquetUtils.java    From nifi with Apache License 2.0 4 votes vote down vote up
/**
 * Applies the shared Parquet writer settings from {@code parquetConfig} to the given builder.
 *
 * <p>The Hadoop configuration and compression codec are always applied; every other
 * property is applied only when it was explicitly configured (non-null), otherwise the
 * builder's own defaults are left untouched.
 *
 * @param builder       the writer builder to configure (mutated in place)
 * @param conf          Hadoop configuration handed to the builder and to the overload below
 * @param parquetConfig source of the writer settings; getters return null for unset options
 */
public static void applyCommonConfig(final ParquetWriter.Builder<?, ?> builder, final Configuration conf,
                                     final ParquetConfig parquetConfig) {
    builder.withConf(conf);
    builder.withCompressionCodec(parquetConfig.getCompressionCodec());

    // Optional properties: each null check preserves the builder default when unset.

    if (parquetConfig.getRowGroupSize() != null){
        builder.withRowGroupSize(parquetConfig.getRowGroupSize());
    }

    if (parquetConfig.getPageSize() != null) {
        builder.withPageSize(parquetConfig.getPageSize());
    }

    if (parquetConfig.getDictionaryPageSize() != null) {
        builder.withDictionaryPageSize(parquetConfig.getDictionaryPageSize());
    }

    if (parquetConfig.getMaxPaddingSize() != null) {
        builder.withMaxPaddingSize(parquetConfig.getMaxPaddingSize());
    }

    if (parquetConfig.getEnableDictionaryEncoding() != null) {
        builder.withDictionaryEncoding(parquetConfig.getEnableDictionaryEncoding());
    }

    if (parquetConfig.getEnableValidation() != null) {
        builder.withValidation(parquetConfig.getEnableValidation());
    }

    if (parquetConfig.getWriterVersion() != null) {
        builder.withWriterVersion(parquetConfig.getWriterVersion());
    }

    if (parquetConfig.getWriterMode() != null) {
        builder.withWriteMode(parquetConfig.getWriterMode());
    }

    // Mirror the same settings into the Configuration for code paths that read
    // them from conf rather than from the builder.
    applyCommonConfig(conf, parquetConfig);
}
 
Example 10
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0 votes vote down vote up
/**
 * Hook for benchmark variants to apply their page/row-group settings to the writer builder.
 * Implementations return the (possibly same) builder with their configuration applied.
 */
<T> ParquetWriter.Builder<T, ?> configureBuilder(ParquetWriter.Builder<T, ?> builder);