org.apache.parquet.hadoop.api.WriteSupport Java Examples

The following examples show how to use org.apache.parquet.hadoop.api.WriteSupport. Each example is taken from an open-source project; the source file, project, and license are noted above it.
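
Before diving into the examples, here is a minimal sketch of what a custom WriteSupport implementation looks like. The PointWriteSupport class and its schema are hypothetical, invented for illustration; the WriteSupport and RecordConsumer calls follow the parquet-mr API used throughout the examples below.

import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Hypothetical example: writes int[] {x, y} records as a two-column Parquet file.
public class PointWriteSupport extends WriteSupport<int[]> {

  private static final MessageType SCHEMA = MessageTypeParser.parseMessageType(
      "message point { required int32 x; required int32 y; }");

  private RecordConsumer recordConsumer;

  @Override
  public WriteContext init(Configuration configuration) {
    // Declare the file schema and any extra key/value metadata for the footer.
    return new WriteContext(SCHEMA, new HashMap<String, String>());
  }

  @Override
  public void prepareForWrite(RecordConsumer recordConsumer) {
    this.recordConsumer = recordConsumer;
  }

  @Override
  public void write(int[] point) {
    recordConsumer.startMessage();
    recordConsumer.startField("x", 0);
    recordConsumer.addInteger(point[0]);
    recordConsumer.endField("x", 0);
    recordConsumer.startField("y", 1);
    recordConsumer.addInteger(point[1]);
    recordConsumer.endField("y", 1);
    recordConsumer.endMessage();
  }
}
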
Example #1
Source File: InternalParquetRecordWriter.java    From parquet-mr with Apache License 2.0
/**
 * @param parquetFileWriter the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param rowGroupSize the size of a block in the file (this will be approximate)
 * @param compressor the codec used to compress
 * @param validating if schema validation should be turned on
 * @param props parquet encoding properties
 */
public InternalParquetRecordWriter(
    ParquetFileWriter parquetFileWriter,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    long rowGroupSize,
    BytesCompressor compressor,
    boolean validating,
    ParquetProperties props) {
  this.parquetFileWriter = parquetFileWriter;
  this.writeSupport = Objects.requireNonNull(writeSupport, "writeSupport cannot be null");
  this.schema = schema;
  this.extraMetaData = extraMetaData;
  this.rowGroupSize = rowGroupSize;
  this.rowGroupSizeThreshold = rowGroupSize;
  this.nextRowGroupSize = rowGroupSizeThreshold;
  this.compressor = compressor;
  this.validating = validating;
  this.props = props;
  initStore();
}
 
Example #2
Source File: ParquetRecordWriter.java    From parquet-mr with Apache License 2.0
/**
 *
 * @param w the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param blockSize the size of a block in the file (this will be approximate)
 * @param pageSize the size of a page in the file (this will be approximate)
 * @param compressor the compressor used to compress the pages
 * @param dictionaryPageSize the threshold for dictionary size
 * @param enableDictionary to enable the dictionary
 * @param validating if schema validation should be turned on
 * @param writerVersion writer compatibility version
 * @param memoryManager memory manager for the write
 */
@Deprecated
public ParquetRecordWriter(
    ParquetFileWriter w,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    long blockSize, int pageSize,
    BytesCompressor compressor,
    int dictionaryPageSize,
    boolean enableDictionary,
    boolean validating,
    WriterVersion writerVersion,
    MemoryManager memoryManager) {
  ParquetProperties props = ParquetProperties.builder()
      .withPageSize(pageSize)
      .withDictionaryPageSize(dictionaryPageSize)
      .withDictionaryEncoding(enableDictionary)
      .withWriterVersion(writerVersion)
      .build();
  internalWriter = new InternalParquetRecordWriter<T>(w, writeSupport, schema,
      extraMetaData, blockSize, compressor, validating, props);
  this.memoryManager = Objects.requireNonNull(memoryManager, "memoryManager cannot be null");
  memoryManager.addWriter(internalWriter, blockSize);
  this.codecFactory = null;
}
 
Example #3
Source File: ParquetRecordWriter.java    From parquet-mr with Apache License 2.0
/**
 *
 * @param w the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param blockSize the size of a block in the file (this will be approximate)
 * @param codec the compression codec used to compress the pages
 * @param validating if schema validation should be turned on
 * @param props parquet encoding properties
 * @param memoryManager memory manager for the write
 * @param conf the Hadoop configuration
 */
ParquetRecordWriter(
    ParquetFileWriter w,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    long blockSize,
    CompressionCodecName codec,
    boolean validating,
    ParquetProperties props,
    MemoryManager memoryManager,
    Configuration conf) {
  this.codecFactory = new CodecFactory(conf, props.getPageSizeThreshold());
  internalWriter = new InternalParquetRecordWriter<T>(w, writeSupport, schema,
      extraMetaData, blockSize, codecFactory.getCompressor(codec), validating,
      props);
  this.memoryManager = Objects.requireNonNull(memoryManager, "memoryManager cannot be null");
  memoryManager.addWriter(internalWriter, blockSize);
}
 
Example #4
Source File: AvroParquetWriterBuilder190Int96.java    From datacollector with Apache License 2.0
protected WriteSupport<T> getWriteSupport(Configuration conf) {
  AvroLogicalTypeSupport avroLogicalTypeSupport = AvroLogicalTypeSupport.getAvroLogicalTypeSupport();
  if (avroLogicalTypeSupport.isLogicalTypeSupported()) {
    LOG.debug("Returning write support with converter = AvroSchemaConverter190Int96Avro18");
    return new AvroWriteSupportInt96Avro18<>(
        (new AvroSchemaConverter190Int96Avro18(conf)).convert(this.schema),
        this.schema,
        this.model,
        this.timeZoneId
    );
  } else {
    LOG.debug("Returning write support with converter = AvroSchemaConverter190Int96Avro17");
    return new AvroWriteSupportInt96Avro17<>(
        (new AvroSchemaConverter190Int96Avro17(conf)).convert(this.schema),
        this.schema,
        this.model,
        this.timeZoneId
    );
  }
}
 
Example #5
Source File: ParquetWriter.java    From parquet-mr with Apache License 2.0
/**
 * Create a new ParquetWriter.
 *
 * @param file the file to create
 * @param mode file creation mode
 * @param writeSupport the implementation to write a record to a RecordConsumer
 * @param compressionCodecName the compression codec to use
 * @param blockSize the block size threshold
 * @param pageSize the page size threshold
 * @param dictionaryPageSize the page size threshold for the dictionary pages
 * @param enableDictionary to turn dictionary encoding on
 * @param validating to turn on validation using the schema
 * @param writerVersion version of parquetWriter from {@link ParquetProperties.WriterVersion}
 * @param conf Hadoop configuration to use while accessing the filesystem
 * @throws IOException if there is an error while writing
 * @deprecated will be removed in 2.0.0
 */
@Deprecated
public ParquetWriter(
    Path file,
    ParquetFileWriter.Mode mode,
    WriteSupport<T> writeSupport,
    CompressionCodecName compressionCodecName,
    int blockSize,
    int pageSize,
    int dictionaryPageSize,
    boolean enableDictionary,
    boolean validating,
    WriterVersion writerVersion,
    Configuration conf) throws IOException {
  this(HadoopOutputFile.fromPath(file, conf),
      mode, writeSupport, compressionCodecName, blockSize,
      validating, conf, MAX_PADDING_SIZE_DEFAULT,
      ParquetProperties.builder()
          .withPageSize(pageSize)
          .withDictionaryPageSize(dictionaryPageSize)
          .withDictionaryEncoding(enableDictionary)
          .withWriterVersion(writerVersion)
          .build());
}
 
Example #6
Source File: ParquetWriter.java    From parquet-mr with Apache License 2.0
/**
 * Create a new ParquetWriter.
 *
 * @param file the file to create
 * @param writeSupport the implementation to write a record to a RecordConsumer
 * @param compressionCodecName the compression codec to use
 * @param blockSize the block size threshold
 * @param pageSize the page size threshold
 * @param dictionaryPageSize the page size threshold for the dictionary pages
 * @param enableDictionary to turn dictionary encoding on
 * @param validating to turn on validation using the schema
 * @param writerVersion version of parquetWriter from {@link ParquetProperties.WriterVersion}
 * @param conf Hadoop configuration to use while accessing the filesystem
 * @throws IOException if there is an error while writing
 * @deprecated will be removed in 2.0.0
 */
@Deprecated
public ParquetWriter(
    Path file,
    WriteSupport<T> writeSupport,
    CompressionCodecName compressionCodecName,
    int blockSize,
    int pageSize,
    int dictionaryPageSize,
    boolean enableDictionary,
    boolean validating,
    WriterVersion writerVersion,
    Configuration conf) throws IOException {
  this(file, ParquetFileWriter.Mode.CREATE, writeSupport,
      compressionCodecName, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, validating, writerVersion, conf);
}
 
Example #7
Source File: GroupWriteSupport.java    From parquet-mr with Apache License 2.0
@Override
public org.apache.parquet.hadoop.api.WriteSupport.WriteContext init(Configuration configuration) {
  // if present, prefer the schema passed to the constructor
  if (schema == null) {
    schema = getSchema(configuration);
  }
  return new WriteContext(schema, this.extraMetaData);
}
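
The schema that init falls back to is read from the Configuration. A short usage sketch: GroupWriteSupport.setSchema stores the schema under the key that getSchema later reads (the schema string here is a placeholder):

Configuration conf = new Configuration();
MessageType schema = MessageTypeParser.parseMessageType(
    "message example { required binary name (UTF8); }");
// Store the schema in the configuration so that init(conf) can resolve it
// when no schema was passed to the constructor.
GroupWriteSupport.setSchema(schema, conf);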
 
Example #8
Source File: ParquetRecordWriter.java    From parquet-mr with Apache License 2.0
/**
 *
 * @param w the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param blockSize the size of a block in the file (this will be approximate)
 * @param pageSize the size of a page in the file (this will be approximate)
 * @param compressor the compressor used to compress the pages
 * @param dictionaryPageSize the threshold for dictionary size
 * @param enableDictionary to enable the dictionary
 * @param validating if schema validation should be turned on
 * @param writerVersion writer compatibility version
 */
@Deprecated
public ParquetRecordWriter(
    ParquetFileWriter w,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    int blockSize, int pageSize,
    BytesCompressor compressor,
    int dictionaryPageSize,
    boolean enableDictionary,
    boolean validating,
    WriterVersion writerVersion) {
  ParquetProperties props = ParquetProperties.builder()
      .withPageSize(pageSize)
      .withDictionaryPageSize(dictionaryPageSize)
      .withDictionaryEncoding(enableDictionary)
      .withWriterVersion(writerVersion)
      .build();
  internalWriter = new InternalParquetRecordWriter<T>(w, writeSupport, schema,
      extraMetaData, blockSize, compressor, validating, props);
  this.memoryManager = null;
  this.codecFactory = null;
}
 
Example #9
Source File: ParquetOutputFormat.java    From parquet-mr with Apache License 2.0
public static Class<?> getWriteSupportClass(Configuration configuration) {
  final String className = configuration.get(WRITE_SUPPORT_CLASS);
  if (className == null) {
    return null;
  }
  final Class<?> writeSupportClass = ConfigurationUtil.getClassFromConfig(configuration, WRITE_SUPPORT_CLASS, WriteSupport.class);
  return writeSupportClass;
}
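
The WRITE_SUPPORT_CLASS entry this method reads is normally set on the job side. A usage sketch, with GroupWriteSupport standing in for any WriteSupport implementation:

Job job = Job.getInstance(new Configuration());
job.setOutputFormatClass(ParquetOutputFormat.class);
// Registers the class under WRITE_SUPPORT_CLASS so that
// getWriteSupportClass(conf) can resolve it later.
ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);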
 
Example #10
Source File: ParquetOutputFormat.java    From parquet-mr with Apache License 2.0
/**
 * @param configuration to find the configuration for the write support class
 * @return the configured write support
 */
@SuppressWarnings("unchecked")
public WriteSupport<T> getWriteSupport(Configuration configuration) {
  if (writeSupport != null) return writeSupport;
  Class<?> writeSupportClass = getWriteSupportClass(configuration);
  try {
    return (WriteSupport<T>) Objects
        .requireNonNull(writeSupportClass, "writeSupportClass cannot be null")
        .newInstance();
  } catch (InstantiationException | IllegalAccessException e) {
    throw new BadConfigurationException("could not instantiate write support class: " + writeSupportClass, e);
  }
}
 
Example #11
Source File: HoodieAvroWriteSupport.java    From hudi with Apache License 2.0
@Override
public WriteSupport.FinalizedWriteContext finalizeWrite() {
  HashMap<String, String> extraMetaData = new HashMap<>();
  if (bloomFilter != null) {
    extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString());
    if (minRecordKey != null && maxRecordKey != null) {
      extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey);
      extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey);
    }
    if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) {
      extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilter.getBloomFilterTypeCode().name());
    }
  }
  return new WriteSupport.FinalizedWriteContext(extraMetaData);
}
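
The map returned through FinalizedWriteContext ends up in the file footer's key/value metadata. A sketch of reading it back with parquet-mr (conf and path are placeholders):

Map<String, String> meta = ParquetFileReader
    .readFooter(conf, path, ParquetMetadataConverter.NO_FILTER)
    .getFileMetaData()
    .getKeyValueMetaData();
// e.g. meta.get(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY) returns the
// serialized bloom filter written by finalizeWrite() above.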
 
Example #12
Source File: ParquetWriter.java    From parquet-mr with Apache License 2.0
/**
 * Create a new ParquetWriter.
 *
 * @param file the file to create
 * @param writeSupport the implementation to write a record to a RecordConsumer
 * @param compressionCodecName the compression codec to use
 * @param blockSize the block size threshold
 * @param pageSize the page size threshold
 * @param dictionaryPageSize the page size threshold for the dictionary pages
 * @param enableDictionary to turn dictionary encoding on
 * @param validating to turn on validation using the schema
 * @throws IOException if there is an error while writing
 * @deprecated will be removed in 2.0.0
 */
@Deprecated
public ParquetWriter(
    Path file,
    WriteSupport<T> writeSupport,
    CompressionCodecName compressionCodecName,
    int blockSize,
    int pageSize,
    int dictionaryPageSize,
    boolean enableDictionary,
    boolean validating) throws IOException {
  this(file, writeSupport, compressionCodecName, blockSize, pageSize,
      dictionaryPageSize, enableDictionary, validating,
      DEFAULT_WRITER_VERSION);
}
 
Example #13
Source File: Parquet.java    From iceberg with Apache License 2.0
@Override
protected WriteSupport<T> getWriteSupport(Configuration configuration) {
  for (Map.Entry<String, String> entry : config.entrySet()) {
    configuration.set(entry.getKey(), entry.getValue());
  }
  return new ParquetWriteSupport<>(type, keyValueMetadata, writeSupport);
}
 
Example #14
Source File: ParquetWriter.java    From parquet-mr with Apache License 2.0
@Deprecated
public ParquetWriter(Path file, Configuration conf, WriteSupport<T> writeSupport) throws IOException {
  this(file,
      writeSupport,
      DEFAULT_COMPRESSION_CODEC_NAME,
      DEFAULT_BLOCK_SIZE,
      DEFAULT_PAGE_SIZE,
      DEFAULT_PAGE_SIZE,
      DEFAULT_IS_DICTIONARY_ENABLED,
      DEFAULT_IS_VALIDATING_ENABLED,
      DEFAULT_WRITER_VERSION,
      conf);
}
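
Since these constructors are deprecated, new code typically goes through a builder instead. A sketch of the equivalent using parquet-mr's ExampleParquetWriter (file, schema, and conf are placeholders):

ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
    .withConf(conf)
    .withType(schema)  // MessageType of the records
    .withCompressionCodec(CompressionCodecName.SNAPPY)
    .withDictionaryEncoding(true)
    .build();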
 
Example #15
Source File: Parquet.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
private <T> WriteSupport<T> getWriteSupport(MessageType type) {
  if (writeSupport != null) {
    return (WriteSupport<T>) writeSupport;
  } else {
    return new AvroWriteSupport<>(
        type,
        ParquetAvro.parquetAvroSchema(AvroSchemaUtil.convert(schema, name)),
        ParquetAvro.DEFAULT_MODEL);
  }
}
 
Example #16
Source File: ParquetWriter.java    From parquet-mr with Apache License 2.0
ParquetWriter(
    OutputFile file,
    ParquetFileWriter.Mode mode,
    WriteSupport<T> writeSupport,
    CompressionCodecName compressionCodecName,
    int rowGroupSize,
    boolean validating,
    Configuration conf,
    int maxPaddingSize,
    ParquetProperties encodingProps) throws IOException {

  WriteSupport.WriteContext writeContext = writeSupport.init(conf);
  MessageType schema = writeContext.getSchema();

  ParquetFileWriter fileWriter = new ParquetFileWriter(
    file, schema, mode, rowGroupSize, maxPaddingSize,
    encodingProps.getColumnIndexTruncateLength(), encodingProps.getStatisticsTruncateLength(),
    encodingProps.getPageWriteChecksumEnabled());
  fileWriter.start();

  this.codecFactory = new CodecFactory(conf, encodingProps.getPageSizeThreshold());
  CodecFactory.BytesCompressor compressor = codecFactory.getCompressor(compressionCodecName);
  this.writer = new InternalParquetRecordWriter<T>(
      fileWriter,
      writeSupport,
      schema,
      writeContext.getExtraMetaData(),
      rowGroupSize,
      compressor,
      validating,
      encodingProps);
}
 
Example #17
Source File: ParquetOutputFormat.java    From parquet-mr with Apache License 2.0
public RecordWriter<Void, T> getRecordWriter(Configuration conf, Path file, CompressionCodecName codec, Mode mode)
      throws IOException, InterruptedException {
  final WriteSupport<T> writeSupport = getWriteSupport(conf);

  ParquetProperties.Builder propsBuilder = ParquetProperties.builder()
      .withPageSize(getPageSize(conf))
      .withDictionaryPageSize(getDictionaryPageSize(conf))
      .withDictionaryEncoding(getEnableDictionary(conf))
      .withWriterVersion(getWriterVersion(conf))
      .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf))
      .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf))
      .withMaxRowCountForPageSizeCheck(getMaxRowCountForPageSizeCheck(conf))
      .withColumnIndexTruncateLength(getColumnIndexTruncateLength(conf))
      .withStatisticsTruncateLength(getStatisticsTruncateLength(conf))
      .withMaxBloomFilterBytes(getBloomFilterMaxBytes(conf))
      .withBloomFilterEnabled(getBloomFilterEnabled(conf))
      .withPageRowCountLimit(getPageRowCountLimit(conf))
      .withPageWriteChecksumEnabled(getPageWriteChecksumEnabled(conf));
  new ColumnConfigParser()
      .withColumnConfig(ENABLE_DICTIONARY, key -> conf.getBoolean(key, false), propsBuilder::withDictionaryEncoding)
      .withColumnConfig(BLOOM_FILTER_ENABLED, key -> conf.getBoolean(key, false),
          propsBuilder::withBloomFilterEnabled)
      .withColumnConfig(BLOOM_FILTER_EXPECTED_NDV, key -> conf.getLong(key, -1L), propsBuilder::withBloomFilterNDV)
      .parseConfig(conf);

  ParquetProperties props = propsBuilder.build();

  long blockSize = getLongBlockSize(conf);
  int maxPaddingSize = getMaxPaddingSize(conf);
  boolean validating = getValidation(conf);

  if (LOG.isInfoEnabled()) {
    LOG.info("Parquet block size to {}", blockSize);
    LOG.info("Validation is {}", (validating ? "on" : "off"));
    LOG.info("Maximum row group padding size is {} bytes", maxPaddingSize);
    LOG.info("Parquet properties are:\n{}", props);
  }

  WriteContext init = writeSupport.init(conf);
  ParquetFileWriter w = new ParquetFileWriter(HadoopOutputFile.fromPath(file, conf),
      init.getSchema(), mode, blockSize, maxPaddingSize, props.getColumnIndexTruncateLength(),
      props.getStatisticsTruncateLength(), props.getPageWriteChecksumEnabled());
  w.start();

  float maxLoad = conf.getFloat(ParquetOutputFormat.MEMORY_POOL_RATIO,
      MemoryManager.DEFAULT_MEMORY_POOL_RATIO);
  long minAllocation = conf.getLong(ParquetOutputFormat.MIN_MEMORY_ALLOCATION,
      MemoryManager.DEFAULT_MIN_MEMORY_ALLOCATION);
  synchronized (ParquetOutputFormat.class) {
    if (memoryManager == null) {
      memoryManager = new MemoryManager(maxLoad, minAllocation);
    }
  }
  if (memoryManager.getMemoryPoolRatio() != maxLoad) {
    LOG.warn("The configuration " + MEMORY_POOL_RATIO + " has been set. It should not " +
        "be reset by the new value: " + maxLoad);
  }

  return new ParquetRecordWriter<T>(
      w,
      writeSupport,
      init.getSchema(),
      init.getExtraMetaData(),
      blockSize,
      codec,
      validating,
      props,
      memoryManager,
      conf);
}
 
Example #18
Source File: AvroParquetWriter.java    From parquet-mr with Apache License 2.0
private static <T> WriteSupport<T> writeSupport(Schema avroSchema,
                                                GenericData model) {
  return new AvroWriteSupport<T>(
      new AvroSchemaConverter().convert(avroSchema), avroSchema, model);
}
 
Example #19
Source File: AvroParquetWriter.java    From parquet-mr with Apache License 2.0
private static <T> WriteSupport<T> writeSupport(Configuration conf,
                                                Schema avroSchema,
                                                GenericData model) {
  return new AvroWriteSupport<T>(
      new AvroSchemaConverter(conf).convert(avroSchema), avroSchema, model);
}
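
These private helpers are reached through AvroParquetWriter's public builder. A usage sketch (path, avroSchema, conf, and record are placeholders):

ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(path)
    .withSchema(avroSchema)           // org.apache.avro.Schema
    .withDataModel(GenericData.get()) // the GenericData model passed through
    .withConf(conf)
    .build();
writer.write(record);
writer.close();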
 
Example #20
Source File: ExtraMetadataWriteSupport.java    From garmadon with Apache License 2.0
public ExtraMetadataWriteSupport(WriteSupport<T> delegate) {
    super(delegate);
}
 
Example #21
Source File: AvroParquetWriter.java    From parquet-mr with Apache License 2.0
@Override
protected WriteSupport<T> getWriteSupport(Configuration conf) {
  return AvroParquetWriter.writeSupport(conf, schema, model);
}
 
Example #22
Source File: ParquetWriterFactory.java    From osm-parquetizer with Apache License 2.0
@Override
protected WriteSupport<Relation> getWriteSupport(Configuration conf) {
    return new RelationWriteSupport(excludeMetadata);
}
 
Example #23
Source File: ParquetWriteSupport.java    From iceberg with Apache License 2.0
ParquetWriteSupport(MessageType type, Map<String, String> keyValueMetadata, WriteSupport<T> writeSupport) {
  this.type = type;
  this.keyValueMetadata = keyValueMetadata;
  this.wrapped = writeSupport;
}
 
Example #24
Source File: Parquet.java    From iceberg with Apache License 2.0
public WriteBuilder writeSupport(WriteSupport<?> newWriteSupport) {
  this.writeSupport = newWriteSupport;
  return this;
}
 
Example #25
Source File: Parquet.java    From iceberg with Apache License 2.0
public ParquetWriteBuilder<T> setWriteSupport(WriteSupport<T> writeSupport) {
  this.writeSupport = writeSupport;
  return self();
}
 
Example #26
Source File: ParquetWriterFactory.java    From osm-parquetizer with Apache License 2.0
@Override
protected WriteSupport<Way> getWriteSupport(Configuration conf) {
    return new WayWriteSupport(excludeMetadata);
}