Java Code Examples for org.apache.parquet.hadoop.ParquetWriter#DEFAULT_IS_DICTIONARY_ENABLED

The following examples show how to use org.apache.parquet.hadoop.ParquetWriter#DEFAULT_IS_DICTIONARY_ENABLED. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: ExaParquetWriterImpl.java From hadoop-etl-udfs with MIT License

6 votes

private ExaParquetWriterImpl(final MessageType schema,
                             final int numColumns,
                             final Configuration conf,
                             final Path path,
                             final String compressionType,
                             final ExaIterator exa,
                             final int firstColumnIndex,
                             final List<Integer> dynamicPartitionExaColNums) throws Exception {
    super(path,
            new TupleWriteSupport(schema, conf),
            CompressionCodecName.fromConf(compressionType),
            ParquetWriter.DEFAULT_BLOCK_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
            ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
            PARQUET_WRITER_VERSION,
            conf);

    System.out.println("Path: " + path.toString());
    System.out.println("Parquet schema:\n" + schema);

    // Create Tuple object with ExaIterator reference.
    this.row = new Tuple(exa, numColumns, firstColumnIndex, dynamicPartitionExaColNums);
}

Example 2

Source File: HiveTestUtil.java From hudi with Apache License 2.0

6 votes

/**
 * Writes generated test records to {@code filePath} as a GZIP-compressed Parquet file.
 *
 * <p>The Avro schema comes from {@code getTestDataSchema(isParquetSchemaSimple)}. When the
 * flag is {@code true} the records come from {@code SchemaTestUtil.generateTestRecords(0, 100)},
 * otherwise from {@code SchemaTestUtil.generateEvolvedTestRecords(100, 100)}.
 *
 * <p>The writer is managed with try-with-resources so it is closed even when a write fails
 * and {@code fail(...)} aborts the test. Records are generated before the writer is opened,
 * so a failure during generation cannot leave an open writer behind.
 *
 * @param filePath destination of the Parquet file
 * @param isParquetSchemaSimple selects which schema and record generator are used
 * @throws IOException if opening or closing the writer fails
 * @throws URISyntaxException propagated from the schema/record helpers (not visible here)
 */
@SuppressWarnings({"unchecked", "deprecation"})
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
    throws IOException, URISyntaxException {
  Schema schema = getTestDataSchema(isParquetSchemaSimple);
  org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
  BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);

  List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
      : SchemaTestUtil.generateEvolvedTestRecords(100, 100));

  // try-with-resources guarantees close() runs even if fail(...) throws from inside the lambda.
  try (ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf())) {
    testRecords.forEach(s -> {
      try {
        writer.write(s);
      } catch (IOException e) {
        // A lambda passed to forEach cannot throw checked exceptions, so fail the test instead.
        fail("IOException while writing test records as parquet: " + e);
      }
    });
  }
}

Example 3

Source File: HoodieParquetWriter.java From hudi with Apache License 2.0

6 votes

/**
 * Creates a Parquet writer for {@code file} using the settings carried by {@code parquetConfig}.
 *
 * <p>The parent writer is opened in {@code ParquetFileWriter.Mode.CREATE} on the path returned by
 * {@code HoodieWrapperFileSystem.convertToHoodiePath}, with the write support, compression codec,
 * block size and page size taken from {@code parquetConfig}. {@code getPageSize()} is passed for
 * both page-size arguments, and dictionary encoding, validation and writer version use the
 * {@code ParquetWriter} defaults.
 *
 * @param instantTime stored on the writer as-is
 * @param file target file; converted to a hoodie path before use
 * @param parquetConfig source of the Hadoop configuration, write support, codec, sizes and
 *     compression ratio
 * @param schema not referenced in this constructor body
 * @param sparkTaskContextSupplier stored on the writer as-is
 * @throws IOException if the parent writer or the file system cannot be obtained
 */
public HoodieParquetWriter(String instantTime, Path file, HoodieParquetConfig parquetConfig,
    Schema schema, SparkTaskContextSupplier sparkTaskContextSupplier) throws IOException {
  super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
      ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
      parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      ParquetWriter.DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
  // The path and configuration are recomputed here because locals cannot be declared before super(...).
  this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
  this.fs =
      (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
  // We cannot accurately measure the compressed output file size, so the configured maximum is
  // padded by a fraction of itself: maxFileSize = max + round(max * compressionRatio).
  // NOTE(review): an earlier comment described this as a conservative 10% for snappy; the actual
  // ratio comes from parquetConfig.getCompressionRatio() - confirm the configured default.
  // TODO - compute this compression ratio dynamically by looking at the bytes written to the
  // stream and the actual file size reported by HDFS
  this.maxFileSize = parquetConfig.getMaxFileSize()
      + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
  this.writeSupport = parquetConfig.getWriteSupport();
  this.instantTime = instantTime;
  this.sparkTaskContextSupplier = sparkTaskContextSupplier;
}

Example 4

Source File: ParquetAppender.java From kite with Apache License 2.0

5 votes

/**
 * Opens the Avro Parquet writer on the qualified form of {@code path}.
 *
 * <p>The codec is {@code getCompressionCodecName()} when {@code enableCompression} is set and
 * {@code CompressionCodecName.UNCOMPRESSED} otherwise. Row group size comes from
 * {@code DEFAULT_ROW_GROUP_SIZE}; page size and dictionary encoding use the
 * {@code ParquetWriter} defaults.
 *
 * @throws IOException if the writer cannot be created
 */
@Override
public void open() throws IOException {
  // Pick the codec in one expression: the configured one, or no compression at all.
  final CompressionCodecName codecName =
      enableCompression ? getCompressionCodecName() : CompressionCodecName.UNCOMPRESSED;

  avroParquetWriter = new AvroParquetWriter<E>(
      fileSystem.makeQualified(path),
      schema,
      codecName,
      DEFAULT_ROW_GROUP_SIZE,
      ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
      conf);
}