Java Code Examples for org.apache.parquet.hadoop.metadata.CompressionCodecName#SNAPPY

The following examples show how to use org.apache.parquet.hadoop.metadata.CompressionCodecName#SNAPPY. Each example is taken from an open-source project; the project and source file are noted above each snippet.
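Before the project examples, here is a minimal, self-contained sketch of the typical write path using parquet-hadoop's example Group API. It is not taken from any of the projects below; the class name, schema, and output path are illustrative assumptions.

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class SnappyWriteSketch {
  public static void main(String[] args) throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required binary name (UTF8); required int32 id; }");

    // withCompressionCodec is where CompressionCodecName.SNAPPY comes in;
    // it applies to every column chunk the writer produces.
    try (ParquetWriter<Group> writer = ExampleParquetWriter
        .builder(new Path("/tmp/example-snappy.parquet")) // illustrative path
        .withType(schema)
        .withCompressionCodec(CompressionCodecName.SNAPPY)
        .build()) {
      Group row = new SimpleGroupFactory(schema).newGroup()
          .append("name", "alice")
          .append("id", 1);
      writer.write(row);
    }
  }
}
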
Example 1
Source File: ParquetAppender.java    From kite with Apache License 2.0
private CompressionCodecName getCompressionCodecName() {
  switch (compressionType) {
    case Snappy:
      return CompressionCodecName.SNAPPY;

    case Lzo:
      return CompressionCodecName.LZO;

    case Deflate:
      // Parquet has no DEFLATE codec name, so Deflate maps to GZIP
      // (gzip is DEFLATE with gzip framing).
      return CompressionCodecName.GZIP;

    default:
      throw new IllegalArgumentException(String.format(
          "Unsupported compression format %s. Supported formats: %s",
          compressionType.getName(), Arrays.toString(
              Formats.PARQUET.getSupportedCompressionTypes().toArray())));
  }
}
 
Example 2
Source File: DirectCodecFactory.java    From parquet-mr with Apache License 2.0
@Override
protected BytesCompressor createCompressor(final CompressionCodecName codecName) {

  CompressionCodec codec = getCodec(codecName);
  if (codec == null) {
    return new NoopCompressor();
  } else if (codecName == CompressionCodecName.SNAPPY) {
    // avoid using the default Snappy codec since it allocates direct buffers at awkward spots.
    return new SnappyCompressor();
  } else {
    // todo: create class similar to the SnappyCompressor for zlib and exclude it as
    // snappy is above since it also generates allocateDirect calls.
    return new HeapBytesCompressor(codecName);
  }
}
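The special-casing is deliberate: as the inline comments explain, the stock Hadoop Snappy codec allocates direct buffers at points the factory cannot control, so SNAPPY is routed to parquet-mr's own SnappyCompressor and only the remaining codecs fall through to the heap-based compressor.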
 
Example 3
Source File: DirectCodecFactory.java    From parquet-mr with Apache License 2.0
@Override
protected BytesDecompressor createDecompressor(final CompressionCodecName codecName) {
  CompressionCodec codec = getCodec(codecName);
  if (codec == null) {
    return new NoopDecompressor();
  } else if (codecName == CompressionCodecName.SNAPPY) {
    return new SnappyDecompressor();
  } else if (DirectCodecPool.INSTANCE.codec(codec).supportsDirectDecompression()) {
    return new FullDirectDecompressor(codecName);
  } else {
    return new IndirectDecompressor(codec);
  }
}
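The decompression side mirrors the compressor: SNAPPY again gets the dedicated implementation, other codecs take the fully direct path only when the pooled codec reports supportsDirectDecompression(), and everything else falls back to the indirect, heap-based decompressor.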
 
Example 4
Source File: TestBinary.java    From parquet-mr with Apache License 2.0
@Test
public void testBinary() throws IOException {
  StringAndBinary expected = new StringAndBinary("test",
      ByteBuffer.wrap(new byte[] { -123, 20, 33 }));
  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  // Delete the placeholder so the writer can create the file itself;
  // ParquetWriter's default CREATE mode fails if the target already exists.
  temp.delete();

  Path path = new Path(temp.getPath());

  ThriftParquetWriter<StringAndBinary> writer =
      new ThriftParquetWriter<StringAndBinary>(
          path, StringAndBinary.class, CompressionCodecName.SNAPPY);
  writer.write(expected);
  writer.close();

  ParquetReader<StringAndBinary> reader = ThriftParquetReader.<StringAndBinary>
      build(path)
      .withThriftClass(StringAndBinary.class)
      .build();

  StringAndBinary record = reader.read();
  reader.close();

  assertSchema(ParquetFileReader.readFooter(new Configuration(), path));
  assertEquals("Should match after serialization round trip",
      expected, record);
}
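Note that the codec appears only on the write side, as the third constructor argument; the reader is built without any codec hint because the compression codec is recorded in the Parquet file metadata and picked up automatically.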
 
Example 5
Source File: ParquetRecordWriter.java    From Bats with Apache License 2.0
@Override
public void init(Map<String, String> writerOptions) throws IOException {
  this.location = writerOptions.get("location");
  this.prefix = writerOptions.get("prefix");

  fs = FileSystem.get(conf);
  blockSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_BLOCK_SIZE));
  pageSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_PAGE_SIZE));
  dictionaryPageSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_DICT_PAGE_SIZE));
  String codecName = writerOptions.get(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE).toLowerCase();
  switch(codecName) {
  case "snappy":
    codec = CompressionCodecName.SNAPPY;
    break;
  case "lzo":
    codec = CompressionCodecName.LZO;
    break;
  case "gzip":
    codec = CompressionCodecName.GZIP;
    break;
  case "none":
  case "uncompressed":
    codec = CompressionCodecName.UNCOMPRESSED;
    break;
  default:
    throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  String logicalTypeNameForDecimals = writerOptions.get(ExecConstants.PARQUET_WRITER_LOGICAL_TYPE_FOR_DECIMALS).toLowerCase();
  switch (logicalTypeNameForDecimals) {
    case "fixed_len_byte_array":
      logicalTypeForDecimals = PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
      break;
    case "binary":
      logicalTypeForDecimals = PrimitiveTypeName.BINARY;
      break;
    default:
      throw new UnsupportedOperationException(
          String.format(
              "Unsupported logical type for decimals: %s\n" +
              "Supported types: ['fixed_len_byte_array', 'binary']", logicalTypeNameForDecimals));
  }

  enableDictionary = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING));
  useSingleFSBlock = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_USE_SINGLE_FS_BLOCK));
  usePrimitiveTypesForDecimals = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_USE_PRIMITIVE_TYPES_FOR_DECIMALS));

  if (useSingleFSBlock) {
    // Round up blockSize to multiple of 64K.
    blockSize = (int)ceil((double)blockSize/BLOCKSIZE_MULTIPLE) * BLOCKSIZE_MULTIPLE;
  }
}
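As an aside (this helper is not part of the Bats source), parquet-mr can perform the string-to-enum mapping itself via CompressionCodecName.fromConf; a minimal sketch:

import org.apache.parquet.hadoop.metadata.CompressionCodecName;

class CodecFromConf {
  static CompressionCodecName parse(String name) {
    // fromConf(null) returns UNCOMPRESSED; unrecognized names throw
    // IllegalArgumentException. Unlike the switch above, it accepts
    // "uncompressed" but not "none".
    return CompressionCodecName.fromConf(name); // e.g. "snappy" -> SNAPPY
  }
}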
 
Example 6
Source File: ParquetRecordWriter.java    From dremio-oss with Apache License 2.0
public ParquetRecordWriter(OperatorContext context, ParquetWriter writer, ParquetFormatConfig config) throws OutOfMemoryException {
  this.context = context;
  this.codecAllocator = context.getAllocator().newChildAllocator("ParquetCodecFactory", 0, Long.MAX_VALUE);
  this.columnEncoderAllocator = context.getAllocator().newChildAllocator("ParquetColEncoder", 0, Long.MAX_VALUE);
  this.codecFactory = CodecFactory.createDirectCodecFactory(new Configuration(),
      new ParquetDirectByteBufferAllocator(codecAllocator), pageSize);
  this.extraMetaData.put(DREMIO_VERSION_PROPERTY, DremioVersionInfo.getVersion());
  this.extraMetaData.put(IS_DATE_CORRECT_PROPERTY, "true");

  this.plugin = writer.getFormatPlugin().getFsPlugin();
  this.queryUser = writer.getProps().getUserName();

  FragmentHandle handle = context.getFragmentHandle();
  String fragmentId = String.format("%d_%d", handle.getMajorFragmentId(), handle.getMinorFragmentId());

  this.location = writer.getLocation();
  this.prefix = fragmentId;
  this.extension = config.outputExtension;
  if (writer.getOptions() != null) {
    this.partitionColumns = writer.getOptions().getPartitionColumns();
    this.isIcebergWriter = (writer.getOptions().getIcebergWriterOperation() != WriterOptions.IcebergWriterOperation.NONE);
  } else {
    this.partitionColumns = null;
    this.isIcebergWriter = false;
  }

  if (this.isIcebergWriter && writer.getOptions().getExtendedProperty() != null) {
    initIcebergColumnIDList(writer.getOptions().getExtendedProperty());
  }

  memoryThreshold = (int) context.getOptions().getOption(ExecConstants.PARQUET_MEMORY_THRESHOLD_VALIDATOR);
  blockSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_BLOCK_SIZE_VALIDATOR);
  pageSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_PAGE_SIZE_VALIDATOR);
  final String codecName = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE_VALIDATOR).toLowerCase();
  switch(codecName) {
  case "snappy":
    codec = CompressionCodecName.SNAPPY;
    break;
  case "lzo":
    codec = CompressionCodecName.LZO;
    break;
  case "gzip":
    codec = CompressionCodecName.GZIP;
    break;
  case "none":
  case "uncompressed":
    codec = CompressionCodecName.UNCOMPRESSED;
    break;
  default:
    throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  enableDictionary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_VALIDATOR);
  enableDictionaryForBinary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_BINARY_TYPE_VALIDATOR);
  maxPartitions = context.getOptions().getOption(ExecConstants.PARQUET_MAXIMUM_PARTITIONS_VALIDATOR);
  minRecordsForFlush = context.getOptions().getOption(ExecConstants.PARQUET_MIN_RECORDS_FOR_FLUSH_VALIDATOR);
  parquetFileWriteTimeThresholdMilliSecs = (int)context.getOptions().getOption(ExecConstants.PARQUET_WRITE_TIME_THRESHOLD_MILLI_SECS_VALIDATOR);
  parquetFileWriteIoRateThresholdMbps = context.getOptions().getOption(ExecConstants.PARQUET_WRITE_IO_RATE_THRESHOLD_MBPS_VALIDATOR);
}
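Apart from reading its settings from the operator context rather than a writer-options map, the codec selection is the same switch as in Example 5, accepting "snappy", "lzo", "gzip", and "none"/"uncompressed".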
 
Example 7
Source File: DirectCodecFactory.java    From parquet-mr with Apache License 2.0
public SnappyDecompressor() {
  this.extraDecompressor = new HeapBytesDecompressor(CompressionCodecName.SNAPPY);
}
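A detail worth noting: this heap-based decompressor appears to back the non-direct decompression path inside DirectCodecFactory's custom SnappyDecompressor, complementing the direct-buffer path shown in Example 3.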
 
Example 8
Source File: DirectCodecFactory.java    From parquet-mr with Apache License 2.0
@Override
public CompressionCodecName getCodecName() {
  return CompressionCodecName.SNAPPY;
}
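 
For completeness: no codec needs to be named when reading the files these snippets produce, since the codec of each column chunk is stored in the file footer. A minimal read sketch, again using the example Group API (class name and path are illustrative assumptions):

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class SnappyReadSketch {
  public static void main(String[] args) throws Exception {
    // The reader discovers the SNAPPY codec from the file metadata.
    try (ParquetReader<Group> reader = ParquetReader
        .builder(new GroupReadSupport(), new Path("/tmp/example-snappy.parquet"))
        .build()) {
      Group row;
      while ((row = reader.read()) != null) {
        System.out.println(row);
      }
    }
  }
}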