org.apache.parquet.hadoop.metadata.CompressionCodecName Java Examples

The following examples show how to use org.apache.parquet.hadoop.metadata.CompressionCodecName. They are drawn from several open-source projects; the source file, project, and license are noted above each example.
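As a quick orientation before the project examples, here is a minimal, self-contained sketch of the typical pattern: resolve a codec from its enum name and hand it to a writer builder. The file path, schema, and record below are placeholders invented for illustration; the builder pattern itself mirrors Example #2.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class CompressionCodecNameDemo {
  public static void main(String[] args) throws Exception {
    // CompressionCodecName is an enum, so a codec is usually parsed from a string.
    CompressionCodecName codec = CompressionCodecName.valueOf("SNAPPY");
    System.out.println(codec.getExtension()); // codec-specific file extension, e.g. ".snappy"

    // Hand the codec to a writer builder (same pattern as Example #2 below).
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int32 id; }");
    Configuration conf = new Configuration();
    GroupWriteSupport.setSchema(schema, conf);
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(new Path("/tmp/demo.parquet"))
        .withType(schema)
        .withConf(conf)
        .withCompressionCodec(codec)
        .build()) {
      writer.write(new SimpleGroupFactory(schema).newGroup().append("id", 1));
    }
  }
}
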
Example #1
Source File: TransCompressionCommand.java    From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
    "Both input and output parquet file paths are required.");

  Preconditions.checkArgument(codec != null,
    "The codec cannot be null");

  Path inPath = new Path(input);
  Path outPath = new Path(output);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
 
Example #2
Source File: ApacheParquet.java    From sylph with Apache License 2.0
private ApacheParquet(String outputPath, MessageType schema, WriterVersion writerVersion)
        throws IOException
{
    this.schema = schema;
    this.outputPath = outputPath;

    Configuration configuration = new Configuration();
    GroupWriteSupport.setSchema(schema, configuration);

    this.writer = ExampleParquetWriter.builder(new Path(outputPath))
            .withType(schema)
            .withConf(configuration)
            .withPageSize(DEFAULT_PAGE_SIZE)
            .withDictionaryPageSize(DEFAULT_PAGE_SIZE)
            .withDictionaryEncoding(DEFAULT_IS_DICTIONARY_ENABLED)
            .withValidation(DEFAULT_IS_VALIDATING_ENABLED)
            .withWriterVersion(writerVersion)
            .withRowGroupSize(DEFAULT_BLOCK_SIZE) // set Parquet file block size and page size values
            .withCompressionCodec(CompressionCodecName.UNCOMPRESSED) // compression type
            .build();

    this.groupFactory = new SimpleGroupFactory(this.schema);
}
 
Example #3
Source File: Codecs.java    From parquet-mr with Apache License 2.0
public static CodecFactory avroCodec(String codec) {
  CompressionCodecName parquetCodec = parquetCodec(codec);
  switch (parquetCodec) {
    case UNCOMPRESSED:
      return CodecFactory.nullCodec();
    case SNAPPY:
      return CodecFactory.snappyCodec();
    case GZIP:
      return CodecFactory.deflateCodec(9);
    case ZSTD:
      return CodecFactory.zstandardCodec(CodecFactory.DEFAULT_ZSTANDARD_LEVEL);
    default:
      throw new IllegalArgumentException(
          "Codec incompatible with Avro: " + codec);
  }
}
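A note on the mapping above: Avro has no GZIP codec, so this helper translates GZIP to Avro's deflate codec at its highest level (9), while LZO, BROTLI, and LZ4 have no counterpart here and fall through to the IllegalArgumentException.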
 
Example #4
Source File: ParquetCompressor.java    From presto with Apache License 2.0
static ParquetCompressor getCompressor(CompressionCodecName codec)
{
    // TODO Support LZO and LZ4 compression
    // When using airlift LZO or LZ4 compressor, decompressing page in reader throws exception.
    switch (codec.getParquetCompressionCodec()) {
        case GZIP:
            return new GzipCompressor();
        case SNAPPY:
            return new AirLiftCompressor(new SnappyCompressor());
        case ZSTD:
            return new AirLiftCompressor(new ZstdCompressor());
        case UNCOMPRESSED:
            return null;
    }
    throw new RuntimeException(String.format("%s codec is not supported", codec));
}
 
Example #5
Source File: TestDirectCodecFactory.java    From parquet-mr with Apache License 2.0
@Test
public void compressionCodecs() throws Exception {
  final int[] sizes = { 4 * 1024, 1 * 1024 * 1024 };
  final boolean[] comp = { true, false };
  Set<CompressionCodecName> codecsToSkip = new HashSet<>();
  codecsToSkip.add(LZO); // not distributed because it is GPL
  codecsToSkip.add(LZ4); // not distributed in the default version of Hadoop
  codecsToSkip.add(ZSTD); // not distributed in the default version of Hadoop

  for (final int size : sizes) {
    for (final boolean useOnHeapComp : comp) {
      for (final Decompression decomp : Decompression.values()) {
        for (final CompressionCodecName codec : CompressionCodecName.values()) {
          if (codecsToSkip.contains(codec)) {
            continue;
          }
          test(size, codec, useOnHeapComp, decomp);
        }
      }
    }
  }
}
 
Example #6
Source File: PrimitiveColumnWriter.java    From presto with Apache License 2.0
public PrimitiveColumnWriter(Type type, ColumnDescriptor columnDescriptor, PrimitiveValueWriter primitiveValueWriter, RunLengthBitPackingHybridEncoder definitionLevelEncoder, RunLengthBitPackingHybridEncoder repetitionLevelEncoder, CompressionCodecName compressionCodecName, int pageSizeThreshold)
{
    this.type = requireNonNull(type, "type is null");
    this.columnDescriptor = requireNonNull(columnDescriptor, "columnDescriptor is null");
    this.maxDefinitionLevel = columnDescriptor.getMaxDefinitionLevel();

    this.definitionLevelEncoder = requireNonNull(definitionLevelEncoder, "definitionLevelEncoder is null");
    this.repetitionLevelEncoder = requireNonNull(repetitionLevelEncoder, "repetitionLevelEncoder is null");
    this.primitiveValueWriter = requireNonNull(primitiveValueWriter, "primitiveValueWriter is null");
    this.encodings = new HashSet<>();
    this.compressionCodec = requireNonNull(compressionCodecName, "compressionCodecName is null");
    this.compressor = getCompressor(compressionCodecName);
    this.pageSizeThreshold = pageSizeThreshold;

    this.columnStatistics = Statistics.createStats(columnDescriptor.getPrimitiveType());
}
 
Example #7
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
public static void writeAndTest(WriteContext context) throws IOException {
  // Create the configuration, and then apply the schema to our configuration.
  Configuration configuration = new Configuration();
  GroupWriteSupport.setSchema(context.schema, configuration);
  GroupWriteSupport groupWriteSupport = new GroupWriteSupport();

  // Create the writer properties
  final int blockSize = context.blockSize;
  final int pageSize = context.pageSize;
  final int dictionaryPageSize = pageSize;
  final boolean enableDictionary = context.enableDictionary;
  final boolean enableValidation = context.enableValidation;
  ParquetProperties.WriterVersion writerVersion = context.version;
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  ParquetWriter<Group> writer = new ParquetWriter<Group>(context.fsPath,
      groupWriteSupport, codec, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, enableValidation, writerVersion, configuration);

  context.write(writer);
  writer.close();

  context.test();

  context.path.delete();
}
 
Example #8
Source File: TestInputOutputFormat.java    From parquet-mr with Apache License 2.0
private void testReadWrite(CompressionCodecName codec, Map<String, String> conf) throws IOException, ClassNotFoundException, InterruptedException {
  runMapReduceJob(codec, conf);
  final BufferedReader in = new BufferedReader(new FileReader(new File(inputPath.toString())));
  final BufferedReader out = new BufferedReader(new FileReader(new File(outputPath.toString(), "part-m-00000")));
  String lineIn;
  String lineOut = null;
  int lineNumber = 0;
  while ((lineIn = in.readLine()) != null && (lineOut = out.readLine()) != null) {
    ++lineNumber;
    lineOut = lineOut.substring(lineOut.indexOf("\t") + 1);
    assertEquals("line " + lineNumber, lineIn, lineOut);
  }
  assertNull("line " + lineNumber, out.readLine());
  assertNull("line " + lineNumber, lineIn);
  in.close();
  out.close();
}
 
Example #9
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example #10
Source File: TestZstandardCodec.java    From parquet-mr with Apache License 2.0
private RunningJob runMapReduceJob(CompressionCodecName codec, JobConf jobConf, Configuration conf, Path parquetPath) throws IOException, ClassNotFoundException, InterruptedException {
  String writeSchema = "message example {\n" +
    "required int32 line;\n" +
    "required binary content;\n" +
    "}";

  FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  jobConf.setInputFormat(TextInputFormat.class);
  TextInputFormat.addInputPath(jobConf, inputPath);
  jobConf.setNumReduceTasks(0);
  jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
  DeprecatedParquetOutputFormat.setCompression(jobConf, codec);
  DeprecatedParquetOutputFormat.setOutputPath(jobConf, parquetPath);
  DeprecatedParquetOutputFormat.setWriteSupportClass(jobConf, GroupWriteSupport.class);
  GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema), jobConf);

  jobConf.setMapperClass(TestZstandardCodec.DumpMapper.class);
  return JobClient.runJob(jobConf);
}
 
Example #11
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Do not write out page level crc checksums, but enable verification on the read path. Tests
 * that the read still succeeds and does not throw an exception.
 */
@Test
public void testWriteOffVerifyOn() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader reader = getParquetFileReader(path, conf,
    Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    assertCorrectContent(readNextPage(colADesc, pageReadStore).getBytes().toByteArray(),
      colAPage1Bytes);
    assertCorrectContent(readNextPage(colADesc, pageReadStore).getBytes().toByteArray(),
      colAPage2Bytes);
    assertCorrectContent(readNextPage(colBDesc, pageReadStore).getBytes().toByteArray(),
      colBPage1Bytes);
    assertCorrectContent(readNextPage(colBDesc, pageReadStore).getBytes().toByteArray(),
      colBPage2Bytes);
  }
}
 
Example #12
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/** Test that we do not write out checksums if the feature is turned off */
@Test
public void testWriteOffVerifyOff() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader reader = getParquetFileReader(path, conf,
    Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    assertCrcNotSet(readNextPage(colADesc, pageReadStore));
    assertCrcNotSet(readNextPage(colADesc, pageReadStore));
    assertCrcNotSet(readNextPage(colBDesc, pageReadStore));
    assertCrcNotSet(readNextPage(colBDesc, pageReadStore));
  }
}
 
Example #13
Source File: TestHoodieAvroWriteSupport.java    From hudi with Apache License 2.0
@Test
public void testAddKey(@TempDir java.nio.file.Path tempDir) throws IOException {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }
  String filePath = tempDir.resolve("test.parquet").toAbsolutePath().toString();
  Schema schema = HoodieAvroUtils.getRecordKeySchema();
  BloomFilter filter = BloomFilterFactory.createBloomFilter(
      1000, 0.0001, 10000,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
      new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
 
Example #14
Source File: ParquetWriter.java    From parquet-mr with Apache License 2.0
/**
 * Create a new ParquetWriter.
 *
 * @param file the file to create
 * @param writeSupport the implementation to write a record to a RecordConsumer
 * @param compressionCodecName the compression codec to use
 * @param blockSize the block size threshold
 * @param pageSize the page size threshold
 * @param dictionaryPageSize the page size threshold for the dictionary pages
 * @param enableDictionary to turn dictionary encoding on
 * @param validating to turn on validation using the schema
 * @param writerVersion version of parquetWriter from {@link ParquetProperties.WriterVersion}
 * @param conf Hadoop configuration to use while accessing the filesystem
 * @throws IOException if there is an error while writing
 * @deprecated will be removed in 2.0.0
 */
@Deprecated
public ParquetWriter(
    Path file,
    WriteSupport<T> writeSupport,
    CompressionCodecName compressionCodecName,
    int blockSize,
    int pageSize,
    int dictionaryPageSize,
    boolean enableDictionary,
    boolean validating,
    WriterVersion writerVersion,
    Configuration conf) throws IOException {
  this(file, ParquetFileWriter.Mode.CREATE, writeSupport,
      compressionCodecName, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, validating, writerVersion, conf);
}
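
Since this constructor is deprecated, the builder shown in Example #2 covers the same settings. Below is a rough, illustrative mapping only: the helper name is invented, and ExampleParquetWriter is just one concrete builder (it is tied to Group records rather than an arbitrary WriteSupport).

static ParquetWriter<Group> buildWriter(
    Path file,
    MessageType schema,
    CompressionCodecName compressionCodecName,
    int blockSize,
    int pageSize,
    int dictionaryPageSize,
    boolean enableDictionary,
    boolean validating,
    ParquetProperties.WriterVersion writerVersion,
    Configuration conf) throws IOException {
  GroupWriteSupport.setSchema(schema, conf);
  return ExampleParquetWriter.builder(file)
      .withType(schema)
      .withConf(conf)
      .withCompressionCodec(compressionCodecName) // compressionCodecName
      .withRowGroupSize(blockSize)                // blockSize
      .withPageSize(pageSize)                     // pageSize
      .withDictionaryPageSize(dictionaryPageSize) // dictionaryPageSize
      .withDictionaryEncoding(enableDictionary)   // enableDictionary
      .withValidation(validating)                 // validating
      .withWriterVersion(writerVersion)           // writerVersion
      .build();
}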
 
Example #15
Source File: Util.java    From parquet-mr with Apache License 2.0
public static String shortCodec(CompressionCodecName codec) {
  switch (codec) {
    case UNCOMPRESSED:
      return "_";
    case SNAPPY:
      return "S";
    case GZIP:
      return "G";
    case LZO:
      return "L";
    case BROTLI:
      return "B";
    case LZ4:
      return "4";
    case ZSTD:
      return "Z";
    default:
      return "?";
  }
}
 
Example #16
Source File: ParquetRecordWriter.java    From parquet-mr with Apache License 2.0
/**
 *
 * @param w the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param blockSize the size of a block in the file (this will be approximate)
 * @param codec the compression codec used to compress the pages
 * @param validating if schema validation should be turned on
 * @param props parquet encoding properties
 */
ParquetRecordWriter(
    ParquetFileWriter w,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    long blockSize,
    CompressionCodecName codec,
    boolean validating,
    ParquetProperties props,
    MemoryManager memoryManager,
    Configuration conf) {
  this.codecFactory = new CodecFactory(conf, props.getPageSizeThreshold());
  internalWriter = new InternalParquetRecordWriter<T>(w, writeSupport, schema,
      extraMetaData, blockSize, codecFactory.getCompressor(codec), validating,
      props);
  this.memoryManager = Objects.requireNonNull(memoryManager, "memoryManager cannot be null");
  memoryManager.addWriter(internalWriter, blockSize);
}
 
Example #17
Source File: TajoParquetWriter.java    From tajo with Apache License 2.0
/**
 * Create a new TajoParquetWriter.
 *
 * @param file The file name to write to.
 * @param schema The Tajo schema of the table.
 * @param compressionCodecName Compression codec to use, or
 *                             CompressionCodecName.UNCOMPRESSED.
 * @param blockSize The block size threshold.
 * @param pageSize See the Parquet write-up. Blocks are subdivided into pages
 *                 for alignment.
 * @param enableDictionary Whether to use a dictionary to compress columns.
 * @param validating Whether to turn on validation.
 * @throws java.io.IOException if there is an error while writing
 */
public TajoParquetWriter(Path file,
                         Schema schema,
                         CompressionCodecName compressionCodecName,
                         int blockSize,
                         int pageSize,
                         boolean enableDictionary,
                         boolean validating) throws IOException {
  super(file,
        new TajoWriteSupport(schema),
        compressionCodecName,
        blockSize,
        pageSize,
        enableDictionary,
        validating);
}
 
Example #18
Source File: TransCompressionCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example #19
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Tests that the checksum is calculated using the compressed version of the data and that
 * checksum verification succeeds
 */
@Test
public void testCompression() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.SNAPPY);

  try (ParquetFileReader reader = getParquetFileReader(path, conf,
    Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage1, snappy(colAPage1Bytes));
    assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);

    DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage2, snappy(colAPage2Bytes));
    assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);

    DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage1, snappy(colBPage1Bytes));
    assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);

    DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage2, snappy(colBPage2Bytes));
    assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
  }
}
 
Example #20
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Tests that we adhere to the checksum calculation specification, namely that the crc is
 * calculated using the compressed concatenation of the repetition levels, definition levels and
 * the actual data. This is done by generating sample data with a nested schema containing nulls
 * (generating non-trivial repetition and definition levels).
 */
@Test
public void testNestedWithNulls() throws IOException {
  Configuration conf = new Configuration();

  // Write out sample file via the non-checksum code path, extract the raw bytes to calculate the
  // reference crc with
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  Path refPath = writeNestedWithNullsSampleParquetFile(conf, false, CompressionCodecName.SNAPPY);

  try (ParquetFileReader refReader = getParquetFileReader(refPath, conf,
    Arrays.asList(colCIdDesc, colDValDesc))) {
    PageReadStore refPageReadStore = refReader.readNextRowGroup();
    byte[] colCIdPageBytes = readNextPage(colCIdDesc, refPageReadStore).getBytes().toByteArray();
    byte[] colDValPageBytes = readNextPage(colDValDesc, refPageReadStore).getBytes().toByteArray();

    // Write out sample file with checksums
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    Path path = writeNestedWithNullsSampleParquetFile(conf, false, CompressionCodecName.SNAPPY);

    try (ParquetFileReader reader = getParquetFileReader(path, conf,
      Arrays.asList(colCIdDesc, colDValDesc))) {
      PageReadStore pageReadStore = reader.readNextRowGroup();

      DataPageV1 colCIdPage = readNextPage(colCIdDesc, pageReadStore);
      assertCrcSetAndCorrect(colCIdPage, snappy(colCIdPageBytes));
      assertCorrectContent(colCIdPage.getBytes().toByteArray(), colCIdPageBytes);

      DataPageV1 colDValPage = readNextPage(colDValDesc, pageReadStore);
      assertCrcSetAndCorrect(colDValPage, snappy(colDValPageBytes));
      assertCorrectContent(colDValPage.getBytes().toByteArray(), colDValPageBytes);
    }
  }
}
 
Example #21
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Write out checksums and verify them on the read path. Tests that crc is set and that we can
 * read back what we wrote if checksums are enabled on both the write and read path.
 */
@Test
public void testWriteOnVerifyOn() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader reader = getParquetFileReader(path, conf,
    Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage1, colAPage1Bytes);
    assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);

    DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage2, colAPage2Bytes);
    assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);

    DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage1, colBPage1Bytes);
    assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);

    DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage2, colBPage2Bytes);
    assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
  }
}
 
Example #22
Source File: DeprecatedParquetOutputFormat.java    From parquet-mr with Apache License 2.0
public RecordWriterWrapper(ParquetOutputFormat<V> realOutputFormat,
    FileSystem fs, JobConf conf, String name, Progressable progress) throws IOException {

  CompressionCodecName codec = getCodec(conf);
  String extension = codec.getExtension() + ".parquet";
  Path file = getDefaultWorkFile(conf, name, extension);

  try {
    realWriter = (ParquetRecordWriter<V>) realOutputFormat.getRecordWriter(conf, file, codec);
  } catch (InterruptedException e) {
    Thread.interrupted();
    throw new IOException(e);
  }
}
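A follow-up note on the file naming above: getExtension() contributes the codec-specific part of the name, so output files typically end in ".gz.parquet" for GZIP or ".snappy.parquet" for SNAPPY, while UNCOMPRESSED has an empty extension and yields plain ".parquet" (assuming the usual CompressionCodecName extension definitions).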
 
Example #23
Source File: ParquetHdfsFileSink.java    From components with Apache License 2.0
@Override
protected void configure(Job job, KV<Void, IndexedRecord> sample) {
    super.configure(job, sample);
    IndexedRecord record = (IndexedRecord) sample.getValue();
    AvroWriteSupport.setSchema(job.getConfiguration(), record.getSchema());
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
}
 
Example #24
Source File: ParquetOutputPlugin.java    From embulk-output-parquet with MIT License
private ParquetWriter<PageReader> createWriter(PluginTask task, Schema schema, int processorIndex)
{
    // In case of using Frankfurt (eu-central-1) with the Signature Version 4 signing process
    System.setProperty(SDKGlobalConfiguration.ENABLE_S3_SIGV4_SYSTEM_PROPERTY, task.getSignature());

    final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
    final boolean addUTF8 = task.getAddUTF8();

    final Path path = new Path(buildPath(task, processorIndex));
    final CompressionCodecName codec = CompressionCodecName.valueOf(task.getCompressionCodec());
    final int blockSize = task.getBlockSize();
    final int pageSize = task.getPageSize();
    final Configuration conf = createConfiguration(task.getExtraConfigurations(), task.getConfigFiles());
    final boolean overwrite = task.getOverwrite();

    ParquetWriter<PageReader> writer = null;
    try {
        EmbulkWriterBuilder builder = new EmbulkWriterBuilder(path, schema, timestampFormatters, addUTF8)
                .withCompressionCodec(codec)
                .withRowGroupSize(blockSize)
                .withPageSize(pageSize)
                .withDictionaryPageSize(pageSize)
                .withConf(conf);

        if (overwrite) {
            builder.withWriteMode(ParquetFileWriter.Mode.OVERWRITE);
        }

        writer = builder.build();
    } catch (IOException e) {
        Throwables.propagate(e);
    }
    return writer;
}
 
Example #25
Source File: ThriftParquetFileReaderWriterFactory.java    From secor with Apache License 2.0
@SuppressWarnings({ "rawtypes", "unchecked" })
public ThriftParquetFileWriter(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
    Path path = new Path(logFilePath.getLogFilePath());
    CompressionCodecName codecName = CompressionCodecName.fromCompressionCodec(codec != null ? codec.getClass() : null);
    topic = logFilePath.getTopic();
    writer = new ThriftParquetWriter(path, thriftUtil.getMessageClass(topic), codecName,
            blockSize, pageSize, enableDictionary, validating);
}
 
Example #26
Source File: CodecFactory.java    From parquet-mr with Apache License 2.0
@Override
public BytesDecompressor getDecompressor(CompressionCodecName codecName) {
  BytesDecompressor decomp = decompressors.get(codecName);
  if (decomp == null) {
    decomp = createDecompressor(codecName);
    decompressors.put(codecName, decomp);
  }
  return decomp;
}
 
Example #27
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0
private void createParquetFile(File file) throws IOException {
  Path path = new Path(file.toURI());
  Configuration configuration = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
  String[] columnPath = {"a", "b"};
  ColumnDescriptor c1 = schema.getColumnDescription(columnPath);

  byte[] bytes1 = { 0, 1, 2, 3};
  byte[] bytes2 = { 2, 3, 4, 5};
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  BinaryStatistics stats = new BinaryStatistics();

  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(c1, 5, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(c1, 7, codec);
  w.writeDataPage(7, 4, BytesInput.from(bytes2), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());
}
 
Example #28
Source File: ParquetFileWriter.java    From presto with Apache License 2.0
public ParquetFileWriter(
        OutputStream outputStream,
        Callable<Void> rollbackAction,
        List<String> columnNames,
        List<Type> fileColumnTypes,
        ParquetWriterOptions parquetWriterOptions,
        int[] fileInputColumnIndexes,
        CompressionCodecName compressionCodecName)
{
    requireNonNull(outputStream, "outputStream is null");

    this.parquetWriter = new ParquetWriter(
            outputStream,
            columnNames,
            fileColumnTypes,
            parquetWriterOptions,
            compressionCodecName);

    this.rollbackAction = requireNonNull(rollbackAction, "rollbackAction is null");
    this.fileInputColumnIndexes = requireNonNull(fileInputColumnIndexes, "fileInputColumnIndexes is null");

    ImmutableList.Builder<Block> nullBlocks = ImmutableList.builder();
    for (Type fileColumnType : fileColumnTypes) {
        BlockBuilder blockBuilder = fileColumnType.createBlockBuilder(null, 1, 0);
        blockBuilder.appendNull();
        nullBlocks.add(blockBuilder.build());
    }
    this.nullBlocks = nullBlocks.build();
}
 
Example #29
Source File: ParquetIO.java    From beam with Apache License 2.0
/** Creates a {@link Sink} for use with {@link FileIO#write}. */
public static Sink sink(Schema schema) {
  return new AutoValue_ParquetIO_Sink.Builder()
      .setJsonSchema(schema.toString())
      .setCompressionCodec(CompressionCodecName.SNAPPY)
      .build();
}
 
Example #30
Source File: TestStatisticsFilter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats,
    long valueCount) {
  return ColumnChunkMetaData.get(ColumnPath.get("double", "column"),
      PrimitiveTypeName.DOUBLE,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      0L, 0L, valueCount, 0L, 0L);
}