org.apache.parquet.column.ParquetProperties Java Examples

The following examples show how to use org.apache.parquet.column.ParquetProperties. They are taken from open-source projects; the source file, originating project, and license are noted above each example.
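Most of the examples below follow the same pattern: build an immutable ParquetProperties instance through its fluent builder, then hand it to a column write store or record writer. The short, self-contained sketch below illustrates that pattern; the class name, size values, and printed fields are illustrative assumptions, not recommended settings.

import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.ParquetProperties.WriterVersion;

public class ParquetPropertiesSketch {
  public static void main(String[] args) {
    // Illustrative values only; anything not set explicitly falls back to the defaults.
    ParquetProperties props = ParquetProperties.builder()
        .withPageSize(1024 * 1024)            // data page size threshold, in bytes
        .withDictionaryPageSize(512 * 1024)   // dictionary page size threshold, in bytes
        .withDictionaryEncoding(true)         // enable dictionary encoding by default
        .withWriterVersion(WriterVersion.PARQUET_1_0)
        .build();

    // The resulting object is read-only; writers query it for thresholds like these.
    System.out.println("page size threshold: " + props.getPageSizeThreshold());
    System.out.println("dictionary page size threshold: " + props.getDictionaryPageSizeThreshold());
    System.out.println("writer version: " + props.getWriterVersion());
  }
}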
Example #1
Source File: ParquetRecordWriter.java    From parquet-mr with Apache License 2.0
/**
 *
 * @param w the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param blockSize the size of a block in the file (this will be approximate)
 * @param pageSize the size of a page in the file (this will be approximate)
 * @param compressor the compressor used to compress the pages
 * @param dictionaryPageSize the threshold for dictionary size
 * @param enableDictionary to enable the dictionary
 * @param validating if schema validation should be turned on
 * @param writerVersion writer compatibility version
 * @param memoryManager memory manager for the write
 */
@Deprecated
public ParquetRecordWriter(
    ParquetFileWriter w,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    long blockSize, int pageSize,
    BytesCompressor compressor,
    int dictionaryPageSize,
    boolean enableDictionary,
    boolean validating,
    WriterVersion writerVersion,
    MemoryManager memoryManager) {
  ParquetProperties props = ParquetProperties.builder()
      .withPageSize(pageSize)
      .withDictionaryPageSize(dictionaryPageSize)
      .withDictionaryEncoding(enableDictionary)
      .withWriterVersion(writerVersion)
      .build();
  internalWriter = new InternalParquetRecordWriter<T>(w, writeSupport, schema,
      extraMetaData, blockSize, compressor, validating, props);
  this.memoryManager = Objects.requireNonNull(memoryManager, "memoryManager cannot be null");
  memoryManager.addWriter(internalWriter, blockSize);
  this.codecFactory = null;
}
 
Example #2
Source File: TupleConsumerPerfTest.java    From parquet-mr with Apache License 2.0
public static void main(String[] args) throws Exception {
  String pigSchema = pigSchema(false, false);
  String pigSchemaProjected = pigSchema(true, false);
  String pigSchemaNoString = pigSchema(true, true);
  MessageType schema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchema));

  MemPageStore memPageStore = new MemPageStore(0);
  ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(
      memPageStore, ParquetProperties.builder()
          .withPageSize(50*1024*1024)
          .withDictionaryEncoding(false)
          .build());
  write(memPageStore, columns, schema, pigSchema);
  columns.flush();
  read(memPageStore, pigSchema, pigSchemaProjected, pigSchemaNoString);
  System.out.println(columns.getBufferedSize()+" bytes used total");
  System.out.println("max col size: "+columns.maxColMemSize()+" bytes");
}
 
Example #3
Source File: ParquetWriter.java    From parquet-mr with Apache License 2.0
/**
 * Create a new ParquetWriter.
 *
 * @param file the file to create
 * @param mode file creation mode
 * @param writeSupport the implementation to write a record to a RecordConsumer
 * @param compressionCodecName the compression codec to use
 * @param blockSize the block size threshold
 * @param pageSize the page size threshold
 * @param dictionaryPageSize the page size threshold for the dictionary pages
 * @param enableDictionary to turn dictionary encoding on
 * @param validating to turn on validation using the schema
 * @param writerVersion version of parquetWriter from {@link ParquetProperties.WriterVersion}
 * @param conf Hadoop configuration to use while accessing the filesystem
 * @throws IOException if there is an error while writing
 * @deprecated will be removed in 2.0.0
 */
@Deprecated
public ParquetWriter(
    Path file,
    ParquetFileWriter.Mode mode,
    WriteSupport<T> writeSupport,
    CompressionCodecName compressionCodecName,
    int blockSize,
    int pageSize,
    int dictionaryPageSize,
    boolean enableDictionary,
    boolean validating,
    WriterVersion writerVersion,
    Configuration conf) throws IOException {
  this(HadoopOutputFile.fromPath(file, conf),
      mode, writeSupport, compressionCodecName, blockSize,
      validating, conf, MAX_PADDING_SIZE_DEFAULT,
      ParquetProperties.builder()
          .withPageSize(pageSize)
          .withDictionaryPageSize(dictionaryPageSize)
          .withDictionaryEncoding(enableDictionary)
          .withWriterVersion(writerVersion)
          .build());
}
 
Example #4
Source File: ColumnWriteStoreBase.java    From parquet-mr with Apache License 2.0
ColumnWriteStoreBase(
    MessageType schema,
    PageWriteStore pageWriteStore,
    ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
  Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
  for (ColumnDescriptor path : schema.getColumns()) {
    PageWriter pageWriter = pageWriteStore.getPageWriter(path);
    mcolumns.put(path, createColumnWriter(path, pageWriter, null, props));
  }
  this.columns = unmodifiableMap(mcolumns);

  this.rowCountForNextSizeCheck = min(props.getMinRowCountForPageSizeCheck(), props.getPageRowCountLimit());

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      return columns.get(path);
    }
  };
}
 
Example #5
Source File: DefaultValuesWriterFactory.java    From parquet-mr with Apache License 2.0
static DictionaryValuesWriter dictionaryWriter(ColumnDescriptor path, ParquetProperties properties, Encoding dictPageEncoding, Encoding dataPageEncoding) {
  switch (path.getType()) {
    case BOOLEAN:
      throw new IllegalArgumentException("no dictionary encoding for BOOLEAN");
    case BINARY:
      return new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case INT32:
      return new DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case INT64:
      return new DictionaryValuesWriter.PlainLongDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case INT96:
      return new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), 12, dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case DOUBLE:
      return new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case FLOAT:
      return new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case FIXED_LEN_BYTE_ARRAY:
      return new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), path.getTypeLength(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    default:
      throw new IllegalArgumentException("Unknown type " + path.getType());
  }
}
 
Example #6
Source File: ColumnWriteStoreBase.java    From parquet-mr with Apache License 2.0
@Deprecated
ColumnWriteStoreBase(
    final PageWriteStore pageWriteStore,
    final ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);

  this.columns = new TreeMap<>();

  this.rowCountForNextSizeCheck = min(props.getMinRowCountForPageSizeCheck(), props.getPageRowCountLimit());

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      ColumnWriterBase column = columns.get(path);
      if (column == null) {
        column = createColumnWriter(path, pageWriteStore.getPageWriter(path), null, props);
        columns.put(path, column);
      }
      return column;
    }
  };
}
 
Example #7
Source File: TestBloomFiltering.java    From parquet-mr with Apache License 2.0
@BeforeClass
public static void createFile() throws IOException {
  int pageSize = DATA.size() / 100;     // Ensure that several pages will be created
  int rowGroupSize = pageSize * 4;    // Ensure that multiple row groups will be created
  PhoneBookWriter.write(ExampleParquetWriter.builder(FILE_V1)
      .withWriteMode(OVERWRITE)
      .withRowGroupSize(rowGroupSize)
      .withPageSize(pageSize)
      .withBloomFilterNDV("location.lat", 10000L)
      .withBloomFilterNDV("name", 10000L)
      .withBloomFilterNDV("id", 10000L)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0),
    DATA);
  PhoneBookWriter.write(ExampleParquetWriter.builder(FILE_V2)
      .withWriteMode(OVERWRITE)
      .withRowGroupSize(rowGroupSize)
      .withPageSize(pageSize)
      .withBloomFilterNDV("location.lat", 10000L)
      .withBloomFilterNDV("name", 10000L)
      .withBloomFilterNDV("id", 10000L)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0),
    DATA);
}
 
Example #8
Source File: PerfTest.java    From parquet-mr with Apache License 2.0
private static void write(MemPageStore memPageStore) {
  ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(
      memPageStore,
      ParquetProperties.builder()
          .withPageSize(50*1024*1024)
          .withDictionaryEncoding(false)
          .build());
  MessageColumnIO columnIO = newColumnFactory(schema);

  GroupWriter groupWriter = new GroupWriter(columnIO.getRecordWriter(columns), schema);
  groupWriter.write(r1);
  groupWriter.write(r2);

  write(memPageStore, groupWriter, 10000);
  write(memPageStore, groupWriter, 10000);
  write(memPageStore, groupWriter, 10000);
  write(memPageStore, groupWriter, 10000);
  write(memPageStore, groupWriter, 10000);
  write(memPageStore, groupWriter, 100000);
  write(memPageStore, groupWriter, 1000000);
  columns.flush();
  System.out.println();
  System.out.println(columns.getBufferedSize() + " bytes used total");
  System.out.println("max col size: "+columns.maxColMemSize()+" bytes");
}
 
Example #9
Source File: AvroParquetConvertCreator.java    From datacollector with Apache License 2.0
@Override
protected void addNecessaryJarsToJob(Configuration conf) {
  MapreduceUtils.addJarsToJob(conf,
      SemanticVersion.class,
      ParquetWriter.class,
      AvroParquetWriter.class,
      AvroParquetWriterBuilder190Int96.class,
      AvroSchemaConverter190Int96Avro18.class,
      FsInput.class,
      CompressionCodec.class,
      ParquetProperties.class,
      BytesInput.class,
      AvroToParquetConverterUtil.class,
      AvroLogicalTypeSupport.class
  );
}
 
Example #10
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
  for (int i = 0; i < values.length; i++) {
    writer.write(f.newGroup()
            .append("int32_field", values[i]));
  }
  writer.close();
  return file;
}
 
Example #11
Source File: PentahoApacheOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setVersion( VERSION version ) throws Exception {
  inClassloader( () -> {
    ParquetProperties.WriterVersion writerVersion;
    switch ( version ) {
      case VERSION_1_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_1_0;
        break;
      case VERSION_2_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
      default:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
    }
    job.getConfiguration().set( ParquetOutputFormat.WRITER_VERSION, writerVersion.toString() );
  } );
}
 
Example #12
Source File: ParquetRecordWriter.java    From parquet-mr with Apache License 2.0
/**
 *
 * @param w the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param blockSize the size of a block in the file (this will be approximate)
 * @param codec the compression codec used to compress the pages
 * @param validating if schema validation should be turned on
 * @param props parquet encoding properties
 * @param memoryManager memory manager for the write
 * @param conf Hadoop configuration used to create the codec factory
 */
ParquetRecordWriter(
    ParquetFileWriter w,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    long blockSize,
    CompressionCodecName codec,
    boolean validating,
    ParquetProperties props,
    MemoryManager memoryManager,
    Configuration conf) {
  this.codecFactory = new CodecFactory(conf, props.getPageSizeThreshold());
  internalWriter = new InternalParquetRecordWriter<T>(w, writeSupport, schema,
      extraMetaData, blockSize, codecFactory.getCompressor(codec), validating,
      props);
  this.memoryManager = Objects.requireNonNull(memoryManager, "memoryManager cannot be null");
  memoryManager.addWriter(internalWriter, blockSize);
}
 
Example #13
Source File: InternalParquetRecordWriter.java    From parquet-mr with Apache License 2.0
/**
 * @param parquetFileWriter the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param rowGroupSize the size of a block in the file (this will be approximate)
 * @param compressor the codec used to compress
 * @param validating if schema validation should be turned on
 * @param props parquet encoding properties
 */
public InternalParquetRecordWriter(
    ParquetFileWriter parquetFileWriter,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    long rowGroupSize,
    BytesCompressor compressor,
    boolean validating,
    ParquetProperties props) {
  this.parquetFileWriter = parquetFileWriter;
  this.writeSupport = Objects.requireNonNull(writeSupport, "writeSupport cannot be null");
  this.schema = schema;
  this.extraMetaData = extraMetaData;
  this.rowGroupSize = rowGroupSize;
  this.rowGroupSizeThreshold = rowGroupSize;
  this.nextRowGroupSize = rowGroupSizeThreshold;
  this.compressor = compressor;
  this.validating = validating;
  this.props = props;
  initStore();
}
 
Example #14
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * FOR TESTING ONLY. This supports testing block padding behavior on the local FS.
 *
 * @param configuration Hadoop configuration
 * @param schema the schema of the data
 * @param file the file to write to
 * @param rowAndBlockSize the row group size, also used as the file block size
 * @param maxPaddingSize the maximum padding
 * @throws IOException if the file can not be created
 */
ParquetFileWriter(Configuration configuration, MessageType schema,
                  Path file, long rowAndBlockSize, int maxPaddingSize)
    throws IOException {
  FileSystem fs = file.getFileSystem(configuration);
  this.schema = schema;
  this.alignment = PaddingAlignment.get(
      rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
  this.out = HadoopStreams.wrap(
      fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize));
  this.encodingStatsBuilder = new EncodingStats.Builder();
  // no truncation is needed for testing
  this.columnIndexTruncateLength = Integer.MAX_VALUE;
  this.pageWriteChecksumEnabled = ParquetOutputFormat.getPageWriteChecksumEnabled(configuration);
  this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
  this.metadataConverter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
}
 
Example #15
Source File: ParquetWriter.java    From parquet-mr with Apache License 2.0
ParquetWriter(
    OutputFile file,
    ParquetFileWriter.Mode mode,
    WriteSupport<T> writeSupport,
    CompressionCodecName compressionCodecName,
    int rowGroupSize,
    boolean validating,
    Configuration conf,
    int maxPaddingSize,
    ParquetProperties encodingProps) throws IOException {

  WriteSupport.WriteContext writeContext = writeSupport.init(conf);
  MessageType schema = writeContext.getSchema();

  ParquetFileWriter fileWriter = new ParquetFileWriter(
    file, schema, mode, rowGroupSize, maxPaddingSize,
    encodingProps.getColumnIndexTruncateLength(), encodingProps.getStatisticsTruncateLength(),
    encodingProps.getPageWriteChecksumEnabled());
  fileWriter.start();

  this.codecFactory = new CodecFactory(conf, encodingProps.getPageSizeThreshold());
  CodecFactory.BytesCompressor compressor = codecFactory.getCompressor(compressionCodecName);
  this.writer = new InternalParquetRecordWriter<T>(
      fileWriter,
      writeSupport,
      schema,
      writeContext.getExtraMetaData(),
      rowGroupSize,
      compressor,
      validating,
      encodingProps);
}
 
Example #16
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private String createParquetFile(Configuration conf, Map<String, String> extraMeta, int numRecord, String prefix, String codec,
                                       ParquetProperties.WriterVersion writerVersion, int pageSize, TestDocs testDocs) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, BINARY, "Backward"),
      new PrimitiveType(REPEATED, BINARY, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file))
    .withConf(conf)
    .withWriterVersion(writerVersion)
    .withExtraMetaData(extraMeta)
    .withDictionaryEncoding("DocId", true)
    .withValidation(true)
    .enablePageWriteChecksum()
    .withPageSize(pageSize)
    .withCompressionCodec(CompressionCodecName.valueOf(codec));
  try (ParquetWriter writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", testDocs.docId[i]);
      g.add("Name", testDocs.name[i]);
      g.add("Gender", testDocs.gender[i]);
      Group links = g.addGroup("Links");
      links.add(0, testDocs.linkBackward[i]);
      links.add(1, testDocs.linkForward[i]);
      writer.write(g);
    }
  }

  return file;
}
 
Example #17
Source File: TestParquetReadProtocol.java    From parquet-mr with Apache License 2.0
private <T extends TBase<?,?>> void validate(T expected) throws TException {
  @SuppressWarnings("unchecked")
  final Class<T> thriftClass = (Class<T>)expected.getClass();
  final MemPageStore memPageStore = new MemPageStore(1);
  final ThriftSchemaConverter schemaConverter = new ThriftSchemaConverter();
  final MessageType schema = schemaConverter.convert(thriftClass);
  LOG.info("{}", schema);
  final MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  final ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(memPageStore,
      ParquetProperties.builder()
          .withPageSize(10000)
          .withDictionaryEncoding(false)
          .build());
  final RecordConsumer recordWriter = columnIO.getRecordWriter(columns);
  final StructType thriftType = schemaConverter.toStructType(thriftClass);
  ParquetWriteProtocol parquetWriteProtocol = new ParquetWriteProtocol(recordWriter, columnIO, thriftType);

  expected.write(parquetWriteProtocol);
  recordWriter.flush();
  columns.flush();

  ThriftRecordConverter<T> converter = new TBaseRecordConverter<T>(thriftClass, schema, thriftType);
  final RecordReader<T> recordReader = columnIO.getRecordReader(memPageStore, converter);

  final T result = recordReader.read();

  assertEquals(expected, result);
}
 
Example #18
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private void testInternal(String srcCodec, String destCodec, ParquetProperties.WriterVersion writerVersion, int pageSize) throws Exception {
  int numRecord = 1000;
  TestDocs testDocs = new TestDocs(numRecord);
  String inputFile = createParquetFile(conf, extraMeta, numRecord, "input", srcCodec, writerVersion, pageSize, testDocs);
  String outputFile = createTempFile("output_trans");

  convertCompression(conf, inputFile, outputFile, destCodec);

  validateColumns(outputFile, numRecord, testDocs);
  validMeta(inputFile, outputFile);
  validColumnIndex(inputFile, outputFile);
}
 
Example #19
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
@Test
public void testStatistics() throws IOException {
  File file = folder.newFile("test_file.parquet");
  file.delete();

  System.out.println(String.format("RANDOM SEED: %s", RANDOM_SEED));

  Random random = new Random(RANDOM_SEED);

  int blockSize = (random.nextInt(54) + 10) * MEGABYTE;
  int pageSize = (random.nextInt(10) + 1) * MEGABYTE;

  List<DataContext> contexts = Arrays.asList(
      new DataContext(random.nextLong(), file, blockSize,
          pageSize, false, ParquetProperties.WriterVersion.PARQUET_1_0),
      new DataContext(random.nextLong(), file, blockSize,
          pageSize, true, ParquetProperties.WriterVersion.PARQUET_1_0),
      new DataContext(random.nextLong(), file, blockSize,
          pageSize, false, ParquetProperties.WriterVersion.PARQUET_2_0),
      new DataContext(random.nextLong(), file, blockSize,
          pageSize, true, ParquetProperties.WriterVersion.PARQUET_2_0)
  );

  for (DataContext test : contexts) {
    DataGenerationContext.writeAndTest(test);
  }
}
 
Example #20
Source File: ColumnWriteStoreBase.java    From parquet-mr with Apache License 2.0
ColumnWriteStoreBase(
  MessageType schema,
  PageWriteStore pageWriteStore,
  BloomFilterWriteStore bloomFilterWriteStore,
  ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
  Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
  for (ColumnDescriptor path : schema.getColumns()) {
    PageWriter pageWriter = pageWriteStore.getPageWriter(path);
    if (props.isBloomFilterEnabled(path)) {
      BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path);
      mcolumns.put(path, createColumnWriter(path, pageWriter, bloomFilterWriter, props));
    } else {
      mcolumns.put(path, createColumnWriter(path, pageWriter, null, props));
    }
  }
  this.columns = unmodifiableMap(mcolumns);

  this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      return columns.get(path);
    }
  };
}
 
Example #21
Source File: TestSimpleRecordConverter.java    From parquet-mr with Apache License 2.0
private void createTestParquetFile() throws IOException {
  Path fsPath = new Path(testFile().getPath());
  Configuration conf = new Configuration();

  MessageType schema = createSchema();
  SimpleGroupFactory fact = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, conf);

  try (
    ParquetWriter<Group> writer = new ParquetWriter<>(
      fsPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      conf)) {
    writer.write(fact.newGroup()
     .append(INT32_FIELD, 32)
     .append(INT64_FIELD, 64L)
     .append(FLOAT_FIELD, 1.0f)
     .append(DOUBLE_FIELD, 2.0d)
     .append(BINARY_FIELD, Binary.fromString("foobar"))
     .append(FIXED_LEN_BYTE_ARRAY_FIELD,
       Binary.fromConstantByteArray(new byte[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 })));
  }
}
 
Example #22
Source File: ParquetRecordWriter.java    From Bats with Apache License 2.0
private void newSchema() throws IOException {
  List<Type> types = Lists.newArrayList();
  for (MaterializedField field : batchSchema) {
    if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
      continue;
    }
    types.add(getType(field));
  }
  schema = new MessageType("root", types);

  // We don't want this number to be too small; ideally we divide the block equally across the columns.
  // It is unlikely all columns are going to be the same size.
  // Its value is likely below Integer.MAX_VALUE (2GB), although rowGroupSize is a long type.
  // Therefore this size is cast to int, since the underlying byte array allocation
  // limits the array size to the int range.
  int initialBlockBufferSize = this.schema.getColumns().size() > 0 ?
      max(MINIMUM_BUFFER_SIZE, blockSize / this.schema.getColumns().size() / 5) : MINIMUM_BUFFER_SIZE;
  // We don't want this number to be too small either. Ideally, slightly bigger than the page size,
  // but not bigger than the block buffer
  int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
  // TODO: Use initialSlabSize from ParquetProperties once Drill is updated to the latest version of the Parquet library
  int initialSlabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(64, pageSize, 10);
  // TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore from parquet library
  // once PARQUET-1006 is resolved
  pageStore = new ParquetColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, initialSlabSize,
      pageSize, new ParquetDirectByteBufferAllocator(oContext));
  ParquetProperties parquetProperties = ParquetProperties.builder()
      .withPageSize(pageSize)
      .withDictionaryEncoding(enableDictionary)
      .withDictionaryPageSize(initialPageBufferSize)
      .withWriterVersion(writerVersion)
      .withAllocator(new ParquetDirectByteBufferAllocator(oContext))
      .withValuesWriterFactory(new DefaultV1ValuesWriterFactory())
      .build();
  store = new ColumnWriteStoreV1(pageStore, parquetProperties);
  MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
  consumer = columnIO.getRecordWriter(store);
  setUp(schema, consumer);
}
 
Example #23
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0
private ColumnWriteStoreV1 newColumnWriteStoreImpl(MemPageStore memPageStore) {
  return new ColumnWriteStoreV1(memPageStore,
      ParquetProperties.builder()
          .withPageSize(2048)
          .withDictionaryEncoding(false)
          .build());
}
 
Example #24
Source File: DefaultValuesWriterFactoryTest.java    From parquet-mr with Apache License 2.0
private ValuesWriterFactory getDefaultFactory(WriterVersion writerVersion, boolean enableDictionary, boolean enableByteStreamSplit) {
  ValuesWriterFactory factory = new DefaultValuesWriterFactory();
  ParquetProperties.builder()
    .withDictionaryEncoding(enableDictionary)
    .withByteStreamSplitEncoding(enableByteStreamSplit)
    .withWriterVersion(writerVersion)
    .withValuesWriterFactory(factory)
    .build();

  return factory;
}
 
Example #25
Source File: DefaultValuesWriterFactoryTest.java    From parquet-mr with Apache License 2.0
private ValuesWriterFactory getDefaultFactory(WriterVersion writerVersion, boolean dictEnabledDefault, String... dictInverseColumns) {
  ValuesWriterFactory factory = new DefaultValuesWriterFactory();
  ParquetProperties.Builder builder = ParquetProperties.builder()
      .withDictionaryEncoding(dictEnabledDefault)
      .withWriterVersion(writerVersion)
      .withValuesWriterFactory(factory);
  for (String column : dictInverseColumns) {
    builder.withDictionaryEncoding(column, !dictEnabledDefault);
  }
  builder.build();

  return factory;
}
 
Example #26
Source File: TestColumnReaderImpl.java    From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  MemPageWriter pageWriter = new MemPageWriter();
  ColumnWriterV2 columnWriterV2 = new ColumnWriterV2(col, pageWriter,
      ParquetProperties.builder()
          .withDictionaryPageSize(1024).withWriterVersion(PARQUET_2_0)
          .withPageSize(2048).build());
  for (int i = 0; i < rows; i++) {
    columnWriterV2.write(Binary.fromString("bar" + i % 10), 0, 0);
    if ((i + 1) % 1000 == 0) {
      columnWriterV2.writePage();
    }
  }
  columnWriterV2.writePage();
  columnWriterV2.finalizeColumnChunk();
  List<DataPage> pages = pageWriter.getPages();
  int valueCount = 0;
  int rowCount = 0;
  for (DataPage dataPage : pages) {
    valueCount += dataPage.getValueCount();
    rowCount += ((DataPageV2)dataPage).getRowCount();
  }
  assertEquals(rows, rowCount);
  assertEquals(rows, valueCount);
  MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader columnReader = new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  for (int i = 0; i < rows; i++) {
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    columnReader.writeCurrentValueToConverter();
    columnReader.consume();
  }
  assertEquals(rows, converter.count);
}
 
Example #27
Source File: TestColumnReaderImpl.java    From parquet-mr with Apache License 2.0
@Test
public void testOptional() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { optional binary foo; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  MemPageWriter pageWriter = new MemPageWriter();
  ColumnWriterV2 columnWriterV2 = new ColumnWriterV2(col, pageWriter,
      ParquetProperties.builder()
          .withDictionaryPageSize(1024).withWriterVersion(PARQUET_2_0)
          .withPageSize(2048).build());
  for (int i = 0; i < rows; i++) {
    columnWriterV2.writeNull(0, 0);
    if ((i + 1) % 1000 == 0) {
      columnWriterV2.writePage();
    }
  }
  columnWriterV2.writePage();
  columnWriterV2.finalizeColumnChunk();
  List<DataPage> pages = pageWriter.getPages();
  int valueCount = 0;
  int rowCount = 0;
  for (DataPage dataPage : pages) {
    valueCount += dataPage.getValueCount();
    rowCount += ((DataPageV2)dataPage).getRowCount();
  }
  assertEquals(rows, rowCount);
  assertEquals(rows, valueCount);
  MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader columnReader = new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  for (int i = 0; i < rows; i++) {
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    columnReader.consume();
  }
  assertEquals(0, converter.count);
}
 
Example #28
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
private ColumnWriteStoreV1 newColumnWriteStore(MemPageStore memPageStore) {
  return new ColumnWriteStoreV1(memPageStore,
      ParquetProperties.builder()
          .withPageSize(800)
          .withDictionaryPageSize(800)
          .withDictionaryEncoding(useDictionary)
          .build());
}
 
Example #29
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * @param file OutputFile to create or overwrite
 * @param schema the schema of the data
 * @param mode file creation mode
 * @param rowGroupSize the row group size
 * @param maxPaddingSize the maximum padding
 * @throws IOException if the file can not be created
 * @deprecated will be removed in 2.0.0
 */
@Deprecated
public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode,
                         long rowGroupSize, int maxPaddingSize)
    throws IOException {
  this(file, schema, mode, rowGroupSize, maxPaddingSize,
      ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
    ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH,
      ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED);
}
 
Example #30
Source File: ParquetRecordWriter.java    From parquet-mr with Apache License 2.0
/**
 *
 * @param w the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param blockSize the size of a block in the file (this will be approximate)
 * @param pageSize the size of a page in the file (this will be approximate)
 * @param compressor the compressor used to compress the pages
 * @param dictionaryPageSize the threshold for dictionary size
 * @param enableDictionary to enable the dictionary
 * @param validating if schema validation should be turned on
 * @param writerVersion writer compatibility version
 */
@Deprecated
public ParquetRecordWriter(
    ParquetFileWriter w,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    int blockSize, int pageSize,
    BytesCompressor compressor,
    int dictionaryPageSize,
    boolean enableDictionary,
    boolean validating,
    WriterVersion writerVersion) {
  ParquetProperties props = ParquetProperties.builder()
      .withPageSize(pageSize)
      .withDictionaryPageSize(dictionaryPageSize)
      .withDictionaryEncoding(enableDictionary)
      .withWriterVersion(writerVersion)
      .build();
  internalWriter = new InternalParquetRecordWriter<T>(w, writeSupport, schema,
      extraMetaData, blockSize, compressor, validating, props);
  this.memoryManager = null;
  this.codecFactory = null;
}