org.apache.parquet.column.impl.ColumnWriteStoreV1 Java Examples

The following examples show how to use org.apache.parquet.column.impl.ColumnWriteStoreV1, drawn from open-source projects such as parquet-mr, Bats, and dremio-oss. Each example notes its source file and license.
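Before the individual examples, here is a minimal end-to-end sketch of the pattern most of them share: build a ColumnWriteStoreV1 over a page store, obtain a RecordConsumer via MessageColumnIO, write records, then flush the record writer followed by the column store. This is a sketch rather than verbatim library code; it assumes the in-memory MemPageStore test utility from parquet-column is on the classpath (its constructor argument appears to be the expected record count), and the schema and page size are invented for illustration.

import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.impl.ColumnWriteStoreV1;
import org.apache.parquet.column.page.mem.MemPageStore;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnWriteStoreV1Sketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message doc { required int64 DocId; }");

    // In-memory page store, as used throughout the examples below.
    MemPageStore memPageStore = new MemPageStore(1);

    // A V1 write store buffers values per column and flushes them as pages.
    ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(
        memPageStore,
        ParquetProperties.builder()
            .withPageSize(1024)
            .withDictionaryEncoding(false)
            .build());

    MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
    RecordConsumer recordWriter = columnIO.getRecordWriter(columns);

    // Write one record, then flush the record writer and the column store.
    recordWriter.startMessage();
    recordWriter.startField("DocId", 0);
    recordWriter.addLong(42L);
    recordWriter.endField("DocId", 0);
    recordWriter.endMessage();
    recordWriter.flush();
    columns.flush();
  }
}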
Example #1
Source File: TupleConsumerPerfTest.java    From parquet-mr with Apache License 2.0
public static void main(String[] args) throws Exception {
  String pigSchema = pigSchema(false, false);
  String pigSchemaProjected = pigSchema(true, false);
  String pigSchemaNoString = pigSchema(true, true);
  MessageType schema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchema));

  MemPageStore memPageStore = new MemPageStore(0);
  ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(
      memPageStore, ParquetProperties.builder()
          .withPageSize(50*1024*1024)
          .withDictionaryEncoding(false)
          .build());
  write(memPageStore, columns, schema, pigSchema);
  columns.flush();
  read(memPageStore, pigSchema, pigSchemaProjected, pigSchemaNoString);
  System.out.println(columns.getBufferedSize() + " bytes used total");
  System.out.println("max col size: " + columns.maxColMemSize() + " bytes");
}
 
Example #2
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0
@Test
public void testMemColumn() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message msg { required group foo { required int64 bar; } }");
  ColumnDescriptor path = schema.getColumnDescription(new String[] {"foo", "bar"});
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  columnWriter.write(42L, 0, 0);
  memColumnsStore.endRecord();
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, schema);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    assertEquals(columnReader.getCurrentRepetitionLevel(), 0);
    assertEquals(columnReader.getCurrentDefinitionLevel(), 0);
    assertEquals(columnReader.getLong(), 42);
    columnReader.consume();
  }
}
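In the write(42L, 0, 0) call above, the two trailing arguments are the repetition level and the definition level; for a required, non-repeated column such as foo.bar both are always 0. As a hedged fragment (the optional schema and values are invented, and it assumes ColumnWriter.writeNull(repetitionLevel, definitionLevel) as in current parquet-mr), an optional column instead uses the definition level to mark presence:

MessageType schema = MessageTypeParser.parseMessageType("message msg { optional int64 maybe; }");
ColumnDescriptor path = schema.getColumnDescription(new String[] {"maybe"});
ColumnWriter writer = memColumnsStore.getColumnWriter(path);
writer.write(7L, 0, 1);   // present: definition level equals the column's max level (1)
writer.writeNull(0, 0);   // absent: definition level below the max encodes a null
memColumnsStore.endRecord();
memColumnsStore.flush();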
 
Example #3
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
private MemPageStore writeTestRecords(MessageColumnIO columnIO, int number) {
  MemPageStore memPageStore = new MemPageStore(number * 2);
  ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(
      memPageStore,
      ParquetProperties.builder()
          .withPageSize(800)
          .withDictionaryEncoding(false)
          .build());

  RecordConsumer recordWriter = columnIO.getRecordWriter(columns);
  GroupWriter groupWriter = new GroupWriter(recordWriter, schema);
  for (int i = 0; i < number; i++) {
    groupWriter.write(r1);
    groupWriter.write(r2);
  }
  recordWriter.flush();
  columns.flush();
  return memPageStore;
}
 
Example #4
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0
@Test
public void testMemColumnBinary() throws Exception {
  MessageType mt = MessageTypeParser.parseMessageType("message msg { required group foo { required binary bar; } }");
  String[] col = new String[]{"foo", "bar"};
  MemPageStore memPageStore = new MemPageStore(10);

  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnDescriptor path = mt.getColumnDescription(col);

  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  columnWriter.write(Binary.fromString("42"), 0, 0);
  memColumnsStore.endRecord();
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    assertEquals(columnReader.getCurrentRepetitionLevel(), 0);
    assertEquals(columnReader.getCurrentDefinitionLevel(), 0);
    assertEquals(columnReader.getBinary().toStringUsingUTF8(), "42");
    columnReader.consume();
  }
}
 
Example #5
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0
@Test
public void testMemColumnSeveralPages() throws Exception {
  MessageType mt = MessageTypeParser.parseMessageType("message msg { required group foo { required int64 bar; } }");
  String[] col = new String[]{"foo", "bar"};
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnDescriptor path = mt.getColumnDescription(col);

  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  for (int i = 0; i < 2000; i++) {
    columnWriter.write(42L, 0, 0);
    memColumnsStore.endRecord();
  }
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    assertEquals(columnReader.getCurrentRepetitionLevel(), 0);
    assertEquals(columnReader.getCurrentDefinitionLevel(), 0);
    assertEquals(columnReader.getLong(), 42);
    columnReader.consume();
  }
}
 
Example #6
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
@Test
public void testEmptyField() {
  MemPageStore memPageStore = new MemPageStore(1);
  ColumnWriteStoreV1 columns = newColumnWriteStore(memPageStore);
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  final RecordConsumer recordWriter = columnIO.getRecordWriter(columns);
  recordWriter.startMessage();
  recordWriter.startField("DocId", 0);
  recordWriter.addLong(0);
  recordWriter.endField("DocId", 0);
  recordWriter.startField("Links", 1);
  try {
    recordWriter.endField("Links", 1);
    Assert.fail("expected exception because of empty field");
  } catch (ParquetEncodingException e) {
    Assert.assertEquals("empty fields are illegal, the field should be ommited completely instead", e.getMessage());
  }
}
 
Example #7
Source File: PerfTest.java    From parquet-mr with Apache License 2.0
private static void write(MemPageStore memPageStore) {
  ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(
      memPageStore,
      ParquetProperties.builder()
          .withPageSize(50*1024*1024)
          .withDictionaryEncoding(false)
          .build());
  MessageColumnIO columnIO = newColumnFactory(schema);

  GroupWriter groupWriter = new GroupWriter(columnIO.getRecordWriter(columns), schema);
  groupWriter.write(r1);
  groupWriter.write(r2);

  write(memPageStore, groupWriter, 10000);
  write(memPageStore, groupWriter, 10000);
  write(memPageStore, groupWriter, 10000);
  write(memPageStore, groupWriter, 10000);
  write(memPageStore, groupWriter, 10000);
  write(memPageStore, groupWriter, 100000);
  write(memPageStore, groupWriter, 1000000);
  columns.flush();
  System.out.println();
  System.out.println(columns.getBufferedSize() + " bytes used total");
  System.out.println("max col size: "+columns.maxColMemSize()+" bytes");
}
 
Example #8
Source File: TupleConsumerPerfTest.java    From parquet-mr with Apache License 2.0
private static void write(MemPageStore memPageStore, ColumnWriteStoreV1 columns, MessageType schema, String pigSchemaString) throws ExecException, ParserException {
  MessageColumnIO columnIO = newColumnFactory(pigSchemaString);
  TupleWriteSupport tupleWriter = TupleWriteSupport.fromPigSchema(pigSchemaString);
  tupleWriter.init(null);
  tupleWriter.prepareForWrite(columnIO.getRecordWriter(columns));
  write(memPageStore, tupleWriter, 10000);
  write(memPageStore, tupleWriter, 10000);
  write(memPageStore, tupleWriter, 10000);
  write(memPageStore, tupleWriter, 10000);
  write(memPageStore, tupleWriter, 10000);
  write(memPageStore, tupleWriter, 100000);
  write(memPageStore, tupleWriter, 1000000);
  System.out.println();
}
 
Example #9
Source File: ParquetRecordWriter.java    From Bats with Apache License 2.0
private void newSchema() throws IOException {
  List<Type> types = Lists.newArrayList();
  for (MaterializedField field : batchSchema) {
    if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
      continue;
    }
    types.add(getType(field));
  }
  schema = new MessageType("root", types);

  // We don't want this number to be too small; ideally the block is divided equally across the
  // columns, though in practice they are unlikely to all be the same size.
  // Although rowGroupSize is a long, its value is expected to stay below Integer.MAX_VALUE (2 GB),
  // so it is cast to int here: the underlying byte-array allocation limits array lengths to int range.
  int initialBlockBufferSize = this.schema.getColumns().size() > 0 ?
      max(MINIMUM_BUFFER_SIZE, blockSize / this.schema.getColumns().size() / 5) : MINIMUM_BUFFER_SIZE;
  // This number should not be too small either: ideally slightly larger than the page size,
  // but no larger than the block buffer.
  int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
  // TODO: Use initialSlabSize from ParquetProperties once Drill is updated to the latest version of the Parquet library
  int initialSlabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(64, pageSize, 10);
  // TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore from the parquet
  // library once PARQUET-1006 is resolved
  pageStore = new ParquetColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, initialSlabSize,
      pageSize, new ParquetDirectByteBufferAllocator(oContext));
  ParquetProperties parquetProperties = ParquetProperties.builder()
      .withPageSize(pageSize)
      .withDictionaryEncoding(enableDictionary)
      .withDictionaryPageSize(initialPageBufferSize)
      .withWriterVersion(writerVersion)
      .withAllocator(new ParquetDirectByteBufferAllocator(oContext))
      .withValuesWriterFactory(new DefaultV1ValuesWriterFactory())
      .build();
  store = new ColumnWriteStoreV1(pageStore, parquetProperties);
  MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
  consumer = columnIO.getRecordWriter(store);
  setUp(schema, consumer);
}
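To make the sizing heuristic above concrete with invented numbers: given a 256 MB block and 10 columns, initialBlockBufferSize = max(MINIMUM_BUFFER_SIZE, 256 MB / 10 / 5) ≈ 5.1 MB, and with a 1 MB page size, initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(1 MB + 0.1 MB, 5.1 MB)) = 1.1 MB, i.e. slightly larger than a page but well under the per-column block buffer.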
 
Example #10
Source File: TestParquetReadProtocol.java    From parquet-mr with Apache License 2.0
private <T extends TBase<?,?>> void validate(T expected) throws TException {
  @SuppressWarnings("unchecked")
  final Class<T> thriftClass = (Class<T>)expected.getClass();
  final MemPageStore memPageStore = new MemPageStore(1);
  final ThriftSchemaConverter schemaConverter = new ThriftSchemaConverter();
  final MessageType schema = schemaConverter.convert(thriftClass);
  LOG.info("{}", schema);
  final MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  final ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(memPageStore,
      ParquetProperties.builder()
          .withPageSize(10000)
          .withDictionaryEncoding(false)
          .build());
  final RecordConsumer recordWriter = columnIO.getRecordWriter(columns);
  final StructType thriftType = schemaConverter.toStructType(thriftClass);
  ParquetWriteProtocol parquetWriteProtocol = new ParquetWriteProtocol(recordWriter, columnIO, thriftType);

  expected.write(parquetWriteProtocol);
  recordWriter.flush();
  columns.flush();

  ThriftRecordConverter<T> converter = new TBaseRecordConverter<T>(thriftClass, schema, thriftType);
  final RecordReader<T> recordReader = columnIO.getRecordReader(memPageStore, converter);

  final T result = recordReader.read();

  assertEquals(expected, result);
}
 
Example #11
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
private ColumnWriteStoreV1 newColumnWriteStore(MemPageStore memPageStore) {
  return new ColumnWriteStoreV1(memPageStore,
      ParquetProperties.builder()
          .withPageSize(800)
          .withDictionaryPageSize(800)
          .withDictionaryEncoding(useDictionary)
          .build());
}
 
Example #12
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
@Test
public void testPushParser() {
  MemPageStore memPageStore = new MemPageStore(1);
  ColumnWriteStoreV1 columns = newColumnWriteStore(memPageStore);
  MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
  RecordConsumer recordWriter = columnIO.getRecordWriter(columns);
  new GroupWriter(recordWriter, schema).write(r1);
  recordWriter.flush();
  columns.flush();

  RecordReader<Void> recordReader = columnIO.getRecordReader(memPageStore, new ExpectationValidatingConverter(expectedEventsForR1, schema));
  recordReader.read();
}
 
Example #13
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
private void writeGroups(MessageType writtenSchema, MemPageStore memPageStore, Group... groups) {
  ColumnIOFactory columnIOFactory = new ColumnIOFactory(true);
  ColumnWriteStoreV1 columns = newColumnWriteStore(memPageStore);
  MessageColumnIO columnIO = columnIOFactory.getColumnIO(writtenSchema);
  RecordConsumer recordWriter = columnIO.getRecordWriter(columns);
  GroupWriter groupWriter = new GroupWriter(recordWriter, writtenSchema);
  for (Group group : groups) {
    groupWriter.write(group);
  }
  recordWriter.flush();
  columns.flush();
}
 
Example #14
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0
private ColumnWriteStoreV1 newColumnWriteStoreImpl(MemPageStore memPageStore) {
  return new ColumnWriteStoreV1(memPageStore,
      ParquetProperties.builder()
          .withPageSize(2048)
          .withDictionaryEncoding(false)
          .build());
}
 
Example #15
Source File: ParquetProperties.java    From parquet-mr with Apache License 2.0
public ColumnWriteStore newColumnWriteStore(MessageType schema,
                                            PageWriteStore pageStore,
                                            BloomFilterWriteStore bloomFilterWriteStore) {
  switch (writerVersion) {
  case PARQUET_1_0:
    return new ColumnWriteStoreV1(schema, pageStore, bloomFilterWriteStore, this);
  case PARQUET_2_0:
    return new ColumnWriteStoreV2(schema, pageStore, bloomFilterWriteStore, this);
  default:
    throw new IllegalArgumentException("unknown version " + writerVersion);
  }
}
 
Example #16
Source File: ParquetProperties.java    From parquet-mr with Apache License 2.0
public ColumnWriteStore newColumnWriteStore(MessageType schema,
                                            PageWriteStore pageStore) {
  switch (writerVersion) {
    case PARQUET_1_0:
      return new ColumnWriteStoreV1(schema, pageStore, this);
    case PARQUET_2_0:
      return new ColumnWriteStoreV2(schema, pageStore, this);
    default:
      throw new IllegalArgumentException("unknown version " + writerVersion);
  }
}
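Rather than instantiating ColumnWriteStoreV1 directly, callers can let these two factory methods dispatch on the configured writer version. A brief hedged sketch (schema and pageStore stand in for values from the caller's context):

ParquetProperties props = ParquetProperties.builder()
    .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
    .build();
// Returns a ColumnWriteStoreV1 because the writer version is PARQUET_1_0.
ColumnWriteStore store = props.newColumnWriteStore(schema, pageStore);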
 
Example #17
Source File: ParquetRecordWriter.java    From dremio-oss with Apache License 2.0
private void newSchema() throws IOException {
  // Reset the check threshold to half its current value, bounded by the min/max record-count limits
  recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCountForNextMemCheck / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);

  String json = new Schema(batchSchema).toJson();
  extraMetaData.put(DREMIO_ARROW_SCHEMA_2_1, json);
  schema = getParquetMessageType(batchSchema, "root");

  int dictionarySize = (int)context.getOptions().getOption(ExecConstants.PARQUET_DICT_PAGE_SIZE_VALIDATOR);
  final ParquetProperties parquetProperties = ParquetProperties.builder()
    .withDictionaryPageSize(dictionarySize)
    .withWriterVersion(writerVersion)
    .withValuesWriterFactory(new DefaultV1ValuesWriterFactory())
    .withDictionaryEncoding(enableDictionary)
    .withAllocator(new ParquetDirectByteBufferAllocator(columnEncoderAllocator))
    .withPageSize(pageSize)
    .withAddPageHeadersToMetadata(true)
    .withEnableDictionarForBinaryType(enableDictionaryForBinary)
    .withPageRowCountLimit(Integer.MAX_VALUE) // Bug 16118
    .build();
  pageStore = ColumnChunkPageWriteStoreExposer.newColumnChunkPageWriteStore(
      toDeprecatedBytesCompressor(codecFactory.getCompressor(codec)), schema, parquetProperties);
  store = new ColumnWriteStoreV1(pageStore, parquetProperties);
  MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
  consumer = columnIO.getRecordWriter(store);
  setUp(schema, consumer);
}
 
Example #18
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0
private ColumnWriter getColumnWriter(ColumnDescriptor path, MemPageStore memPageStore) {
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  return memColumnsStore.getColumnWriter(path);
}