org.apache.parquet.io.MessageColumnIO Java Examples

The following examples show how to use org.apache.parquet.io.MessageColumnIO. Each snippet is taken from an open-source project; the source file, project, and license are listed in the header above it.
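Before diving into the examples, here is a minimal end-to-end read sketch showing where MessageColumnIO sits in the API: a ColumnIOFactory turns a MessageType schema into a MessageColumnIO, which in turn produces a RecordReader for each row group. This mirrors the pattern in Examples #2 and #4 below; the class name and file path are placeholders, and error handling is omitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

public class MessageColumnIOReadSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder path; point this at any existing Parquet file.
    Path path = new Path("/tmp/example.parquet");
    try (ParquetFileReader reader = ParquetFileReader.open(
        HadoopInputFile.fromPath(path, new Configuration()))) {
      // The file footer carries the full schema.
      MessageType schema = reader.getFooter().getFileMetaData().getSchema();
      // MessageColumnIO is the root of the column IO tree for that schema.
      MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
      PageReadStore pages;
      while ((pages = reader.readNextRowGroup()) != null) {
        // One RecordReader per row group, assembling Groups via the example converter.
        RecordReader<Group> recordReader =
            columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
        for (long i = 0, rows = pages.getRowCount(); i < rows; i++) {
          System.out.println(recordReader.read());
        }
      }
    }
  }
}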
Example #1
Source File: TestThriftToPigCompatibility.java    From parquet-mr with Apache License 2.0
/**
 * Steps:
 * <ul>
 * <li>write using the Thrift mapping</li>
 * <li>read using the Pig mapping</li>
 * <li>use Elephant Bird to convert from Thrift to Pig</li>
 * <li>check that both transformations give the same result</li>
 * </ul>
 * @param o the object to convert
 * @throws TException if writing the Thrift object fails
 */
public static <T extends TBase<?,?>> void validateSameTupleAsEB(T o) throws TException {
  final ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter();
  @SuppressWarnings("unchecked")
  final Class<T> class1 = (Class<T>) o.getClass();
  final MessageType schema = thriftSchemaConverter.convert(class1);

  final StructType structType = ThriftSchemaConverter.toStructType(class1);
  final ThriftToPig<T> thriftToPig = new ThriftToPig<T>(class1);
  final Schema pigSchema = thriftToPig.toSchema();
  final TupleRecordMaterializer tupleRecordConverter = new TupleRecordMaterializer(schema, pigSchema, true);
  RecordConsumer recordConsumer = new ConverterConsumer(tupleRecordConverter.getRootConverter(), schema);
  final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
  ParquetWriteProtocol p = new ParquetWriteProtocol(new RecordConsumerLoggingWrapper(recordConsumer), columnIO, structType);
  o.write(p);
  final Tuple t = tupleRecordConverter.getCurrentRecord();
  final Tuple expected = thriftToPig.getPigTuple(o);
  assertEquals(expected.toString(), t.toString());
  final MessageType filtered = new PigSchemaConverter().filter(schema, pigSchema);
  assertEquals(schema.toString(), filtered.toString());
}
 
Example #2
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
@SuppressWarnings("deprecation")
private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException {
    List<Group> result = new ArrayList<>();
    String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath();
    Path path = new Path(parquetFile);

    ParquetFileReader fileReader = new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER);
    PageReadStore rowGroup;
    while ((rowGroup = fileReader.readNextRowGroup()) != null) {
        MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
        RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
        long rowCount = rowGroup.getRowCount();
        for (long i = 0; i < rowCount; i++) {
            result.add(recordReader.read());
        }
    }
    fileReader.close();
    assertEquals(expectedSize, result.size());
    return result;
}
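Note: the ParquetFileReader(Configuration, Path, MetadataFilter) constructor used above is deprecated in recent parquet-mr releases, which is what the @SuppressWarnings("deprecation") on the method covers; the HadoopInputFile-based ParquetFileReader.open(...) call shown in Example #4 is the non-deprecated way to open a file.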
 
Example #3
Source File: TupleConsumerPerfTest.java    From parquet-mr with Apache License 2.0
private static void read(PageReadStore columns, String pigSchemaString, String message) throws ParserException {
    System.out.println(message);
    MessageColumnIO columnIO = newColumnFactory(pigSchemaString);
    TupleReadSupport tupleReadSupport = new TupleReadSupport();
    Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
    MessageType schema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchemaString));
    ReadContext init = tupleReadSupport.init(null, pigMetaData, schema);
    RecordMaterializer<Tuple> recordConsumer = tupleReadSupport.prepareForRead(null, pigMetaData, schema, init);
    RecordReader<Tuple> recordReader = columnIO.getRecordReader(columns, recordConsumer);
    // TODO: put this back
//  if (DEBUG) {
//    recordConsumer = new RecordConsumerLoggingWrapper(recordConsumer);
//  }
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 100000, pigSchemaString);
    read(recordReader, 1000000, pigSchemaString);
    System.out.println();
  }
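The repeated read(...) calls with increasing record counts follow the usual micro-benchmark pattern: the small initial passes let the JIT warm up before the larger 100,000- and 1,000,000-record passes measure steady-state record assembly.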
 
Example #4
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load linear regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLinRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readLinRegInterceptor(g);
                coefficients = readLinRegCoefficients(g);
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LinearRegressionModel(coefficients, interceptor);
}
 
Example #5
Source File: TestParquetReadProtocol.java    From parquet-mr with Apache License 2.0
private <T extends TBase<?,?>> void validate(T expected) throws TException {
  @SuppressWarnings("unchecked")
  final Class<T> thriftClass = (Class<T>)expected.getClass();
  final MemPageStore memPageStore = new MemPageStore(1);
  final ThriftSchemaConverter schemaConverter = new ThriftSchemaConverter();
  final MessageType schema = schemaConverter.convert(thriftClass);
  LOG.info("{}", schema);
  final MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  final ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(memPageStore,
      ParquetProperties.builder()
          .withPageSize(10000)
          .withDictionaryEncoding(false)
          .build());
  final RecordConsumer recordWriter = columnIO.getRecordWriter(columns);
  final StructType thriftType = schemaConverter.toStructType(thriftClass);
  ParquetWriteProtocol parquetWriteProtocol = new ParquetWriteProtocol(recordWriter, columnIO, thriftType);

  expected.write(parquetWriteProtocol);
  recordWriter.flush();
  columns.flush();

  ThriftRecordConverter<T> converter = new TBaseRecordConverter<T>(thriftClass, schema, thriftType);
  final RecordReader<T> recordReader = columnIO.getRecordReader(memPageStore, converter);

  final T result = recordReader.read();

  assertEquals(expected, result);
}
 
Example #6
Source File: TestParquetWriteProtocol.java    From parquet-mr with Apache License 2.0
private void validateThrift(String[] expectations, TBase<?, ?> a)
      throws TException {
    final ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter();
//      System.out.println(a);
    final Class<TBase<?,?>> class1 = (Class<TBase<?,?>>)a.getClass();
    final MessageType schema = thriftSchemaConverter.convert(class1);
    LOG.info("{}", schema);
    final StructType structType = thriftSchemaConverter.toStructType(class1);
    ExpectationValidatingRecordConsumer recordConsumer = new ExpectationValidatingRecordConsumer(new ArrayDeque<String>(Arrays.asList(expectations)));
    final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
    ParquetWriteProtocol p = new ParquetWriteProtocol(new RecordConsumerLoggingWrapper(recordConsumer), columnIO, structType);
    a.write(p);
  }
 
Example #7
Source File: InternalParquetRecordWriter.java    From parquet-mr with Apache License 2.0
private void initStore() {
  ColumnChunkPageWriteStore columnChunkPageWriteStore = new ColumnChunkPageWriteStore(compressor,
      schema, props.getAllocator(), props.getColumnIndexTruncateLength(), props.getPageWriteChecksumEnabled());
  pageStore = columnChunkPageWriteStore;
  bloomFilterWriteStore = columnChunkPageWriteStore;

  columnStore = props.newColumnWriteStore(schema, pageStore, bloomFilterWriteStore);
  MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema);
  this.recordConsumer = columnIO.getRecordWriter(columnStore);
  writeSupport.prepareForWrite(recordConsumer);
}
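Example #7 shows the internal writer wiring; the write side of MessageColumnIO can also be exercised directly. The sketch below writes one record through getRecordWriter and reads it back through getRecordReader, reusing the in-memory MemPageStore / ColumnWriteStoreV1 setup from Example #5 (MemPageStore ships with parquet-column's test artifacts). The class name, schema string, and values are illustrative only.

import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.impl.ColumnWriteStoreV1;
import org.apache.parquet.column.page.mem.MemPageStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.GroupWriter;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordConsumer;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MessageColumnIORoundTripSketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int32 id; required binary name (UTF8); }");
    MemPageStore pageStore = new MemPageStore(1);
    ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(pageStore,
        ParquetProperties.builder()
            .withPageSize(10000)
            .withDictionaryEncoding(false)
            .build());
    MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);

    // Write one record through the RecordConsumer produced by MessageColumnIO.
    RecordConsumer recordWriter = columnIO.getRecordWriter(columns);
    Group group = new SimpleGroupFactory(schema).newGroup()
        .append("id", 7)
        .append("name", "seven");
    new GroupWriter(recordWriter, schema).write(group);
    recordWriter.flush();
    columns.flush();

    // Read the record back from the same in-memory page store.
    RecordReader<Group> recordReader =
        columnIO.getRecordReader(pageStore, new GroupRecordConverter(schema));
    System.out.println(recordReader.read());
  }
}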
 
Example #8
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0
private void checkRead() throws IOException {
  if (current == totalCountLoadedSoFar) {
    if (current != 0) {
      totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
      if (LOG.isInfoEnabled()) {
          LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount + " columns in " + totalTimeSpentProcessingRecords + " ms: "+((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, " + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
          final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
          if (totalTime != 0) {
              final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
              final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
              LOG.info("time spent so far " + percentReading + "% reading ("+totalTimeSpentReadingBytes+" ms) and " + percentProcessing + "% processing ("+totalTimeSpentProcessingRecords+" ms)");
          }
      }
    }

    LOG.info("at row " + current + ". reading next block");
    long t0 = System.currentTimeMillis();
    PageReadStore pages = reader.readNextFilteredRowGroup();
    if (pages == null) {
      throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
    }
    long timeSpentReading = System.currentTimeMillis() - t0;
    totalTimeSpentReadingBytes += timeSpentReading;
    BenchmarkCounter.incrementTime(timeSpentReading);
    if (LOG.isInfoEnabled()) LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount());
    LOG.debug("initializing Record assembly with requested schema {}", requestedSchema);
    MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
    recordReader = columnIO.getRecordReader(pages, recordConverter,
        filterRecords ? filter : FilterCompat.NOOP);
    startedAssemblingCurrentBlockAt = System.currentTimeMillis();
    totalCountLoadedSoFar += pages.getRowCount();
    ++ currentBlock;
  }
}
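Two layers of filtering are visible here: readNextFilteredRowGroup() can skip whole pages up front based on column indexes, while the FilterCompat filter passed to getRecordReader (or FilterCompat.NOOP when filterRecords is false) drops individual records during assembly.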
 
Example #9
Source File: ParquetRecordReader.java    From flink with Apache License 2.0
private RecordReader<T> createRecordReader(PageReadStore pages) throws IOException {
	if (pages == null) {
		throw new IOException(
			"Expecting more rows but reached last block. Read " + numReadRecords + " out of " + numTotalRecords);
	}
	MessageColumnIO columnIO = columnIOFactory.getColumnIO(readSchema, fileSchema, true);
	return columnIO.getRecordReader(pages, recordMaterializer, filter);
}
 
Example #10
Source File: InternalParquetRecordReader.java    From tajo with Apache License 2.0
private void checkRead() throws IOException {
  if (current == totalCountLoadedSoFar) {
    if (current != 0) {
      totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
      if (Log.DEBUG) {
        LOG.debug("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount + " columns in " + totalTimeSpentProcessingRecords + " ms: "+((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, " + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
        final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
        if (totalTime != 0) {
          final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
          final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
          LOG.debug("time spent so far " + percentReading + "% reading ("+totalTimeSpentReadingBytes+" ms) and " + percentProcessing + "% processing ("+totalTimeSpentProcessingRecords+" ms)");
        }
      }
    }

    if (Log.DEBUG) LOG.debug("at row " + current + ". reading next block");
    long t0 = System.currentTimeMillis();
    PageReadStore pages = reader.readNextRowGroup();
    if (pages == null) {
      throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
    }
    long timeSpentReading = System.currentTimeMillis() - t0;
    totalTimeSpentReadingBytes += timeSpentReading;
    BenchmarkCounter.incrementTime(timeSpentReading);
    if (Log.INFO) LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
    if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
    MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
    recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
    startedAssemblingCurrentBlockAt = System.currentTimeMillis();
    totalCountLoadedSoFar += pages.getRowCount();
    ++ currentBlock;
  }
}
 
Example #11
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLogRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readInterceptor(g);
                coefficients = readCoefficients(g);
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LogisticRegressionModel(coefficients, interceptor);
}
 
Example #12
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load SVM model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLinearSVMModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readSVMInterceptor(g);
                coefficients = readSVMCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new SVMLinearClassificationModel(coefficients, interceptor);
}
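Examples #4, #11, and #12 share the same scaffolding; they differ only in the helper methods (readLinRegInterceptor/readInterceptor/readSVMInterceptor and the matching coefficient readers) that extract values from each SimpleGroup.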
 
Example #13
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadDecisionTreeModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }
        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
 
Example #14
Source File: TupleConsumerPerfTest.java    From parquet-mr with Apache License 2.0
private static void write(MemPageStore memPageStore, ColumnWriteStoreV1 columns, MessageType schema, String pigSchemaString) throws ExecException, ParserException {
  MessageColumnIO columnIO = newColumnFactory(pigSchemaString);
  TupleWriteSupport tupleWriter = TupleWriteSupport.fromPigSchema(pigSchemaString);
  tupleWriter.init(null);
  tupleWriter.prepareForWrite(columnIO.getRecordWriter(columns));
  write(memPageStore, tupleWriter, 10000);
  write(memPageStore, tupleWriter, 10000);
  write(memPageStore, tupleWriter, 10000);
  write(memPageStore, tupleWriter, 10000);
  write(memPageStore, tupleWriter, 10000);
  write(memPageStore, tupleWriter, 100000);
  write(memPageStore, tupleWriter, 1000000);
  System.out.println();
}
 
Example #15
Source File: ParquetRecordWriter.java    From dremio-oss with Apache License 2.0
private void newSchema() throws IOException {
  // Reset it to half of current number and bound it within the limits
  recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCountForNextMemCheck / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);

  String json = new Schema(batchSchema).toJson();
  extraMetaData.put(DREMIO_ARROW_SCHEMA_2_1, json);
  schema = getParquetMessageType(batchSchema, "root");

  int dictionarySize = (int)context.getOptions().getOption(ExecConstants.PARQUET_DICT_PAGE_SIZE_VALIDATOR);
  final ParquetProperties parquetProperties = ParquetProperties.builder()
    .withDictionaryPageSize(dictionarySize)
    .withWriterVersion(writerVersion)
    .withValuesWriterFactory(new DefaultV1ValuesWriterFactory())
    .withDictionaryEncoding(enableDictionary)
    .withAllocator(new ParquetDirectByteBufferAllocator(columnEncoderAllocator))
    .withPageSize(pageSize)
    .withAddPageHeadersToMetadata(true)
    .withEnableDictionarForBinaryType(enableDictionaryForBinary)
    .withPageRowCountLimit(Integer.MAX_VALUE) // Bug 16118
    .build();
  pageStore = ColumnChunkPageWriteStoreExposer.newColumnChunkPageWriteStore(
      toDeprecatedBytesCompressor(codecFactory.getCompressor(codec)), schema, parquetProperties);
  store = new ColumnWriteStoreV1(pageStore, parquetProperties);
  MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
  consumer = columnIO.getRecordWriter(store);
  setUp(schema, consumer);
}
 
Example #16
Source File: ParquetRecordReader.java    From flink with Apache License 2.0
private RecordReader<T> createRecordReader(PageReadStore pages) throws IOException {
	if (pages == null) {
		throw new IOException(
			"Expecting more rows but reached last block. Read " + numReadRecords + " out of " + numTotalRecords);
	}
	MessageColumnIO columnIO = columnIOFactory.getColumnIO(readSchema, fileSchema, true);
	return columnIO.getRecordReader(pages, recordMaterializer, filter);
}
 
Example #17
Source File: ParquetReader.java    From presto with Apache License 2.0
public ParquetReader(
        Optional<String> fileCreatedBy,
        MessageColumnIO messageColumnIO,
        List<BlockMetaData> blocks,
        ParquetDataSource dataSource,
        AggregatedMemoryContext systemMemoryContext,
        ParquetReaderOptions options)
        throws IOException
{
    this.fileCreatedBy = requireNonNull(fileCreatedBy, "fileCreatedBy is null");
    this.columns = requireNonNull(messageColumnIO, "messageColumnIO is null").getLeaves();
    this.blocks = requireNonNull(blocks, "blocks is null");
    this.dataSource = requireNonNull(dataSource, "dataSource is null");
    this.systemMemoryContext = requireNonNull(systemMemoryContext, "systemMemoryContext is null");
    this.currentRowGroupMemoryContext = systemMemoryContext.newAggregatedMemoryContext();
    this.options = requireNonNull(options, "options is null");
    this.columnReaders = new PrimitiveColumnReader[columns.size()];
    this.maxBytesPerCell = new long[columns.size()];

    Map<ChunkKey, DiskRange> ranges = new HashMap<>();
    for (int rowGroup = 0; rowGroup < blocks.size(); rowGroup++) {
        BlockMetaData metadata = blocks.get(rowGroup);
        for (PrimitiveColumnIO column : columns) {
            int columnId = column.getId();
            ColumnChunkMetaData chunkMetadata = getColumnChunkMetaData(metadata, column.getColumnDescriptor());
            DiskRange range = new DiskRange(chunkMetadata.getStartingPos(), toIntExact(chunkMetadata.getTotalSize()));
            ranges.put(new ChunkKey(columnId, rowGroup), range);
        }
    }

    this.chunkReaders = dataSource.planRead(ranges);
}
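This example uses MessageColumnIO differently from the others: instead of building a RecordReader, Presto calls getLeaves() to obtain the primitive-column IOs and uses each column's ColumnDescriptor to locate its chunk metadata and plan the exact disk ranges to read per row group.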
 
Example #18
Source File: ParquetRecordWriter.java    From Bats with Apache License 2.0
private void newSchema() throws IOException {
  List<Type> types = Lists.newArrayList();
  for (MaterializedField field : batchSchema) {
    if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
      continue;
    }
    types.add(getType(field));
  }
  schema = new MessageType("root", types);

  // We don't want this number to be too small; ideally we divide the block equally across the columns,
  // though it is unlikely that all columns will end up the same size.
  // Although rowGroupSize is a long, its value is likely below Integer.MAX_VALUE (2GB),
  // so the size is cast to int here: the underlying layer allocates byte arrays, whose
  // length is limited to the int range.
  int initialBlockBufferSize = this.schema.getColumns().size() > 0 ?
      max(MINIMUM_BUFFER_SIZE, blockSize / this.schema.getColumns().size() / 5) : MINIMUM_BUFFER_SIZE;
  // We don't want this number to be too small either. Ideally, slightly bigger than the page size,
  // but not bigger than the block buffer
  int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
  // TODO: Use initialSlabSize from ParquetProperties once drill will be updated to the latest version of Parquet library
  int initialSlabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(64, pageSize, 10);
  // TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore from parquet library
  // once PARQUET-1006 will be resolved
  pageStore = new ParquetColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, initialSlabSize,
      pageSize, new ParquetDirectByteBufferAllocator(oContext));
  ParquetProperties parquetProperties = ParquetProperties.builder()
      .withPageSize(pageSize)
      .withDictionaryEncoding(enableDictionary)
      .withDictionaryPageSize(initialPageBufferSize)
      .withWriterVersion(writerVersion)
      .withAllocator(new ParquetDirectByteBufferAllocator(oContext))
      .withValuesWriterFactory(new DefaultV1ValuesWriterFactory())
      .build();
  store = new ColumnWriteStoreV1(pageStore, parquetProperties);
  MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
  consumer = columnIO.getRecordWriter(store);
  setUp(schema, consumer);
}
 
Example #19
Source File: TupleConsumerPerfTest.java    From parquet-mr with Apache License 2.0
private static MessageColumnIO newColumnFactory(String pigSchemaString) throws ParserException {
  MessageType schema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchemaString));
  return new ColumnIOFactory().getColumnIO(schema);
}
 
Example #20
Source File: ParquetWriteProtocol.java    From parquet-mr with Apache License 2.0
public ParquetWriteProtocol(RecordConsumer recordConsumer, MessageColumnIO schema, StructType thriftType) {
  this.currentProtocol = new MessageWriteProtocol(schema, thriftType);
  this.recordConsumer = recordConsumer;
}
 
Example #21
Source File: ParquetWriteProtocol.java    From parquet-mr with Apache License 2.0
public MessageWriteProtocol(MessageColumnIO schema, StructType thriftType) {
  super(schema, thriftType, null);
}
 
Example #22
Source File: ThriftBytesWriteSupport.java    From parquet-mr with Apache License 2.0
@Override
public void prepareForWrite(RecordConsumer recordConsumer) {
  final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
  this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, columnIO, thriftStruct);
  thriftWriteSupport.prepareForWrite(recordConsumer);
}
 
Example #23
Source File: AbstractThriftWriteSupport.java    From parquet-mr with Apache License 2.0
@Override
public void prepareForWrite(RecordConsumer recordConsumer) {
  final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
  this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, columnIO, thriftStruct);
}
 
Example #24
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load K-Means model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment learningEnvironment
 */
private static Model loadKMeansModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector[] centers = null;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final int rows = (int)pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            centers = new DenseVector[rows];

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                // final int clusterIdx = g.getInteger(0, 0);

                Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);

                final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);

                centers[i] = new DenseVector(amountOfCoefficients);

                for (int j = 0; j < amountOfCoefficients; j++) {
                    double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
                    centers[i].set(j, coefficient);
                }
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new KMeansModel(centers, new EuclideanDistance());
}
 
Example #25
Source File: ParquetTypeUtils.java    From presto with Apache License 2.0
public static MessageColumnIO getColumnIO(MessageType fileSchema, MessageType requestedSchema)
{
    return (new ColumnIOFactory()).getColumnIO(requestedSchema, fileSchema, true);
}
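This three-argument overload resolves the requested (projected) schema against the full file schema, so readers can assemble a subset of the file's columns; passing true enables strict type checking of the requested columns against the types recorded in the file.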