Java Code Examples for org.apache.parquet.column.page.PageReader

The following examples show how to use org.apache.parquet.column.page.PageReader. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: parquet-mr   Source File: ColumnReaderBase.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a triplet reader over one column and eagerly loads the column's dictionary page, if any.
 *
 * @param path descriptor of the column being read; must not be null
 * @param pageReader source of the dictionary page and total value count; must not be null
 * @param converter receives the dictionary when it reports dictionary support; must not be null
 * @param writerVersion writer version of the Parquet file being read (stored as given, not null-checked)
 * @throws ParquetDecodingException if the dictionary page cannot be decoded or the total value count is not positive
 */
ColumnReaderBase(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) {
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.pageReader = Objects.requireNonNull(pageReader, "pageReader cannot be null");
  this.converter = Objects.requireNonNull(converter, "converter cannot be null");
  this.writerVersion = writerVersion;
  this.maxDefinitionLevel = path.getMaxDefinitionLevel();

  DictionaryPage dictPage = pageReader.readDictionaryPage();
  if (dictPage == null) {
    // No dictionary page for this column.
    this.dictionary = null;
  } else {
    try {
      this.dictionary = dictPage.getEncoding().initDictionary(path, dictPage);
      // Only hand the dictionary over to converters that say they can use it.
      if (converter.hasDictionarySupport()) {
        converter.setDictionary(dictionary);
      }
    } catch (IOException e) {
      throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
    }
  }

  this.totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount <= 0) {
    throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0");
  }
}
 
Example 2
Source Project: parquet-mr   Source File: TestMemPageStore.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes four pages into an in-memory page store, reads them back, and checks that the
 * value counts of the pages read add up to exactly the total the reader reports.
 *
 * @throws IOException if writing to or reading from the page store fails
 */
@Test
public void test() throws IOException {
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnDescriptor col = new ColumnDescriptor(path , PrimitiveTypeName.INT64, 2, 2);
  LongStatistics stats = new LongStatistics();
  PageWriter pageWriter = memPageStore.getPageWriter(col);
  pageWriter.writePage(BytesInput.from(new byte[735]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
  pageWriter.writePage(BytesInput.from(new byte[743]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
  pageWriter.writePage(BytesInput.from(new byte[743]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
  pageWriter.writePage(BytesInput.from(new byte[735]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
  PageReader pageReader = memPageStore.getPageReader(col);
  long totalValueCount = pageReader.getTotalValueCount();
  System.out.println(totalValueCount);
  // Accumulate as long so the comparison with totalValueCount is never skewed by int overflow.
  long total = 0;
  do {
    DataPage readPage = pageReader.readPage();
    // Guard against a reader that signals exhaustion with null: fail with a clear
    // message instead of a NullPointerException on the next line.
    if (readPage == null) {
      throw new AssertionError(
          "page reader ran out of pages after " + total + " of " + totalValueCount + " values");
    }
    total += readPage.getValueCount();
    System.out.println(readPage);
  } while (total < totalValueCount);
  // The pages read must account for exactly the advertised number of values.
  if (total != totalValueCount) {
    throw new AssertionError(
        "read " + total + " values but the reader reported " + totalValueCount);
  }
}
 
Example 3
Source Project: iceberg   Source File: VectorizedColumnIterator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Points this iterator at the pages of a new row group.
 *
 * @param store page reader passed on to {@code super.setPageSource}
 * @param allPagesDictEncoded whether all pages in the row group are dictionary encoded
 * @return the {@code dictionary} field as it stands after the page source has been set
 */
public Dictionary setRowGroupInfo(PageReader store, boolean allPagesDictEncoded) {
  // The flag must be in place first: setting the page source below can already
  // result in a data page read, which needs to know it in advance.
  vectorizedPageIterator.setAllPagesDictEncoded(allPagesDictEncoded);
  super.setPageSource(store);
  return this.dictionary;
}
 
Example 4
Source Project: iceberg   Source File: BaseColumnIterator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Switches this iterator to a new page source and positions it via {@code advance()}.
 *
 * @param source page reader supplying the total value count, dictionary page and pages
 */
public void setPageSource(PageReader source) {
  // Reset the per-source counters before touching any page.
  this.pageSource = source;
  this.triplesCount = source.getTotalValueCount();
  this.triplesRead = 0L;
  this.advanceNextPageCount = 0L;

  // Clear the page iterator, then give it the dictionary of the new source.
  BasePageIterator iterator = pageIterator();
  iterator.reset();
  this.dictionary = ParquetUtil.readDictionary(desc, pageSource);
  iterator.setDictionary(this.dictionary);

  advance();
}
 
Example 5
Source Project: iceberg   Source File: ParquetUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads and decodes the dictionary page of a column, if it has one.
 *
 * @param desc descriptor of the column the dictionary belongs to
 * @param pageSource page reader to take the dictionary page from
 * @return the decoded dictionary, or {@code null} when the reader returns no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary page fails with an {@code IOException}
 */
public static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }

  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}
 
Example 6
Source Project: iceberg   Source File: ColumnIterator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Switches this iterator to a new page source and positions it via {@code advance()}.
 *
 * @param source page reader supplying the total value count, dictionary page and pages
 */
public void setPageSource(PageReader source) {
  // Reset the per-source counters.
  pageSource = source;
  triplesCount = source.getTotalValueCount();
  triplesRead = 0L;
  advanceNextPageCount = 0L;

  // Clear the page iterator and give it the dictionary of the new source.
  pageIterator.reset();
  Dictionary dict = readDictionary(desc, pageSource);
  pageIterator.setDictionary(dict);

  advance();
}
 
Example 7
Source Project: iceberg   Source File: ColumnIterator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads and decodes the dictionary page of a column, if it has one.
 *
 * @param desc descriptor of the column the dictionary belongs to
 * @param pageSource page reader to take the dictionary page from
 * @return the decoded dictionary, or {@code null} when the reader returns no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary page fails with an {@code IOException}
 */
private static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }

  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}
 
Example 8
Source Project: dremio-oss   Source File: ParquetRecordReaderTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that the next page of the given column is a V1 data page with the expected
 * value count and byte content.
 *
 * @param schema schema used to resolve {@code path} to a column description
 * @param pages store to obtain the column's page reader from
 * @param path path of the column inside {@code schema}
 * @param values expected value count of the page
 * @param bytes expected page bytes
 * @throws IOException if materializing either byte array fails
 */
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes)
    throws IOException {
  PageReader reader = pages.getPageReader(schema.getColumnDescription(path));
  DataPageV1 nextPage = (DataPageV1) reader.readPage();
  assertEquals(values, nextPage.getValueCount());

  byte[] expected = bytes.toByteArray();
  byte[] actual = nextPage.getBytes().toByteArray();
  assertArrayEquals(expected, actual);
}
 
Example 9
Source Project: flink   Source File: AbstractColumnReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Sets up a column reader and eagerly decodes the column's dictionary page, if any.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader source of the dictionary page and the total value count
 * @throws IOException if the dictionary cannot be decoded or the reader reports zero values
 */
public AbstractColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	this.descriptor = descriptor;
	this.pageReader = pageReader;
	this.maxDefLevel = descriptor.getMaxDefinitionLevel();

	DictionaryPage dictPage = pageReader.readDictionaryPage();
	if (dictPage == null) {
		// No dictionary page: start out in the non-dictionary state.
		this.dictionary = null;
		this.isCurrentPageDictionaryEncoded = false;
	} else {
		try {
			this.dictionary = dictPage.getEncoding().initDictionary(descriptor, dictPage);
			this.isCurrentPageDictionaryEncoded = true;
		} catch (IOException e) {
			throw new IOException("could not decode the dictionary for " + descriptor, e);
		}
	}

	// Total number of values in this column (in this row group); zero is rejected.
	if (pageReader.getTotalValueCount() == 0) {
		throw new IOException("totalValueCount == 0");
	}
}
 
Example 10
Source Project: flink   Source File: FixedLenBytesColumnReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a reader that calls {@code checkTypeName} with {@code FIXED_LEN_BYTE_ARRAY}
 * and remembers the given precision.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader passed on to the superclass constructor
 * @param precision precision stored on this reader
 * @throws IOException propagated from the superclass constructor or {@code checkTypeName}
 */
public FixedLenBytesColumnReader(ColumnDescriptor descriptor, PageReader pageReader, int precision)
		throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY);
	this.precision = precision;
}
 
Example 11
Source Project: flink   Source File: TimestampColumnReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a timestamp reader that calls {@code checkTypeName} with {@code INT96}.
 *
 * @param utcTimestamp flag stored on this reader (presumably selects UTC interpretation; confirm at use site)
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader passed on to the superclass constructor
 * @throws IOException propagated from the superclass constructor or {@code checkTypeName}
 */
public TimestampColumnReader(boolean utcTimestamp, ColumnDescriptor descriptor, PageReader pageReader)
		throws IOException {
	super(descriptor, pageReader);
	this.utcTimestamp = utcTimestamp;
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT96);
}
 
Example 12
Source Project: parquet-mr   Source File: ColumnReadStoreImpl.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the column reader for the given column.
 *
 * <p>When the page read store exposes row indexes a {@code SynchronizingColumnReader}
 * is returned; otherwise a plain {@code ColumnReaderImpl}.
 *
 * @param path descriptor of the column to read
 * @return a new reader over the column's pages
 */
@Override
public ColumnReader getColumnReader(ColumnDescriptor path) {
  PrimitiveConverter columnConverter = getPrimitiveConverter(path);
  PageReader pages = pageReadStore.getPageReader(path);
  Optional<PrimitiveIterator.OfLong> indexes = pageReadStore.getRowIndexes();

  if (!indexes.isPresent()) {
    return new ColumnReaderImpl(path, pages, columnConverter, writerVersion);
  }
  return new SynchronizingColumnReader(path, pages, columnConverter, writerVersion, indexes.get());
}
 
Example 13
Source Project: parquet-mr   Source File: SynchronizingColumnReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a column reader driven by the given row indexes and immediately calls {@code consume()}.
 *
 * @param path descriptor of the column being read
 * @param pageReader page reader passed on to the superclass constructor
 * @param converter converter passed on to the superclass constructor
 * @param writerVersion writer version passed on to the superclass constructor
 * @param rowIndexes iterator of row indexes stored on this reader
 */
SynchronizingColumnReader(
    ColumnDescriptor path,
    PageReader pageReader,
    PrimitiveConverter converter,
    ParsedVersion writerVersion,
    PrimitiveIterator.OfLong rowIndexes) {
  super(path, pageReader, converter, writerVersion);
  this.rowIndexes = rowIndexes;
  // Start from the smallest possible target row before the first consume().
  this.targetRow = Long.MIN_VALUE;
  consume();
}
 
Example 14
Source Project: parquet-mr   Source File: MemPageStore.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a page reader over a copy of the list of pages written so far for the given column.
 *
 * @param descriptor column whose pages should be read
 * @return a new {@code MemPageReader} over the copied page list and the writer's dictionary page
 * @throws UnknownColumnException if no page writer is registered for {@code descriptor}
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  MemPageWriter writer = pageWriters.get(descriptor);
  if (writer == null) {
    throw new UnknownColumnException(descriptor);
  }

  // Copy the writer's page list so the reader iterates over its own list.
  List<DataPage> pageSnapshot = new ArrayList<>(writer.getPages());
  LOG.debug("initialize page reader with {} values and {} pages", writer.getTotalValueCount(), pageSnapshot.size());
  return new MemPageReader(
      writer.getTotalValueCount(),
      pageSnapshot.iterator(),
      writer.getDictionaryPage());
}
 
Example 15
Source Project: parquet-mr   Source File: ColumnChunkPageReadStore.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the page reader registered for the given column.
 *
 * @param path descriptor of the requested column
 * @return the reader registered for {@code path}
 * @throws IllegalArgumentException if no reader is registered for {@code path}
 */
@Override
public PageReader getPageReader(ColumnDescriptor path) {
  final PageReader reader = readers.get(path);
  if (reader != null) {
    return reader;
  }
  throw new IllegalArgumentException(path + " is not in the store: " + readers.keySet() + " " + rowCount);
}
 
Example 16
Source Project: parquet-mr   Source File: TestParquetFileWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that the next page of the given column is a V2 data page with the expected
 * counts, uncompressed size, levels and data bytes.
 *
 * @param schema schema used to resolve {@code path} to a column description
 * @param pages store to obtain the column's page reader from
 * @param path path of the column inside {@code schema}
 * @param values expected value count
 * @param rows expected row count
 * @param nullCount expected null count
 * @param repetition expected repetition-level bytes
 * @param definition expected definition-level bytes
 * @param data expected data bytes
 * @param uncompressedSize expected uncompressed size
 * @throws IOException if materializing any of the page's byte arrays fails
 */
private void validateV2Page(MessageType schema, PageReadStore pages, String[] path, int values, int rows, int nullCount,
                            byte[] repetition, byte[] definition, byte[] data, int uncompressedSize) throws IOException {
  PageReader reader = pages.getPageReader(schema.getColumnDescription(path));
  DataPageV2 nextPage = (DataPageV2) reader.readPage();

  // Scalar properties first ...
  assertEquals(values, nextPage.getValueCount());
  assertEquals(rows, nextPage.getRowCount());
  assertEquals(nullCount, nextPage.getNullCount());
  assertEquals(uncompressedSize, nextPage.getUncompressedSize());

  // ... then the three byte sections of the page.
  assertArrayEquals(repetition, nextPage.getRepetitionLevels().toByteArray());
  assertArrayEquals(definition, nextPage.getDefinitionLevels().toByteArray());
  assertArrayEquals(data, nextPage.getData().toByteArray());
}
 
Example 17
Source Project: parquet-mr   Source File: FileEncodingsIT.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Drains the page reader of the given column into a list.
 *
 * @param pageReadStore store to obtain the column's page reader from
 * @param columnDescriptor column whose pages are collected
 * @return the result of {@code reusableCopy} for every page, in read order
 */
private static List<DataPage> getPageGroupForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  PageReader reader = pageReadStore.getPageReader(columnDescriptor);
  List<DataPage> collected = new ArrayList<>();

  // readPage() returning null ends the loop.
  for (DataPage next = reader.readPage(); next != null; next = reader.readPage()) {
    collected.add(reusableCopy(next));
  }

  return collected;
}
 
Example 18
Source Project: parquet-mr   Source File: TestStatistics.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code validateStatsForPage} on every data page of every column in the schema.
 *
 * @param schema schema whose columns are visited
 * @param store store to obtain each column's page reader from
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    PageReader pages = store.getPageReader(column);
    // The dictionary page is read once per column, before the data pages.
    DictionaryPage dictionary = pages.readDictionaryPage();
    for (DataPage next = pages.readPage(); next != null; next = pages.readPage()) {
      validateStatsForPage(next, dictionary, column);
    }
  }
}
 
Example 19
Source Project: Bats   Source File: ColumnChunkIncReadStore.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the page reader held in {@code columns} for the given column.
 *
 * @param descriptor column to look up
 * @return whatever {@code columns.get(descriptor)} yields (presumably {@code null} for an unknown column; confirm)
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  final PageReader reader = columns.get(descriptor);
  return reader;
}
 
Example 20
Source Project: dremio-oss   Source File: ColumnChunkIncReadStore.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the page reader held in {@code columns} for the given column.
 *
 * @param descriptor column to look up
 * @return whatever {@code columns.get(descriptor)} yields (presumably {@code null} for an unknown column; confirm)
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  final PageReader reader = columns.get(descriptor);
  return reader;
}
 
Example 21
Source Project: flink   Source File: ByteColumnReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a byte column reader that calls {@code checkTypeName} with {@code INT32}.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader passed on to the superclass constructor
 * @throws IOException propagated from the superclass constructor or {@code checkTypeName}
 */
public ByteColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}
 
Example 22
Source Project: flink   Source File: BooleanColumnReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a boolean column reader that calls {@code checkTypeName} with {@code BOOLEAN}.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader passed on to the superclass constructor
 * @throws IOException propagated from the superclass constructor or {@code checkTypeName}
 */
public BooleanColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.BOOLEAN);
}
 
Example 23
Source Project: flink   Source File: LongColumnReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a long column reader that calls {@code checkTypeName} with {@code INT64}.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader passed on to the superclass constructor
 * @throws IOException propagated from the superclass constructor or {@code checkTypeName}
 */
public LongColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT64);
}
 
Example 24
Source Project: flink   Source File: ShortColumnReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a short column reader that calls {@code checkTypeName} with {@code INT32}.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader passed on to the superclass constructor
 * @throws IOException propagated from the superclass constructor or {@code checkTypeName}
 */
public ShortColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}
 
Example 25
Source Project: flink   Source File: DoubleColumnReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a double column reader that calls {@code checkTypeName} with {@code DOUBLE}.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader passed on to the superclass constructor
 * @throws IOException propagated from the superclass constructor or {@code checkTypeName}
 */
public DoubleColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.DOUBLE);
}
 
Example 26
Source Project: flink   Source File: IntColumnReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates an int column reader that calls {@code checkTypeName} with {@code INT32}.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader passed on to the superclass constructor
 * @throws IOException propagated from the superclass constructor or {@code checkTypeName}
 */
public IntColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}
 
Example 27
Source Project: flink   Source File: BytesColumnReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a bytes column reader that calls {@code checkTypeName} with {@code BINARY}.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader passed on to the superclass constructor
 * @throws IOException propagated from the superclass constructor or {@code checkTypeName}
 */
public BytesColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.BINARY);
}
 
Example 28
Source Project: flink   Source File: FloatColumnReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a float column reader that calls {@code checkTypeName} with {@code FLOAT}.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader passed on to the superclass constructor
 * @throws IOException propagated from the superclass constructor or {@code checkTypeName}
 */
public FloatColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.FLOAT);
}
 
Example 29
Source Project: flink   Source File: ParquetSplitReaderUtil.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates the column reader matching the type root of the given logical type.
 *
 * <p>For {@code DECIMAL} the reader is chosen from the column's Parquet primitive type.
 *
 * @param utcTimestamp flag passed on to {@code TimestampColumnReader}
 * @param fieldType logical type of the field; its type root selects the reader
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader handed to the created column reader
 * @return a new column reader for the field
 * @throws IOException propagated from the reader constructors
 * @throws UnsupportedOperationException if the type root, or the primitive type backing a
 *     {@code DECIMAL}, has no matching reader
 */
public static ColumnReader createColumnReader(
		boolean utcTimestamp,
		LogicalType fieldType,
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	switch (fieldType.getTypeRoot()) {
		case BOOLEAN:
			return new BooleanColumnReader(descriptor, pageReader);
		case TINYINT:
			return new ByteColumnReader(descriptor, pageReader);
		case SMALLINT:
			return new ShortColumnReader(descriptor, pageReader);
		case INTEGER:
		case DATE:
		case TIME_WITHOUT_TIME_ZONE:
			return new IntColumnReader(descriptor, pageReader);
		case BIGINT:
			return new LongColumnReader(descriptor, pageReader);
		case FLOAT:
			return new FloatColumnReader(descriptor, pageReader);
		case DOUBLE:
			return new DoubleColumnReader(descriptor, pageReader);
		case CHAR:
		case VARCHAR:
		case BINARY:
		case VARBINARY:
			return new BytesColumnReader(descriptor, pageReader);
		case TIMESTAMP_WITHOUT_TIME_ZONE:
		case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
			return new TimestampColumnReader(utcTimestamp, descriptor, pageReader);
		case DECIMAL:
			return createDecimalColumnReader(fieldType, descriptor, pageReader);
		default:
			throw new UnsupportedOperationException(fieldType + " is not supported now.");
	}
}

/**
 * Picks the reader for a {@code DECIMAL} field from the column's Parquet primitive type.
 * A primitive type without a matching reader is rejected with the same message as an
 * unsupported logical type.
 */
private static ColumnReader createDecimalColumnReader(
		LogicalType fieldType,
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) {
		case INT32:
			return new IntColumnReader(descriptor, pageReader);
		case INT64:
			return new LongColumnReader(descriptor, pageReader);
		case BINARY:
			return new BytesColumnReader(descriptor, pageReader);
		case FIXED_LEN_BYTE_ARRAY:
			return new FixedLenBytesColumnReader(
					descriptor, pageReader, ((DecimalType) fieldType).getPrecision());
		default:
			throw new UnsupportedOperationException(fieldType + " is not supported now.");
	}
}
 
Example 30
Source Project: parquet-mr   Source File: ColumnReadStoreImpl.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a {@code ColumnReaderImpl} for the given column over the supplied page reader.
 *
 * @param path descriptor of the column to read
 * @param pageReader page reader the new column reader reads from
 * @return a new reader using the converter from {@code getPrimitiveConverter(path)} and this store's writer version
 */
private ColumnReaderImpl newMemColumnReader(ColumnDescriptor path, PageReader pageReader) {
  return new ColumnReaderImpl(path, pageReader, getPrimitiveConverter(path), writerVersion);
}