org.apache.parquet.column.page.DataPage Java Examples
The following examples show how to use
org.apache.parquet.column.page.DataPage.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BasePageIterator.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Points this iterator at a new data page and resets its read position.
 *
 * <p>The page is dispatched to the matching {@code initFromPage} overload for its
 * version, after which {@code triplesRead} is reset and {@code hasNext} is recomputed
 * from {@code triplesCount}.
 *
 * @param page the page to read from; a null page is rejected by the precondition check
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // The visitor exists only for version dispatch; its return value is never used.
  DataPage.Visitor<ValuesReader> pageInitializer = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }
  };
  this.page.accept(pageInitializer);

  this.triplesRead = 0;
  this.hasNext = triplesRead < triplesCount;
}
Example #2
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Checks the pages of every column in every row group of {@code file} against
 * {@code expectedValues}.
 *
 * <p>Each row group is compared with the slice of {@code expectedValues} that starts
 * where the previous row group ended and spans the row group's row count. Every
 * column's pages are validated twice: first-to-last and then last-to-first.
 *
 * @param file the file whose row groups and schema are read
 * @param expectedValues the expected values for all row groups, in order
 * @throws IOException declared for the file-reading helpers
 */
public static void validatePages(Path file, List<?> expectedValues) throws IOException {
  List<PageReadStore> rowGroups = readBlocksFromFile(file);
  MessageType schema = readSchemaFromFile(file);

  int firstRow = 0;
  for (int rowGroupID = 0; rowGroupID < rowGroups.size(); rowGroupID++) {
    PageReadStore rowGroup = rowGroups.get(rowGroupID);

    for (ColumnDescriptor column : schema.getColumns()) {
      List<DataPage> pages = getPageGroupForColumn(rowGroup, column);
      DictionaryPage dictionary = reusableCopy(getDictionaryPageForColumn(rowGroup, column));

      int endRow = (int) (firstRow + rowGroup.getRowCount());
      List<?> expectedRowGroupValues = expectedValues.subList(firstRow, endRow);

      validateFirstToLast(rowGroupID, dictionary, pages, column, expectedRowGroupValues);
      validateLastToFirst(rowGroupID, dictionary, pages, column, expectedRowGroupValues);
    }

    firstRow += rowGroup.getRowCount();
  }
}
Example #3
Source File: TestMemPageStore.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test public void test() throws IOException { MemPageStore memPageStore = new MemPageStore(10); ColumnDescriptor col = new ColumnDescriptor(path , PrimitiveTypeName.INT64, 2, 2); LongStatistics stats = new LongStatistics(); PageWriter pageWriter = memPageStore.getPageWriter(col); pageWriter.writePage(BytesInput.from(new byte[735]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN); pageWriter.writePage(BytesInput.from(new byte[743]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN); pageWriter.writePage(BytesInput.from(new byte[743]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN); pageWriter.writePage(BytesInput.from(new byte[735]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN); PageReader pageReader = memPageStore.getPageReader(col); long totalValueCount = pageReader.getTotalValueCount(); System.out.println(totalValueCount); int total = 0; do { DataPage readPage = pageReader.readPage(); total += readPage.getValueCount(); System.out.println(readPage); // TODO: assert } while (total < totalValueCount); }
Example #4
Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Fetches the next page from {@code pageReader} and hands it to the reader method
 * that matches its version ({@code readPageV1} or {@code readPageV2}).
 */
private void readPage() {
  LOG.debug("loading page");
  DataPage page = pageReader.readPage();

  // Version dispatch only; the visitor produces no value.
  DataPage.Visitor<Void> versionDispatcher = new DataPage.Visitor<Void>() {
    @Override
    public Void visit(DataPageV1 v1Page) {
      readPageV1(v1Page);
      return null;
    }

    @Override
    public Void visit(DataPageV2 v2Page) {
      readPageV2(v2Page);
      return null;
    }
  };
  page.accept(versionDispatcher);
}
Example #5
Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Returns the statistics carried by {@code page}, for either page version.
 *
 * <p>The result is an unchecked cast to {@code Statistics<T>}; the caller chooses
 * {@code T}.
 *
 * @param page the data page to take statistics from
 * @param <T> the value type the caller expects the statistics to describe
 * @return the page's statistics, cast to {@code Statistics<T>}
 */
private static <T extends Comparable<T>> Statistics<T> getStatisticsFromPageHeader(DataPage page) {
  DataPage.Visitor<Statistics<T>> statsExtractor = new DataPage.Visitor<Statistics<T>>() {
    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV1 v1Page) {
      Statistics<T> v1Stats = (Statistics<T>) v1Page.getStatistics();
      return v1Stats;
    }

    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV2 v2Page) {
      Statistics<T> v2Stats = (Statistics<T>) v2Page.getStatistics();
      return v2Stats;
    }
  };
  return page.accept(statsExtractor);
}
Example #6
Source File: PageIterator.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Switches this iterator to a new data page.
 *
 * <p>The page is passed to the {@code initFromPage} overload for its version,
 * {@code triplesRead} is reset to zero, and {@code advance()} is called.
 *
 * @param page the page to read from; a null page is rejected by the precondition check
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // Used purely to pick the right initFromPage overload; the result is discarded.
  DataPage.Visitor<ValuesReader> initByVersion = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV1 pageV1) {
      initFromPage(pageV1);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV2 pageV2) {
      initFromPage(pageV2);
      return null;
    }
  };
  this.page.accept(initByVersion);

  this.triplesRead = 0;
  advance();
}
Example #7
Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Builds a validator from the statistics of {@code page}.
 *
 * <p>Stores the statistics' comparator and whether they report a non-null value.
 * {@code min} and {@code max} are taken from the statistics only in that case and
 * are {@code null} otherwise.
 *
 * @param page the page whose statistics are captured
 */
public StatsValidator(DataPage page) {
  Statistics<T> pageStats = getStatisticsFromPageHeader(page);
  this.comparator = pageStats.comparator();
  this.hasNonNull = pageStats.hasNonNullValue();
  this.min = hasNonNull ? pageStats.genericGetMin() : null;
  this.max = hasNonNull ? pageStats.genericGetMax() : null;
}
Example #8
Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Reads every value of {@code page} through a validating converter and checks that
 * the number of nulls seen matches the null count in the page's statistics.
 *
 * <p>A value counts as null when its definition level is below the column's maximum
 * definition level; all other values are written to the converter returned by
 * {@code getValidatingConverter}.
 *
 * @param page the data page to validate
 * @param dict the dictionary page handed to the single-page reader together with {@code page}
 * @param desc the descriptor of the column the page belongs to
 * @throws BadStatsException if the counted nulls differ from the statistics' null count
 */
private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDescriptor desc) {
  SingletonPageReader reader = new SingletonPageReader(dict, page);
  PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
  // Wildcard instead of a raw type: only type-independent accessors are used below.
  Statistics<?> stats = getStatisticsFromPageHeader(page);

  long numNulls = 0;
  ColumnReader column = COL_READER_CTOR.newInstance(desc, reader, converter, null);
  for (int i = 0; i < reader.getTotalValueCount(); i += 1) {
    if (column.getCurrentDefinitionLevel() >= desc.getMaxDefinitionLevel()) {
      column.writeCurrentValueToConverter();
    } else {
      numNulls += 1;
    }
    column.consume();
  }

  if (numNulls != stats.getNumNulls()) {
    throw new BadStatsException("Number of nulls doesn't match.");
  }

  console.debug(String.format(
      "Validated stats min=%s max=%s nulls=%d for page=%s col=%s",
      stats.minAsString(),
      stats.maxAsString(),
      stats.getNumNulls(),
      page,
      Arrays.toString(desc.getPath())));
}
Example #9
Source File: TestStatistics.java From parquet-mr with Apache License 2.0 | 5 votes |
private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDescriptor desc) { SingletonPageReader reader = new SingletonPageReader(dict, page); PrimitiveConverter converter = getValidatingConverter(page, desc.getType()); Statistics<?> stats = getStatisticsFromPageHeader(page); assertEquals("Statistics does not use the proper comparator", desc.getPrimitiveType().comparator().getClass(), stats.comparator().getClass()); if (stats.isEmpty()) { // stats are empty if num nulls = 0 and there are no non-null values // this happens if stats are not written (e.g., when stats are too big) return; } long numNulls = 0; ColumnReaderImpl column = new ColumnReaderImpl(desc, reader, converter, null); for (int i = 0; i < reader.getTotalValueCount(); i += 1) { if (column.getCurrentDefinitionLevel() >= desc.getMaxDefinitionLevel()) { column.writeCurrentValueToConverter(); } else { numNulls += 1; } column.consume(); } Assert.assertEquals(numNulls, stats.getNumNulls()); }
Example #10
Source File: TestStatistics.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Validates the statistics of every data page of every column in {@code schema}.
 *
 * <p>For each column the dictionary page is read once, then data pages are read
 * until {@code readPage()} returns {@code null}.
 *
 * @param schema the schema whose columns are visited
 * @param store the source of page readers for those columns
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    PageReader columnPages = store.getPageReader(column);
    DictionaryPage dictionary = columnPages.readDictionaryPage();
    for (DataPage next = columnPages.readPage(); next != null; next = columnPages.readPage()) {
      validateStatsForPage(next, dictionary, column);
    }
  }
}
Example #11
Source File: TestStatistics.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a validator that remembers the comparator and value bounds found in the
 * statistics of {@code page}.
 *
 * <p>When the statistics report no non-null value, both bounds are left {@code null}.
 *
 * @param page the page whose statistics are captured
 */
public StatsValidator(DataPage page) {
  Statistics<T> headerStats = getStatisticsFromPageHeader(page);
  this.comparator = headerStats.comparator();
  this.hasNonNull = headerStats.hasNonNullValue();
  if (!hasNonNull) {
    this.min = null;
    this.max = null;
  } else {
    this.min = headerStats.genericGetMin();
    this.max = headerStats.genericGetMax();
  }
}
Example #12
Source File: BaseColumnIterator.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Loads further pages into the page iterator once {@code triplesRead} has reached
 * {@code advanceNextPageCount}.
 *
 * <p>Pages are pulled from {@code pageSource} until the page iterator reports a next
 * value or the source returns {@code null}. Each loaded page adds its
 * {@code currentPageCount()} to {@code advanceNextPageCount}.
 */
protected void advance() {
  if (triplesRead < advanceNextPageCount) {
    return;
  }

  BasePageIterator pages = pageIterator();
  while (!pages.hasNext()) {
    DataPage nextPage = pageSource.readPage();
    if (nextPage == null) {
      // Source exhausted: leave the iterator without a next value.
      return;
    }
    pages.setPage(nextPage);
    this.advanceNextPageCount += pages.currentPageCount();
  }
}
Example #13
Source File: TestStatistics.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Extracts the statistics stored with {@code page}, whichever page version it is.
 *
 * <p>The statistics are returned through an unchecked cast to {@code Statistics<T>},
 * with {@code T} chosen by the caller.
 *
 * @param page the data page to take statistics from
 * @param <T> the value type the caller expects the statistics to describe
 * @return the page's statistics, cast to {@code Statistics<T>}
 */
private static <T extends Comparable<T>> Statistics<T> getStatisticsFromPageHeader(DataPage page) {
  DataPage.Visitor<Statistics<T>> byVersion = new DataPage.Visitor<Statistics<T>>() {
    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV1 pageV1) {
      Statistics<T> found = (Statistics<T>) pageV1.getStatistics();
      return found;
    }

    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV2 pageV2) {
      Statistics<T> found = (Statistics<T>) pageV2.getStatistics();
      return found;
    }
  };
  return page.accept(byVersion);
}
Example #14
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Reads every value of {@code page} and feeds it to the converter returned by
 * {@code getConverter} for {@code expectedValues}.
 *
 * @param rowGroupID the row group index, passed through to {@code getConverter}
 * @param pageID the page index, passed through to {@code getConverter}
 * @param dictPage the dictionary page handed to the single-page reader
 * @param page the data page whose values are read
 * @param columnDesc the descriptor of the column the page belongs to
 * @param expectedValues the values expected in this page
 */
public static void validateValuesForPage(int rowGroupID, int pageID, DictionaryPage dictPage,
    DataPage page, ColumnDescriptor columnDesc, List<?> expectedValues) {
  TestStatistics.SingletonPageReader singlePageReader =
      new TestStatistics.SingletonPageReader(dictPage, page);
  PrimitiveConverter checkingConverter =
      getConverter(rowGroupID, pageID, columnDesc.getType(), expectedValues);
  ColumnReaderImpl columnReader =
      new ColumnReaderImpl(columnDesc, singlePageReader, checkingConverter, null);

  int consumed = 0;
  while (consumed < singlePageReader.getTotalValueCount()) {
    columnReader.writeCurrentValueToConverter();
    columnReader.consume();
    consumed += 1;
  }
}
Example #15
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Collects reusable copies of all data pages of one column.
 *
 * <p>Pages are read until {@code readPage()} returns {@code null}; each is passed
 * through {@code reusableCopy} before being stored.
 *
 * @param pageReadStore the store that provides the column's page reader
 * @param columnDescriptor the column whose pages are collected
 * @return the copied pages, in read order
 */
private static List<DataPage> getPageGroupForColumn(PageReadStore pageReadStore,
    ColumnDescriptor columnDescriptor) {
  PageReader columnPages = pageReadStore.getPageReader(columnDescriptor);
  List<DataPage> copies = new ArrayList<>();
  for (DataPage next = columnPages.readPage(); next != null; next = columnPages.readPage()) {
    copies.add(reusableCopy(next));
  }
  return copies;
}
Example #16
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Validates the pages of a column in reverse order, from the last page to the first.
 *
 * <p>Each page is checked against the tail slice of {@code expectedValues} that is
 * still unclaimed: the slice ends where the previously validated page's slice began
 * and is as long as the page's value count.
 *
 * @param rowGroupID the row group index, passed through to the page validator
 * @param dictPage the dictionary page for the column
 * @param pageGroup the column's data pages, in file order
 * @param desc the descriptor of the column
 * @param expectedValues the expected values for the whole page group
 */
private static void validateLastToFirst(int rowGroupID, DictionaryPage dictPage,
    List<DataPage> pageGroup, ColumnDescriptor desc, List<?> expectedValues) {
  int sliceEnd = expectedValues.size();
  for (int pageID = pageGroup.size() - 1; pageID >= 0; pageID--) {
    DataPage page = pageGroup.get(pageID);
    int pageValueCount = page.getValueCount();
    int sliceStart = sliceEnd - pageValueCount;

    List<?> expectedPageValues = expectedValues.subList(sliceStart, sliceStart + pageValueCount);
    PageValuesValidator.validateValuesForPage(
        rowGroupID, pageID, dictPage, page, desc, expectedPageValues);

    sliceEnd -= pageValueCount;
  }
}
Example #17
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Validates the pages of a column in file order, from the first page to the last.
 *
 * <p>Each page is checked against the slice of {@code expectedValues} that starts
 * right after the previous page's slice and is as long as the page's value count.
 *
 * @param rowGroupID the row group index, passed through to the page validator
 * @param dictPage the dictionary page for the column
 * @param pageGroup the column's data pages, in file order
 * @param desc the descriptor of the column
 * @param expectedValues the expected values for the whole page group
 */
private static void validateFirstToLast(int rowGroupID, DictionaryPage dictPage,
    List<DataPage> pageGroup, ColumnDescriptor desc, List<?> expectedValues) {
  int sliceStart = 0;
  int pageID = 0;
  for (DataPage page : pageGroup) {
    int pageValueCount = page.getValueCount();

    List<?> expectedPageValues = expectedValues.subList(sliceStart, sliceStart + pageValueCount);
    PageValuesValidator.validateValuesForPage(
        rowGroupID, pageID, dictPage, page, desc, expectedPageValues);

    sliceStart += pageValueCount;
    pageID += 1;
  }
}
Example #18
Source File: ShowPagesCommand.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Renders one page as text and advances the page counter.
 *
 * <p>Dictionary pages go through {@code printDictionaryPage}; data pages are rendered
 * by visiting them with this object. Any other page yields an empty string.
 *
 * @param page the page to render
 * @return the rendered text, or {@code ""} for an unrecognized page type
 */
String format(Page page) {
  final String rendered;
  if (page instanceof DictionaryPage) {
    rendered = printDictionaryPage((DictionaryPage) page);
  } else if (page instanceof DataPage) {
    rendered = ((DataPage) page).accept(this);
  } else {
    rendered = "";
  }
  pageNum += 1;
  return rendered;
}
Example #19
Source File: ColumnChunkPageReadStore.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a reader over the compressed pages of one column chunk.
 *
 * <p>The pages are copied into a new {@code ArrayDeque}, and {@code valueCount} is set
 * to the sum of the pages' value counts.
 *
 * @param decompressor the decompressor stored for later use
 * @param compressedPages the chunk's data pages
 * @param compressedDictionaryPage the chunk's dictionary page
 * @param offsetIndex the offset index stored for this chunk
 * @param rowCount the row count stored for this chunk
 */
ColumnChunkPageReader(BytesInputDecompressor decompressor, List<DataPage> compressedPages,
    DictionaryPage compressedDictionaryPage, OffsetIndex offsetIndex, long rowCount) {
  this.decompressor = decompressor;
  this.compressedPages = new ArrayDeque<>(compressedPages);
  this.compressedDictionaryPage = compressedDictionaryPage;
  this.offsetIndex = offsetIndex;
  this.rowCount = rowCount;

  long totalValues = 0;
  for (DataPage compressedPage : compressedPages) {
    totalValues += compressedPage.getValueCount();
  }
  this.valueCount = totalValues;
}
Example #20
Source File: MemPageStore.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Returns a reader over the pages written so far for {@code descriptor}.
 *
 * <p>The reader iterates over a copy of the writer's page list taken at call time.
 *
 * @param descriptor the column to read
 * @return a new {@code MemPageReader} for the column
 * @throws UnknownColumnException if no page writer is registered for the column
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  MemPageWriter writer = pageWriters.get(descriptor);
  if (writer == null) {
    throw new UnknownColumnException(descriptor);
  }

  List<DataPage> pageSnapshot = new ArrayList<>(writer.getPages());
  LOG.debug(
      "initialize page reader with {} values and {} pages",
      writer.getTotalValueCount(),
      pageSnapshot.size());

  return new MemPageReader(
      writer.getTotalValueCount(),
      pageSnapshot.iterator(),
      writer.getDictionaryPage());
}
Example #21
Source File: MemPageReader.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Returns the next page from the underlying iterator.
 *
 * @return the next data page
 * @throws ParquetDecodingException if there are no pages left
 */
@Override
public DataPage readPage() {
  if (!pages.hasNext()) {
    throw new ParquetDecodingException("after last page");
  }
  DataPage nextPage = pages.next();
  LOG.debug("read page {}", nextPage);
  return nextPage;
}
Example #22
Source File: ColumnIterator.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Feeds further pages to {@code pageIterator} once {@code triplesRead} has reached
 * {@code advanceNextPageCount}.
 *
 * <p>Pages are read from {@code pageSource} until the page iterator has a next value
 * or the source returns {@code null}. Every page that is set adds its
 * {@code currentPageCount()} to {@code advanceNextPageCount}.
 */
private void advance() {
  if (triplesRead < advanceNextPageCount) {
    return;
  }

  while (!pageIterator.hasNext()) {
    DataPage nextPage = pageSource.readPage();
    if (nextPage == null) {
      // No more pages to offer.
      return;
    }
    pageIterator.setPage(nextPage);
    this.advanceNextPageCount += pageIterator.currentPageCount();
  }
}
Example #23
Source File: ColumnChunkIncReadStore.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Reads the next page starting at the remembered stream position.
 *
 * <p>Seeks {@code in} to {@code lastPosition}, delegates to the superclass, then
 * records the stream position reached so the next call resumes from there.
 *
 * @return the page returned by the superclass
 * @throws RuntimeException wrapping any {@link IOException} raised while reading
 */
@Override
public DataPage readPage() {
  final DataPage nextPage;
  try {
    in.seek(lastPosition);
    nextPage = super.readPage();
    lastPosition = in.getPos();
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
  return nextPage;
}
Example #24
Source File: TestColumnReaderImpl.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Writes {@code rows} nulls to an optional binary column with {@code ColumnWriterV2},
 * flushing a page every 1000 rows, then checks the written pages and reads them back.
 *
 * <p>Expects the pages' row and value counts to total {@code rows}, every value to be
 * at repetition and definition level 0, and the converter to have received nothing.
 */
@Test
public void testOptional() throws Exception {
  MessageType schema =
      MessageTypeParser.parseMessageType("message test { optional binary foo; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  MemPageWriter pageWriter = new MemPageWriter();
  ParquetProperties writerProps = ParquetProperties.builder()
      .withDictionaryPageSize(1024)
      .withWriterVersion(PARQUET_2_0)
      .withPageSize(2048)
      .build();
  ColumnWriterV2 columnWriterV2 = new ColumnWriterV2(col, pageWriter, writerProps);

  for (int i = 0; i < rows; i++) {
    columnWriterV2.writeNull(0, 0);
    boolean pageBoundary = (i + 1) % 1000 == 0;
    if (pageBoundary) {
      columnWriterV2.writePage();
    }
  }
  columnWriterV2.writePage();
  columnWriterV2.finalizeColumnChunk();

  List<DataPage> pages = pageWriter.getPages();
  int totalValues = 0;
  int totalRows = 0;
  for (DataPage dataPage : pages) {
    totalValues += dataPage.getValueCount();
    totalRows += ((DataPageV2) dataPage).getRowCount();
  }
  assertEquals(rows, totalRows);
  assertEquals(rows, totalValues);

  MemPageReader pageReader =
      new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader columnReader = new ColumnReaderImpl(
      col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  for (int i = 0; i < rows; i++) {
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    columnReader.consume();
  }
  assertEquals(0, converter.count);
}
Example #25
Source File: TestColumnReaderImpl.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Writes {@code rows} values to a required binary column with {@code ColumnWriterV2},
 * flushing a page every 1000 rows, then checks the written pages and reads them back.
 *
 * <p>Expects the pages' row and value counts to total {@code rows}, every value to be
 * at repetition and definition level 0, and the converter to have received
 * {@code rows} values.
 */
@Test
public void test() throws Exception {
  MessageType schema =
      MessageTypeParser.parseMessageType("message test { required binary foo; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  MemPageWriter pageWriter = new MemPageWriter();
  ParquetProperties writerProps = ParquetProperties.builder()
      .withDictionaryPageSize(1024)
      .withWriterVersion(PARQUET_2_0)
      .withPageSize(2048)
      .build();
  ColumnWriterV2 columnWriterV2 = new ColumnWriterV2(col, pageWriter, writerProps);

  for (int i = 0; i < rows; i++) {
    // Values cycle through "bar0" .. "bar9".
    columnWriterV2.write(Binary.fromString("bar" + i % 10), 0, 0);
    boolean pageBoundary = (i + 1) % 1000 == 0;
    if (pageBoundary) {
      columnWriterV2.writePage();
    }
  }
  columnWriterV2.writePage();
  columnWriterV2.finalizeColumnChunk();

  List<DataPage> pages = pageWriter.getPages();
  int totalValues = 0;
  int totalRows = 0;
  for (DataPage dataPage : pages) {
    totalValues += dataPage.getValueCount();
    totalRows += ((DataPageV2) dataPage).getRowCount();
  }
  assertEquals(rows, totalRows);
  assertEquals(rows, totalValues);

  MemPageReader pageReader =
      new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader columnReader = new ColumnReaderImpl(
      col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  for (int i = 0; i < rows; i++) {
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    columnReader.writeCurrentValueToConverter();
    columnReader.consume();
  }
  assertEquals(rows, converter.count);
}
Example #26
Source File: SynchronizingColumnReader.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Resets the row-tracking state for a newly loaded page.
 *
 * <p>{@code currentRow} is placed one before the page's first row index,
 * {@code lastRowInPage} at the page's final row, and {@code valuesReadFromPage} at zero.
 *
 * @param page the page that was just initialized
 * @throws IllegalArgumentException if the page has no first row index or no row count
 */
@Override
protected void newPageInitialized(DataPage page) {
  long firstRow = page.getFirstRowIndex().orElseThrow(
      () -> new IllegalArgumentException("Missing firstRowIndex for synchronizing values"));
  int rowsInPage = page.getIndexRowCount().orElseThrow(
      () -> new IllegalArgumentException("Missing rowCount for synchronizing values"));

  valuesReadFromPage = 0;
  currentRow = firstRow - 1;
  lastRowInPage = firstRow + rowsInPage - 1;
}
Example #27
Source File: PageIterator.java From iceberg with Apache License 2.0 | 4 votes |
/**
 * Sets the page via the superclass and then calls {@code advance()}.
 *
 * @param page the page to read from
 */
@Override
public void setPage(DataPage page) {
  super.setPage(page);
  advance();
}
Example #28
Source File: AbstractColumnReader.java From flink with Apache License 2.0 | 4 votes |
/** * Reads `total` values from this columnReader into column. */ @Override public final void readToVector(int readNumber, VECTOR vector) throws IOException { int rowId = 0; WritableIntVector dictionaryIds = null; if (dictionary != null) { dictionaryIds = vector.reserveDictionaryIds(readNumber); } while (readNumber > 0) { // Compute the number of values we want to read in this page. int leftInPage = (int) (endOfPageValueCount - valuesRead); if (leftInPage == 0) { DataPage page = pageReader.readPage(); if (page instanceof DataPageV1) { readPageV1((DataPageV1) page); } else if (page instanceof DataPageV2) { readPageV2((DataPageV2) page); } else { throw new RuntimeException("Unsupported page type: " + page.getClass()); } leftInPage = (int) (endOfPageValueCount - valuesRead); } int num = Math.min(readNumber, leftInPage); if (isCurrentPageDictionaryEncoded) { // Read and decode dictionary ids. runLenDecoder.readDictionaryIds( num, dictionaryIds, vector, rowId, maxDefLevel, this.dictionaryIdsDecoder); if (vector.hasDictionary() || (rowId == 0 && supportLazyDecode())) { // Column vector supports lazy decoding of dictionary values so just set the dictionary. // We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e. some // non-dictionary encoded values have already been added). vector.setDictionary(new ParquetDictionary(dictionary)); } else { readBatchFromDictionaryIds(rowId, num, vector, dictionaryIds); } } else { if (vector.hasDictionary() && rowId != 0) { // This batch already has dictionary encoded values but this new page is not. The batch // does not support a mix of dictionary and not so we will decode the dictionary. readBatchFromDictionaryIds(0, rowId, vector, vector.getDictionaryIds()); } vector.setDictionary(null); readBatch(rowId, num, vector); } valuesRead += num; rowId += num; readNumber -= num; } }
Example #29
Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Returns the single data page held by this reader.
 *
 * @return the stored {@code data} page, the same object on every call
 */
@Override
public DataPage readPage() {
  return data;
}
Example #30
Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Creates a page reader over exactly one data page and its dictionary page.
 *
 * @param dict the dictionary page to store
 * @param data the data page to store
 */
public SingletonPageReader(DictionaryPage dict, DataPage data) {
  this.data = data;
  this.dict = dict;
}