org.apache.parquet.column.page.DataPage Java Examples

Source File: BasePageIterator.java From iceberg with Apache License 2.0

6 votes

/**
 * Points this iterator at a new data page and resets its read position.
 *
 * @param page the page to read from; must not be null
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // Dispatch on the concrete page version. initFromPage is called only for
  // its side effects, so the visitor's return value is always null.
  final DataPage.Visitor<ValuesReader> pageInitializer = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }
  };
  this.page.accept(pageInitializer);

  this.triplesRead = 0;
  this.hasNext = this.triplesRead < this.triplesCount;
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Validates every column of every row group in {@code file} against the
 * matching slice of {@code expectedValues}, walking each column's pages both
 * first-to-last and last-to-first.
 *
 * @param file           the file whose pages are checked
 * @param expectedValues expected values, sliced per row group by row count
 * @throws IOException declared by the signature
 */
public static void validatePages(Path file, List<?> expectedValues) throws IOException {
  final List<PageReadStore> rowGroups = readBlocksFromFile(file);
  final MessageType schema = readSchemaFromFile(file);

  int rowsRead = 0;
  for (int rowGroupID = 0; rowGroupID < rowGroups.size(); rowGroupID++) {
    final PageReadStore rowGroup = rowGroups.get(rowGroupID);

    for (ColumnDescriptor column : schema.getColumns()) {
      final List<DataPage> columnPages = getPageGroupForColumn(rowGroup, column);
      final DictionaryPage dictionary = reusableCopy(getDictionaryPageForColumn(rowGroup, column));

      // Slice of the expected values that belongs to this row group.
      final List<?> expectedForRowGroup =
          expectedValues.subList(rowsRead, (int)(rowsRead + rowGroup.getRowCount()));

      validateFirstToLast(rowGroupID, dictionary, columnPages, column, expectedForRowGroup);
      validateLastToFirst(rowGroupID, dictionary, columnPages, column, expectedForRowGroup);
    }

    rowsRead += rowGroup.getRowCount();
  }
}

Source File: TestMemPageStore.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes four pages of 209 values each to an in-memory page store, then reads
 * pages back until the reader's reported total value count has been consumed.
 */
@Test
public void test() throws IOException {
  MemPageStore store = new MemPageStore(10);
  ColumnDescriptor column = new ColumnDescriptor(path , PrimitiveTypeName.INT64, 2, 2);
  LongStatistics statistics = new LongStatistics();
  PageWriter writer = store.getPageWriter(column);

  // Same four payload sizes, in the same order, as four explicit writes.
  for (int payloadSize : new int[] {735, 743, 743, 735}) {
    writer.writePage(BytesInput.from(new byte[payloadSize]), 209, statistics, BIT_PACKED, BIT_PACKED, PLAIN);
  }

  PageReader reader = store.getPageReader(column);
  long expectedValueCount = reader.getTotalValueCount();
  System.out.println(expectedValueCount);

  int valuesSeen = 0;
  while (true) {
    DataPage current = reader.readPage();
    valuesSeen += current.getValueCount();
    System.out.println(current);
    // TODO: assert
    if (valuesSeen >= expectedValueCount) {
      break;
    }
  }
}

Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Reads the next page from {@code pageReader} and hands it to the V1 or V2
 * page handler according to its concrete type.
 */
private void readPage() {
  LOG.debug("loading page");
  final DataPage nextPage = pageReader.readPage();

  final DataPage.Visitor<Void> versionDispatcher = new DataPage.Visitor<Void>() {
    @Override
    public Void visit(DataPageV2 v2Page) {
      readPageV2(v2Page);
      return null;
    }

    @Override
    public Void visit(DataPageV1 v1Page) {
      readPageV1(v1Page);
      return null;
    }
  };
  nextPage.accept(versionDispatcher);
}

Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Returns the statistics carried by {@code page}, cast to the caller's
 * requested value type.
 *
 * @param page the data page (V1 or V2) to read statistics from
 * @return whatever {@code getStatistics()} returns for the page, cast unchecked
 */
private static <T extends Comparable<T>>
Statistics<T> getStatisticsFromPageHeader(DataPage page) {
  final DataPage.Visitor<Statistics<T>> statsExtractor = new DataPage.Visitor<Statistics<T>>() {
    @Override
    public Statistics<T> visit(DataPageV1 v1Page) {
      // The caller chooses T; the cast cannot be verified at runtime.
      @SuppressWarnings("unchecked")
      Statistics<T> typed = (Statistics<T>) v1Page.getStatistics();
      return typed;
    }

    @Override
    public Statistics<T> visit(DataPageV2 v2Page) {
      // The caller chooses T; the cast cannot be verified at runtime.
      @SuppressWarnings("unchecked")
      Statistics<T> typed = (Statistics<T>) v2Page.getStatistics();
      return typed;
    }
  };
  return page.accept(statsExtractor);
}

Source File: PageIterator.java From iceberg with Apache License 2.0

6 votes

/**
 * Points this iterator at a new data page, resets the read counter and
 * advances to the first position.
 *
 * @param page the page to read from; must not be null
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // Dispatch on the concrete page version. initFromPage is called only for
  // its side effects, so the visitor's return value is always null.
  final DataPage.Visitor<ValuesReader> pageInitializer = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }
  };
  this.page.accept(pageInitializer);

  this.triplesRead = 0;
  advance();
}

Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0

5 votes

public StatsValidator(DataPage page) {
  Statistics<T> stats = getStatisticsFromPageHeader(page);
  this.comparator = stats.comparator();
  this.hasNonNull = stats.hasNonNullValue();
  if (hasNonNull) {
    this.min = stats.genericGetMin();
    this.max = stats.genericGetMax();
  } else {
    this.min = null;
    this.max = null;
  }
}

Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads every value of {@code page} through a column reader, feeding defined
 * values to a validating converter and counting the rest as nulls, then
 * checks the null count against the page statistics.
 *
 * @param page the data page to validate
 * @param dict the dictionary page passed to the page reader alongside {@code page}
 * @param desc descriptor of the column the page belongs to
 * @throws BadStatsException if the counted nulls differ from the statistics' null count
 */
private void validateStatsForPage(DataPage page, DictionaryPage dict,
                                  ColumnDescriptor desc) {
  SingletonPageReader reader = new SingletonPageReader(dict, page);
  PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
  // Wildcard rather than a raw type: only type-agnostic accessors are used
  // below, and a raw type would silently disable generic checking.
  Statistics<?> stats = getStatisticsFromPageHeader(page);

  long numNulls = 0;

  ColumnReader column = COL_READER_CTOR.newInstance(desc, reader, converter, null);
  for (int i = 0; i < reader.getTotalValueCount(); i += 1) {
    // A definition level below the column's maximum is counted as a null.
    if (column.getCurrentDefinitionLevel() >= desc.getMaxDefinitionLevel()) {
      column.writeCurrentValueToConverter();
    } else {
      numNulls += 1;
    }
    column.consume();
  }

  if (numNulls != stats.getNumNulls()) {
    throw new BadStatsException("Number of nulls doesn't match.");
  }

  console.debug(String.format(
      "Validated stats min=%s max=%s nulls=%d for page=%s col=%s",
      stats.minAsString(),
      stats.maxAsString(), stats.getNumNulls(), page,
      Arrays.toString(desc.getPath())));
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Checks that the page statistics use the comparator of the column's
 * primitive type and, unless the statistics are empty, that their null count
 * matches the number of values read below the maximum definition level.
 *
 * @param page the data page to validate
 * @param dict the dictionary page passed to the page reader alongside {@code page}
 * @param desc descriptor of the column the page belongs to
 */
private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDescriptor desc) {
  final SingletonPageReader singlePageReader = new SingletonPageReader(dict, page);
  final PrimitiveConverter validatingConverter = getValidatingConverter(page, desc.getType());
  final Statistics<?> pageStats = getStatisticsFromPageHeader(page);

  assertEquals("Statistics does not use the proper comparator",
      desc.getPrimitiveType().comparator().getClass(),
      pageStats.comparator().getClass());

  // Stats are empty if num nulls = 0 and there are no non-null values; this
  // happens if stats are not written (e.g., when stats are too big).
  if (pageStats.isEmpty()) {
    return;
  }

  long nullsSeen = 0;
  final ColumnReaderImpl columnReader = new ColumnReaderImpl(desc, singlePageReader, validatingConverter, null);
  int position = 0;
  while (position < singlePageReader.getTotalValueCount()) {
    if (columnReader.getCurrentDefinitionLevel() < desc.getMaxDefinitionLevel()) {
      nullsSeen += 1;
    } else {
      columnReader.writeCurrentValueToConverter();
    }
    columnReader.consume();
    position += 1;
  }

  Assert.assertEquals(nullsSeen, pageStats.getNumNulls());
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Validates the statistics of every data page of every column in
 * {@code schema}, reading pages from {@code store} until a reader returns null.
 *
 * @param schema schema whose columns are validated
 * @param store  source of the page reader for each column
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    final PageReader columnPages = store.getPageReader(column);
    final DictionaryPage dictionary = columnPages.readDictionaryPage();
    for (DataPage current = columnPages.readPage(); current != null; current = columnPages.readPage()) {
      validateStatsForPage(current, dictionary, column);
    }
  }
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

5 votes

public StatsValidator(DataPage page) {
  Statistics<T> stats = getStatisticsFromPageHeader(page);
  this.comparator = stats.comparator();
  this.hasNonNull = stats.hasNonNullValue();
  if (hasNonNull) {
    this.min = stats.genericGetMin();
    this.max = stats.genericGetMax();
  } else {
    this.min = null;
    this.max = null;
  }
}

Source File: BaseColumnIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Once the triples read so far reach {@code advanceNextPageCount}, loads
 * pages from {@code pageSource} into the page iterator until it has a next
 * value or the source returns null.
 */
protected void advance() {
  if (triplesRead < advanceNextPageCount) {
    return;
  }

  final BasePageIterator pages = pageIterator();
  while (!pages.hasNext()) {
    final DataPage nextPage = pageSource.readPage();
    if (nextPage == null) {
      // Page source returned no further page.
      return;
    }
    pages.setPage(nextPage);
    this.advanceNextPageCount += pages.currentPageCount();
  }
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Returns the statistics carried by {@code page}, cast to the caller's
 * requested value type.
 *
 * @param page the data page (V1 or V2) to read statistics from
 * @return whatever {@code getStatistics()} returns for the page, cast unchecked
 */
private static <T extends Comparable<T>> Statistics<T> getStatisticsFromPageHeader(DataPage page) {
  final DataPage.Visitor<Statistics<T>> statsExtractor = new DataPage.Visitor<Statistics<T>>() {
    @Override
    public Statistics<T> visit(DataPageV1 v1Page) {
      // The caller chooses T; the cast cannot be verified at runtime.
      @SuppressWarnings("unchecked")
      Statistics<T> typed = (Statistics<T>) v1Page.getStatistics();
      return typed;
    }

    @Override
    public Statistics<T> visit(DataPageV2 v2Page) {
      // The caller chooses T; the cast cannot be verified at runtime.
      @SuppressWarnings("unchecked")
      Statistics<T> typed = (Statistics<T>) v2Page.getStatistics();
      return typed;
    }
  };
  return page.accept(statsExtractor);
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads every value of a single page through a column reader and pushes each
 * one to a converter built from the expected values.
 *
 * @param rowGroupID     row-group identifier forwarded to the converter factory
 * @param pageID         page identifier forwarded to the converter factory
 * @param dictPage       dictionary page passed to the page reader alongside {@code page}
 * @param page           the data page to read
 * @param columnDesc     descriptor of the column the page belongs to
 * @param expectedValues values the converter is built from
 */
public static void validateValuesForPage(int rowGroupID, int pageID, DictionaryPage dictPage, DataPage page, ColumnDescriptor columnDesc, List<?> expectedValues) {
  final TestStatistics.SingletonPageReader singlePageReader =
      new TestStatistics.SingletonPageReader(dictPage, page);
  final PrimitiveConverter checkingConverter =
      getConverter(rowGroupID, pageID, columnDesc.getType(), expectedValues);
  final ColumnReaderImpl columnReader =
      new ColumnReaderImpl(columnDesc, singlePageReader, checkingConverter, null);

  int position = 0;
  while (position < singlePageReader.getTotalValueCount()) {
    columnReader.writeCurrentValueToConverter();
    columnReader.consume();
    position += 1;
  }
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Collects reusable copies of all data pages of one column, in read order.
 *
 * @param pageReadStore    store providing the column's page reader
 * @param columnDescriptor the column whose pages are collected
 * @return a new list of page copies; empty if the first read returns null
 */
private static List<DataPage> getPageGroupForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  final PageReader columnPages = pageReadStore.getPageReader(columnDescriptor);
  final List<DataPage> copies = new ArrayList<>();

  for (DataPage current = columnPages.readPage(); current != null; current = columnPages.readPage()) {
    copies.add(reusableCopy(current));
  }

  return copies;
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Validates the pages of a column from the last page back to the first,
 * pairing each page with the matching tail slice of {@code expectedValues}.
 *
 * @param rowGroupID     row-group identifier forwarded to the page validator
 * @param dictPage       dictionary page forwarded to the page validator
 * @param pageGroup      the column's data pages in read order
 * @param desc           descriptor of the column
 * @param expectedValues expected values for the whole page group
 */
private static void validateLastToFirst(int rowGroupID, DictionaryPage dictPage, List<DataPage> pageGroup, ColumnDescriptor desc, List<?> expectedValues) {
  // Exclusive end index of the slice that belongs to the current page.
  int sliceEnd = expectedValues.size();
  int pageID = pageGroup.size() - 1;
  while (pageID >= 0) {
    final DataPage current = pageGroup.get(pageID);
    final int valuesInPage = current.getValueCount();
    final int sliceStart = sliceEnd - valuesInPage;
    final List<?> expectedForPage = expectedValues.subList(sliceStart, sliceStart + valuesInPage);
    PageValuesValidator.validateValuesForPage(rowGroupID, pageID, dictPage, current, desc, expectedForPage);
    sliceEnd -= valuesInPage;
    pageID--;
  }
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Validates the pages of a column in read order, pairing each page with the
 * next consecutive slice of {@code expectedValues}.
 *
 * @param rowGroupID     row-group identifier forwarded to the page validator
 * @param dictPage       dictionary page forwarded to the page validator
 * @param pageGroup      the column's data pages in read order
 * @param desc           descriptor of the column
 * @param expectedValues expected values for the whole page group
 */
private static void validateFirstToLast(int rowGroupID, DictionaryPage dictPage, List<DataPage> pageGroup, ColumnDescriptor desc, List<?> expectedValues) {
  int sliceStart = 0;
  int pageID = 0;
  for (DataPage current : pageGroup) {
    final int valuesInPage = current.getValueCount();
    final List<?> expectedForPage = expectedValues.subList(sliceStart, sliceStart + valuesInPage);
    PageValuesValidator.validateValuesForPage(rowGroupID, pageID, dictPage, current, desc, expectedForPage);
    sliceStart += valuesInPage;
    pageID++;
  }
}

Source File: ShowPagesCommand.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Formats a page for display and increments the running page number.
 *
 * @param page the page to format
 * @return the formatted text, or an empty string if the page is neither a
 *         dictionary page nor a data page
 */
String format(Page page) {
  final String rendered;
  if (page instanceof DictionaryPage) {
    rendered = printDictionaryPage((DictionaryPage) page);
  } else if (page instanceof DataPage) {
    rendered = ((DataPage) page).accept(this);
  } else {
    rendered = "";
  }
  pageNum += 1;
  return rendered;
}

Source File: ColumnChunkPageReadStore.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a page reader over the given compressed pages.
 *
 * @param decompressor             stored for later use by this reader
 * @param compressedPages          data pages, copied into an internal queue
 * @param compressedDictionaryPage dictionary page, stored as given
 * @param offsetIndex              offset index, stored as given
 * @param rowCount                 row count, stored as given
 */
ColumnChunkPageReader(BytesInputDecompressor decompressor, List<DataPage> compressedPages,
    DictionaryPage compressedDictionaryPage, OffsetIndex offsetIndex, long rowCount) {
  this.decompressor = decompressor;
  this.compressedPages = new ArrayDeque<>(compressedPages);
  this.compressedDictionaryPage = compressedDictionaryPage;

  // The total value count is the sum of the value counts of all pages.
  long totalValues = 0;
  for (DataPage compressedPage : compressedPages) {
    totalValues += compressedPage.getValueCount();
  }
  this.valueCount = totalValues;

  this.offsetIndex = offsetIndex;
  this.rowCount = rowCount;
}

Source File: MemPageStore.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds a reader over a copy of the pages written so far for {@code descriptor}.
 *
 * @param descriptor the column to read
 * @return a reader over the column's pages and dictionary page
 * @throws UnknownColumnException if no page writer is registered for the column
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  final MemPageWriter writer = pageWriters.get(descriptor);
  if (writer == null) {
    throw new UnknownColumnException(descriptor);
  }

  // Copy the writer's page list before iterating over it.
  final List<DataPage> pageCopy = new ArrayList<>(writer.getPages());
  LOG.debug("initialize page reader with {} values and {} pages", writer.getTotalValueCount(), pageCopy.size());
  return new MemPageReader(writer.getTotalValueCount(), pageCopy.iterator(), writer.getDictionaryPage());
}

Source File: MemPageReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Returns the next page from the underlying iterator.
 *
 * @return the next data page
 * @throws ParquetDecodingException if no pages remain
 */
@Override
public DataPage readPage() {
  if (!pages.hasNext()) {
    throw new ParquetDecodingException("after last page");
  }
  final DataPage nextPage = pages.next();
  LOG.debug("read page {}", nextPage);
  return nextPage;
}

Source File: ColumnIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Once the triples read so far reach {@code advanceNextPageCount}, loads
 * pages from {@code pageSource} into the page iterator until it has a next
 * value or the source returns null.
 */
private void advance() {
  if (triplesRead < advanceNextPageCount) {
    return;
  }

  while (!pageIterator.hasNext()) {
    final DataPage nextPage = pageSource.readPage();
    if (nextPage == null) {
      // Page source returned no further page.
      return;
    }
    pageIterator.setPage(nextPage);
    this.advanceNextPageCount += pageIterator.currentPageCount();
  }
}

Source File: ColumnChunkIncReadStore.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Seeks the input to the position recorded after the previous read, delegates
 * to the superclass to read a page, and records the new position.
 *
 * @return the page read by the superclass
 * @throws RuntimeException wrapping any {@link IOException} raised in the body
 */
@Override
public DataPage readPage() {
  final DataPage result;
  try {
    in.seek(lastPosition);
    result = super.readPage();
    lastPosition = in.getPos();
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
  return result;
}

Source File: TestColumnReaderImpl.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes {@code rows} nulls to an optional binary column with a V2 writer,
 * flushing a page every 1000 rows. Verifies the page value and row counts,
 * then reads the column back and verifies that every level is 0 and that the
 * converter's count stays at 0.
 */
@Test
public void testOptional() throws Exception {
  final MessageType schema = MessageTypeParser.parseMessageType("message test { optional binary foo; }");
  final ColumnDescriptor col = schema.getColumns().get(0);
  final MemPageWriter pageWriter = new MemPageWriter();
  final ParquetProperties writerProps = ParquetProperties.builder()
      .withDictionaryPageSize(1024).withWriterVersion(PARQUET_2_0)
      .withPageSize(2048).build();
  final ColumnWriterV2 writer = new ColumnWriterV2(col, pageWriter, writerProps);

  for (int i = 0; i < rows; i++) {
    writer.writeNull(0, 0);
    final boolean pageBoundary = (i + 1) % 1000 == 0;
    if (pageBoundary) {
      writer.writePage();
    }
  }
  writer.writePage();
  writer.finalizeColumnChunk();

  final List<DataPage> writtenPages = pageWriter.getPages();
  int totalValues = 0;
  int totalRows = 0;
  for (DataPage written : writtenPages) {
    totalValues += written.getValueCount();
    totalRows += ((DataPageV2) written).getRowCount();
  }
  assertEquals(rows, totalRows);
  assertEquals(rows, totalValues);

  final MemPageReader pageReader =
      new MemPageReader(rows, writtenPages.iterator(), pageWriter.getDictionaryPage());
  final ValidatingConverter converter = new ValidatingConverter();
  final ColumnReader reader =
      new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  int row = 0;
  while (row < rows) {
    assertEquals(0, reader.getCurrentRepetitionLevel());
    assertEquals(0, reader.getCurrentDefinitionLevel());
    reader.consume();
    row++;
  }
  assertEquals(0, converter.count);
}

Source File: TestColumnReaderImpl.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes {@code rows} values ("bar0" through "bar9", cycling) to a required
 * binary column with a V2 writer, flushing a page every 1000 rows. Verifies
 * the page value and row counts, then reads the column back and verifies that
 * every level is 0 and that the converter's count equals {@code rows}.
 */
@Test
public void test() throws Exception {
  final MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  final ColumnDescriptor col = schema.getColumns().get(0);
  final MemPageWriter pageWriter = new MemPageWriter();
  final ParquetProperties writerProps = ParquetProperties.builder()
      .withDictionaryPageSize(1024).withWriterVersion(PARQUET_2_0)
      .withPageSize(2048).build();
  final ColumnWriterV2 writer = new ColumnWriterV2(col, pageWriter, writerProps);

  for (int i = 0; i < rows; i++) {
    writer.write(Binary.fromString("bar" + i % 10), 0, 0);
    final boolean pageBoundary = (i + 1) % 1000 == 0;
    if (pageBoundary) {
      writer.writePage();
    }
  }
  writer.writePage();
  writer.finalizeColumnChunk();

  final List<DataPage> writtenPages = pageWriter.getPages();
  int totalValues = 0;
  int totalRows = 0;
  for (DataPage written : writtenPages) {
    totalValues += written.getValueCount();
    totalRows += ((DataPageV2) written).getRowCount();
  }
  assertEquals(rows, totalRows);
  assertEquals(rows, totalValues);

  final MemPageReader pageReader =
      new MemPageReader(rows, writtenPages.iterator(), pageWriter.getDictionaryPage());
  final ValidatingConverter converter = new ValidatingConverter();
  final ColumnReader reader =
      new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  int row = 0;
  while (row < rows) {
    assertEquals(0, reader.getCurrentRepetitionLevel());
    assertEquals(0, reader.getCurrentDefinitionLevel());
    reader.writeCurrentValueToConverter();
    reader.consume();
    row++;
  }
  assertEquals(rows, converter.count);
}

Source File: SynchronizingColumnReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Resets the row-tracking state for a newly loaded page using the page's
 * first row index and row count.
 *
 * @param page the page that was just initialized
 * @throws IllegalArgumentException if the page has no first row index or no row count
 */
@Override
protected void newPageInitialized(DataPage page) {
  final long pageFirstRow = page.getFirstRowIndex()
      .orElseThrow(() -> new IllegalArgumentException("Missing firstRowIndex for synchronizing values"));
  final int rowsInPage = page.getIndexRowCount()
      .orElseThrow(() -> new IllegalArgumentException("Missing rowCount for synchronizing values"));

  valuesReadFromPage = 0;
  // The current row is set to one before the page's first row.
  currentRow = pageFirstRow - 1;
  lastRowInPage = pageFirstRow + rowsInPage - 1;
}

Source File: PageIterator.java From iceberg with Apache License 2.0

4 votes

/**
 * Delegates page setup to the superclass, then advances this iterator.
 *
 * @param page the page to read from
 */
@Override
public void setPage(DataPage page) {
  super.setPage(page);
  this.advance();
}

Source File: AbstractColumnReader.java From flink with Apache License 2.0

4 votes

/**
 * Reads {@code readNumber} values from this column reader into {@code vector},
 * loading a new page from {@code pageReader} whenever the current page has no
 * values left.
 *
 * @param readNumber number of values to read
 * @param vector     destination vector; values are written starting at row 0
 * @throws IOException declared by the signature
 */
@Override
public final void readToVector(int readNumber, VECTOR vector) throws IOException {
	int rowId = 0;
	WritableIntVector dictionaryIds = null;
	if (dictionary != null) {
		// Dictionary ids are only reserved when this reader has a dictionary.
		dictionaryIds = vector.reserveDictionaryIds(readNumber);
	}
	while (readNumber > 0) {
		// Compute the number of values we want to read in this page.
		int leftInPage = (int) (endOfPageValueCount - valuesRead);
		if (leftInPage == 0) {
			// Current page is exhausted: read the next page and dispatch on its version.
			DataPage page = pageReader.readPage();
			if (page instanceof DataPageV1) {
				readPageV1((DataPageV1) page);
			} else if (page instanceof DataPageV2) {
				readPageV2((DataPageV2) page);
			} else {
				// NOTE(review): a null page would fail on page.getClass() here — confirm
				// whether readPage() can return null at this point.
				throw new RuntimeException("Unsupported page type: " + page.getClass());
			}
			// Recompute after loading the page (presumably readPageV1/V2 update
			// endOfPageValueCount — confirm).
			leftInPage = (int) (endOfPageValueCount - valuesRead);
		}
		// Read at most what is still requested and what the current page still holds.
		int num = Math.min(readNumber, leftInPage);
		if (isCurrentPageDictionaryEncoded) {
			// Read and decode dictionary ids.
			runLenDecoder.readDictionaryIds(
					num, dictionaryIds, vector, rowId, maxDefLevel, this.dictionaryIdsDecoder);

			if (vector.hasDictionary() || (rowId == 0 && supportLazyDecode())) {
				// Column vector supports lazy decoding of dictionary values so just set the dictionary.
				// We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e. some
				// non-dictionary encoded values have already been added).
				vector.setDictionary(new ParquetDictionary(dictionary));
			} else {
				readBatchFromDictionaryIds(rowId, num, vector, dictionaryIds);
			}
		} else {
			if (vector.hasDictionary() && rowId != 0) {
				// This batch already has dictionary encoded values but this new page is not. The batch
				// does not support a mix of dictionary and not so we will decode the dictionary.
				readBatchFromDictionaryIds(0, rowId, vector, vector.getDictionaryIds());
			}
			vector.setDictionary(null);
			readBatch(rowId, num, vector);
		}

		// Advance the counters by the number of values handled in this iteration.
		valuesRead += num;
		rowId += num;
		readNumber -= num;
	}
}

Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Returns the single data page held by this reader.
 *
 * @return the data page this reader holds; the same reference on every call
 */
@Override
public DataPage readPage() {
  return this.data;
}

Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Creates a reader holding exactly one data page and its dictionary page.
 *
 * @param dict the dictionary page to store
 * @param data the data page to store
 */
public SingletonPageReader(DictionaryPage dict, DataPage data) {
  this.data = data;
  this.dict = dict;
}

org.apache.parquet.column.page.DataPage Java Examples