org.apache.parquet.column.page.DataPage Java Examples

The following examples show how to use org.apache.parquet.column.page.DataPage. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BasePageIterator.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Points this iterator at a new data page and resets the read position.
 *
 * @param page the page to read from; a null page is rejected by the precondition check
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // Dispatch on the concrete page version; the visitor's return value is not used.
  DataPage.Visitor<ValuesReader> initializer = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }
  };
  this.page.accept(initializer);

  this.triplesRead = 0;
  this.hasNext = triplesRead < triplesCount;
}
 
Example #2
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Checks the pages of every column in every row group of {@code file} against the
 * matching slice of {@code expectedValues}, in both page orders.
 *
 * @param file the file whose row groups and schema are read
 * @param expectedValues expected values for the whole file, sliced per row group by row count
 * @throws IOException declared for the file-reading helpers
 */
public static void validatePages(Path file, List<?> expectedValues) throws IOException {
  List<PageReadStore> rowGroups = readBlocksFromFile(file);
  MessageType schema = readSchemaFromFile(file);
  int rowGroupIndex = 0;
  int rowOffset = 0;
  for (PageReadStore rowGroup : rowGroups) {
    for (ColumnDescriptor column : schema.getColumns()) {
      List<DataPage> pages = getPageGroupForColumn(rowGroup, column);
      DictionaryPage dictionary = reusableCopy(getDictionaryPageForColumn(rowGroup, column));

      // Slice out the expected values that belong to this row group.
      int rowGroupEnd = (int) (rowOffset + rowGroup.getRowCount());
      List<?> rowGroupExpected = expectedValues.subList(rowOffset, rowGroupEnd);
      validateFirstToLast(rowGroupIndex, dictionary, pages, column, rowGroupExpected);
      validateLastToFirst(rowGroupIndex, dictionary, pages, column, rowGroupExpected);
    }

    rowOffset += rowGroup.getRowCount();
    rowGroupIndex++;
  }
}
 
Example #3
Source File: TestMemPageStore.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes four pages through a MemPageStore and reads them back, printing the total
 * value count and each page. No assertions are made yet (see the TODO below).
 */
@Test
public void test() throws IOException {
  MemPageStore store = new MemPageStore(10);
  ColumnDescriptor column = new ColumnDescriptor(path, PrimitiveTypeName.INT64, 2, 2);
  LongStatistics statistics = new LongStatistics();
  PageWriter writer = store.getPageWriter(column);
  // Four pages of 209 values each; only the payload size differs between them.
  for (int payloadSize : new int[] {735, 743, 743, 735}) {
    writer.writePage(BytesInput.from(new byte[payloadSize]), 209, statistics, BIT_PACKED, BIT_PACKED, PLAIN);
  }

  PageReader reader = store.getPageReader(column);
  long expectedValueCount = reader.getTotalValueCount();
  System.out.println(expectedValueCount);
  int valuesSeen = 0;
  do {
    DataPage current = reader.readPage();
    valuesSeen += current.getValueCount();
    System.out.println(current);
    // TODO: assert
  } while (valuesSeen < expectedValueCount);
}
 
Example #4
Source File: ColumnReaderBase.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the next page from the page reader and hands it to the reader method
 * matching its page version.
 */
private void readPage() {
  LOG.debug("loading page");
  DataPage nextPage = pageReader.readPage();

  // The visitor exists only to dispatch on page version; it yields no value.
  DataPage.Visitor<Void> versionDispatcher = new DataPage.Visitor<Void>() {
    @Override
    public Void visit(DataPageV1 v1Page) {
      readPageV1(v1Page);
      return null;
    }

    @Override
    public Void visit(DataPageV2 v2Page) {
      readPageV2(v2Page);
      return null;
    }
  };
  nextPage.accept(versionDispatcher);
}
 
Example #5
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the statistics carried by a data page, whichever page version it is.
 *
 * @param page the page to read statistics from
 * @return the page's statistics, cast (unchecked) to the caller's value type
 */
private static <T extends Comparable<T>>
Statistics<T> getStatisticsFromPageHeader(DataPage page) {
  DataPage.Visitor<Statistics<T>> statsExtractor = new DataPage.Visitor<Statistics<T>>() {
    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV1 v1Page) {
      return (Statistics<T>) v1Page.getStatistics();
    }

    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV2 v2Page) {
      return (Statistics<T>) v2Page.getStatistics();
    }
  };
  return page.accept(statsExtractor);
}
 
Example #6
Source File: PageIterator.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Points this iterator at a new data page, resets the triple counter and advances.
 *
 * @param page the page to read from; a null page is rejected by the precondition check
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // Dispatch on the concrete page version; the visitor's return value is not used.
  DataPage.Visitor<ValuesReader> initializer = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }
  };
  this.page.accept(initializer);

  this.triplesRead = 0;
  advance();
}
 
Example #7
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Captures the comparator and min/max bounds from a page's statistics.
 *
 * @param page the page whose statistics are read
 */
public StatsValidator(DataPage page) {
  Statistics<T> pageStats = getStatisticsFromPageHeader(page);
  this.comparator = pageStats.comparator();
  this.hasNonNull = pageStats.hasNonNullValue();
  // Bounds are taken from the statistics only when a non-null value is reported;
  // otherwise both stay null.
  this.min = hasNonNull ? pageStats.genericGetMin() : null;
  this.max = hasNonNull ? pageStats.genericGetMax() : null;
}
 
Example #8
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Replays every value of a single page through a validating converter and checks
 * that the null count recorded in the page statistics matches the nulls observed.
 *
 * @param page the data page to validate
 * @param dict the dictionary page passed to the single-page reader
 * @param desc descriptor of the column the page belongs to
 * @throws BadStatsException if the observed null count differs from the recorded one
 */
private void validateStatsForPage(DataPage page, DictionaryPage dict,
                                  ColumnDescriptor desc) {
  SingletonPageReader reader = new SingletonPageReader(dict, page);
  PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
  // Wildcard instead of a raw type: only type-independent accessors are used below.
  Statistics<?> stats = getStatisticsFromPageHeader(page);

  long numNulls = 0;

  ColumnReader column = COL_READER_CTOR.newInstance(desc, reader, converter, null);
  for (int i = 0; i < reader.getTotalValueCount(); i += 1) {
    // A definition level below the column maximum is counted as a null.
    if (column.getCurrentDefinitionLevel() >= desc.getMaxDefinitionLevel()) {
      column.writeCurrentValueToConverter();
    } else {
      numNulls += 1;
    }
    column.consume();
  }

  if (numNulls != stats.getNumNulls()) {
    throw new BadStatsException("Number of nulls doesn't match.");
  }

  console.debug(String.format(
      "Validated stats min=%s max=%s nulls=%d for page=%s col=%s",
      stats.minAsString(),
      stats.maxAsString(), stats.getNumNulls(), page,
      Arrays.toString(desc.getPath())));
}
 
Example #9
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that a page's statistics use the column's comparator and, unless the
 * statistics are empty, that the recorded null count matches the nulls observed
 * while replaying the page.
 *
 * @param page the data page to validate
 * @param dict the dictionary page passed to the single-page reader
 * @param desc descriptor of the column the page belongs to
 */
private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDescriptor desc) {
  SingletonPageReader singlePageReader = new SingletonPageReader(dict, page);
  PrimitiveConverter validatingConverter = getValidatingConverter(page, desc.getType());
  Statistics<?> pageStats = getStatisticsFromPageHeader(page);

  assertEquals("Statistics does not use the proper comparator",
      desc.getPrimitiveType().comparator().getClass(),
      pageStats.comparator().getClass());

  // stats are empty if num nulls = 0 and there are no non-null values
  // this happens if stats are not written (e.g., when stats are too big)
  if (pageStats.isEmpty()) {
    return;
  }

  long observedNulls = 0;
  ColumnReaderImpl columnReader = new ColumnReaderImpl(desc, singlePageReader, validatingConverter, null);
  for (int valueIndex = 0; valueIndex < singlePageReader.getTotalValueCount(); valueIndex++) {
    boolean isDefined = columnReader.getCurrentDefinitionLevel() >= desc.getMaxDefinitionLevel();
    if (isDefined) {
      columnReader.writeCurrentValueToConverter();
    } else {
      observedNulls++;
    }
    columnReader.consume();
  }

  Assert.assertEquals(observedNulls, pageStats.getNumNulls());
}
 
Example #10
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Validates the statistics of every data page of every column in the schema.
 *
 * @param schema schema whose columns are visited
 * @param store source of the page reader for each column
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    PageReader pageReader = store.getPageReader(column);
    DictionaryPage dictionary = pageReader.readDictionaryPage();
    // readPage() returning null ends the column's pages.
    for (DataPage page = pageReader.readPage(); page != null; page = pageReader.readPage()) {
      validateStatsForPage(page, dictionary, column);
    }
  }
}
 
Example #11
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Captures the comparator and min/max bounds from a page's statistics.
 *
 * @param page the page whose statistics are read
 */
public StatsValidator(DataPage page) {
  Statistics<T> pageStats = getStatisticsFromPageHeader(page);
  this.comparator = pageStats.comparator();
  this.hasNonNull = pageStats.hasNonNullValue();
  // Bounds are taken from the statistics only when a non-null value is reported;
  // otherwise both stay null.
  this.min = hasNonNull ? pageStats.genericGetMin() : null;
  this.max = hasNonNull ? pageStats.genericGetMax() : null;
}
 
Example #12
Source File: BaseColumnIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Once the triples read reach the current page boundary, loads pages into the
 * page iterator until it has a next value or the page source returns null.
 */
protected void advance() {
  if (triplesRead < advanceNextPageCount) {
    return;
  }

  BasePageIterator pages = pageIterator();
  while (!pages.hasNext()) {
    DataPage nextPage = pageSource.readPage();
    if (nextPage == null) {
      // No more pages to load.
      return;
    }
    pages.setPage(nextPage);
    this.advanceNextPageCount += pages.currentPageCount();
  }
}
 
Example #13
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the statistics carried by a data page, whichever page version it is.
 *
 * @param page the page to read statistics from
 * @return the page's statistics, cast (unchecked) to the caller's value type
 */
private static <T extends Comparable<T>> Statistics<T> getStatisticsFromPageHeader(DataPage page) {
  DataPage.Visitor<Statistics<T>> statsExtractor = new DataPage.Visitor<Statistics<T>>() {
    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV1 v1Page) {
      return (Statistics<T>) v1Page.getStatistics();
    }

    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV2 v2Page) {
      return (Statistics<T>) v2Page.getStatistics();
    }
  };
  return page.accept(statsExtractor);
}
 
Example #14
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Replays every value of one page through a converter built from the expected values.
 *
 * @param rowGroupID row group index, passed to the converter factory
 * @param pageID page index, passed to the converter factory
 * @param dictPage dictionary page passed to the single-page reader
 * @param page the data page to replay
 * @param columnDesc descriptor of the column the page belongs to
 * @param expectedValues the values expected for this page
 */
public static void validateValuesForPage(int rowGroupID, int pageID, DictionaryPage dictPage, DataPage page, ColumnDescriptor columnDesc, List<?> expectedValues) {
  TestStatistics.SingletonPageReader singlePageReader = new TestStatistics.SingletonPageReader(dictPage, page);
  PrimitiveConverter checkingConverter = getConverter(rowGroupID, pageID, columnDesc.getType(), expectedValues);
  ColumnReaderImpl columnReader = new ColumnReaderImpl(columnDesc, singlePageReader, checkingConverter, null);
  for (int valueIndex = 0; valueIndex < singlePageReader.getTotalValueCount(); valueIndex++) {
    columnReader.writeCurrentValueToConverter();
    columnReader.consume();
  }
}
 
Example #15
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Collects reusable copies of all data pages of one column.
 *
 * @param pageReadStore source of the column's page reader
 * @param columnDescriptor the column whose pages are collected
 * @return the copied pages in read order
 */
private static List<DataPage> getPageGroupForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  PageReader columnPages = pageReadStore.getPageReader(columnDescriptor);
  List<DataPage> copies = new ArrayList<>();

  // readPage() returning null ends the column's pages.
  for (DataPage current = columnPages.readPage(); current != null; current = columnPages.readPage()) {
    copies.add(reusableCopy(current));
  }

  return copies;
}
 
Example #16
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Validates the pages of a column from the last page back to the first, matching
 * each page with its slice at the tail of the remaining expected values.
 *
 * @param rowGroupID row group index, forwarded to the page validator
 * @param dictPage dictionary page forwarded to the page validator
 * @param pageGroup the column's pages in read order
 * @param desc descriptor of the column
 * @param expectedValues expected values for the whole page group
 */
private static void validateLastToFirst(int rowGroupID, DictionaryPage dictPage, List<DataPage> pageGroup, ColumnDescriptor desc, List<?> expectedValues) {
  int remaining = expectedValues.size();
  for (int pageID = pageGroup.size() - 1; pageID >= 0; pageID--) {
    DataPage page = pageGroup.get(pageID);
    int valueCount = page.getValueCount();
    // This page's values are the last valueCount entries not yet consumed.
    int sliceStart = remaining - valueCount;
    List<?> pageExpected = expectedValues.subList(sliceStart, sliceStart + valueCount);
    PageValuesValidator.validateValuesForPage(rowGroupID, pageID, dictPage, page, desc, pageExpected);
    remaining -= valueCount;
  }
}
 
Example #17
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Validates the pages of a column in read order, matching each page with the
 * next slice of the expected values.
 *
 * @param rowGroupID row group index, forwarded to the page validator
 * @param dictPage dictionary page forwarded to the page validator
 * @param pageGroup the column's pages in read order
 * @param desc descriptor of the column
 * @param expectedValues expected values for the whole page group
 */
private static void validateFirstToLast(int rowGroupID, DictionaryPage dictPage, List<DataPage> pageGroup, ColumnDescriptor desc, List<?> expectedValues) {
  int consumed = 0;
  int pageID = 0;
  for (DataPage page : pageGroup) {
    int valueCount = page.getValueCount();
    List<?> pageExpected = expectedValues.subList(consumed, consumed + valueCount);
    PageValuesValidator.validateValuesForPage(rowGroupID, pageID, dictPage, page, desc, pageExpected);
    consumed += valueCount;
    pageID++;
  }
}
 
Example #18
Source File: ShowPagesCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Formats a page for display and increments the page counter.
 *
 * @param page the page to format
 * @return the formatted text, or an empty string when the page is neither a
 *         dictionary page nor a data page
 */
String format(Page page) {
  final String text;
  if (page instanceof DictionaryPage) {
    text = printDictionaryPage((DictionaryPage) page);
  } else if (page instanceof DataPage) {
    // This object is passed as the data page's visitor.
    text = ((DataPage) page).accept(this);
  } else {
    text = "";
  }
  pageNum += 1;
  return text;
}
 
Example #19
Source File: ColumnChunkPageReadStore.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a reader over a column chunk's compressed pages.
 *
 * @param decompressor stored for later use by this reader
 * @param compressedPages the data pages; copied into a queue owned by this reader
 * @param compressedDictionaryPage the chunk's dictionary page
 * @param offsetIndex the chunk's offset index
 * @param rowCount the row count to record for this chunk
 */
ColumnChunkPageReader(BytesInputDecompressor decompressor, List<DataPage> compressedPages,
    DictionaryPage compressedDictionaryPage, OffsetIndex offsetIndex, long rowCount) {
  this.decompressor = decompressor;
  this.compressedPages = new ArrayDeque<>(compressedPages);
  this.compressedDictionaryPage = compressedDictionaryPage;
  this.offsetIndex = offsetIndex;
  this.rowCount = rowCount;

  // The total value count is the sum over all pages.
  long totalValues = 0;
  for (DataPage compressedPage : compressedPages) {
    totalValues += compressedPage.getValueCount();
  }
  this.valueCount = totalValues;
}
 
Example #20
Source File: MemPageStore.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a page reader over a copy of the page list written so far for a column.
 *
 * @param descriptor the column to read
 * @return a reader over the column's pages and dictionary page
 * @throws UnknownColumnException if no page writer is registered for the column
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  MemPageWriter writer = pageWriters.get(descriptor);
  if (writer == null) {
    throw new UnknownColumnException(descriptor);
  }
  // Iterate over a copy of the writer's page list, not the list itself.
  List<DataPage> pageSnapshot = new ArrayList<>(writer.getPages());
  LOG.debug("initialize page reader with {} values and {} pages", writer.getTotalValueCount(), pageSnapshot.size());
  return new MemPageReader(writer.getTotalValueCount(), pageSnapshot.iterator(), writer.getDictionaryPage());
}
 
Example #21
Source File: MemPageReader.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the next page from the underlying iterator.
 *
 * @return the next data page
 * @throws ParquetDecodingException if no pages remain
 */
@Override
public DataPage readPage() {
  if (!pages.hasNext()) {
    throw new ParquetDecodingException("after last page");
  }
  DataPage nextPage = pages.next();
  LOG.debug("read page {}", nextPage);
  return nextPage;
}
 
Example #22
Source File: ColumnIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Once the triples read reach the current page boundary, loads pages into the
 * page iterator until it has a next value or the page source returns null.
 */
private void advance() {
  if (triplesRead < advanceNextPageCount) {
    return;
  }

  while (!pageIterator.hasNext()) {
    DataPage nextPage = pageSource.readPage();
    if (nextPage == null) {
      // No more pages to load.
      return;
    }
    pageIterator.setPage(nextPage);
    this.advanceNextPageCount += pageIterator.currentPageCount();
  }
}
 
Example #23
Source File: ColumnChunkIncReadStore.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Seeks the input to the last recorded position, reads the next page through the
 * superclass, and records the new position.
 *
 * @return the page read by the superclass
 * @throws RuntimeException wrapping any IOException from seeking or reading the position
 */
@Override
public DataPage readPage() {
  try {
    in.seek(lastPosition);
    final DataPage page = super.readPage();
    // Remember where this read ended so the next call can seek back to it.
    lastPosition = in.getPos();
    return page;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
 
Example #24
Source File: TestColumnReaderImpl.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes only nulls to an optional binary column with the V2 writer, then checks
 * the page row/value counts and that reading back delivers no values to the converter.
 */
@Test
public void testOptional() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { optional binary foo; }");
  ColumnDescriptor column = schema.getColumns().get(0);
  MemPageWriter memWriter = new MemPageWriter();
  ParquetProperties properties = ParquetProperties.builder()
      .withDictionaryPageSize(1024).withWriterVersion(PARQUET_2_0)
      .withPageSize(2048).build();
  ColumnWriterV2 writer = new ColumnWriterV2(col(column), memWriter, properties);

  for (int row = 0; row < rows; row++) {
    writer.writeNull(0, 0);
    // Force a page boundary every 1000 rows.
    if ((row + 1) % 1000 == 0) {
      writer.writePage();
    }
  }
  writer.writePage();
  writer.finalizeColumnChunk();

  List<DataPage> writtenPages = memWriter.getPages();
  int totalValues = 0;
  int totalRows = 0;
  for (DataPage written : writtenPages) {
    totalValues += written.getValueCount();
    totalRows += ((DataPageV2) written).getRowCount();
  }
  assertEquals(rows, totalRows);
  assertEquals(rows, totalValues);

  MemPageReader memReader = new MemPageReader(rows, writtenPages.iterator(), memWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader reader = new ColumnReaderImpl(column, memReader, converter, VersionParser.parse(Version.FULL_VERSION));
  for (int row = 0; row < rows; row++) {
    assertEquals(0, reader.getCurrentRepetitionLevel());
    assertEquals(0, reader.getCurrentDefinitionLevel());
    reader.consume();
  }
  assertEquals(0, converter.count);
}
 
Example #25
Source File: TestColumnReaderImpl.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a required binary column with the V2 writer, then checks the page
 * row/value counts and that reading back delivers every value to the converter.
 */
@Test
public void test() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  ColumnDescriptor column = schema.getColumns().get(0);
  MemPageWriter memWriter = new MemPageWriter();
  ParquetProperties properties = ParquetProperties.builder()
      .withDictionaryPageSize(1024).withWriterVersion(PARQUET_2_0)
      .withPageSize(2048).build();
  ColumnWriterV2 writer = new ColumnWriterV2(column, memWriter, properties);

  for (int row = 0; row < rows; row++) {
    writer.write(Binary.fromString("bar" + row % 10), 0, 0);
    // Force a page boundary every 1000 rows.
    if ((row + 1) % 1000 == 0) {
      writer.writePage();
    }
  }
  writer.writePage();
  writer.finalizeColumnChunk();

  List<DataPage> writtenPages = memWriter.getPages();
  int totalValues = 0;
  int totalRows = 0;
  for (DataPage written : writtenPages) {
    totalValues += written.getValueCount();
    totalRows += ((DataPageV2) written).getRowCount();
  }
  assertEquals(rows, totalRows);
  assertEquals(rows, totalValues);

  MemPageReader memReader = new MemPageReader(rows, writtenPages.iterator(), memWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader reader = new ColumnReaderImpl(column, memReader, converter, VersionParser.parse(Version.FULL_VERSION));
  for (int row = 0; row < rows; row++) {
    assertEquals(0, reader.getCurrentRepetitionLevel());
    assertEquals(0, reader.getCurrentDefinitionLevel());
    reader.writeCurrentValueToConverter();
    reader.consume();
  }
  assertEquals(rows, converter.count);
}
 
Example #26
Source File: SynchronizingColumnReader.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Resets the row-tracking state for a newly loaded page.
 *
 * @param page the page just initialized; it must expose both a first row index
 *             and an index row count
 * @throws IllegalArgumentException if either value is absent
 */
@Override
protected void newPageInitialized(DataPage page) {
  long pageFirstRow = page.getFirstRowIndex()
      .orElseThrow(() -> new IllegalArgumentException("Missing firstRowIndex for synchronizing values"));
  int pageRowCount = page.getIndexRowCount()
      .orElseThrow(() -> new IllegalArgumentException("Missing rowCount for synchronizing values"));

  valuesReadFromPage = 0;
  // currentRow is set one before the page's first row.
  currentRow = pageFirstRow - 1;
  lastRowInPage = pageFirstRow + pageRowCount - 1;
}
 
Example #27
Source File: PageIterator.java    From iceberg with Apache License 2.0 4 votes vote down vote up
/**
 * Sets the page through the superclass and then advances this iterator.
 *
 * @param page the page to read from
 */
@Override
public void setPage(DataPage page) {
  super.setPage(page);
  // Advance only after the superclass has installed the new page.
  advance();
}
 
Example #28
Source File: AbstractColumnReader.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Reads {@code readNumber} values from this column reader into {@code vector},
 * loading a new page whenever the current one has no values left.
 *
 * @param readNumber number of values to read
 * @param vector destination vector; values are written starting at row 0
 * @throws IOException declared by this method; not thrown directly in this body
 */
@Override
public final void readToVector(int readNumber, VECTOR vector) throws IOException {
	int rowId = 0;
	WritableIntVector dictionaryIds = null;
	if (dictionary != null) {
		dictionaryIds = vector.reserveDictionaryIds(readNumber);
	}
	while (readNumber > 0) {
		// Compute the number of values we want to read in this page.
		int leftInPage = (int) (endOfPageValueCount - valuesRead);
		if (leftInPage == 0) {
			// Current page is exhausted: load the next one and dispatch on its version.
			DataPage page = pageReader.readPage();
			if (page instanceof DataPageV1) {
				readPageV1((DataPageV1) page);
			} else if (page instanceof DataPageV2) {
				readPageV2((DataPageV2) page);
			} else {
				// NOTE(review): a null page would fail on page.getClass() here — confirm readPage() never returns null.
				throw new RuntimeException("Unsupported page type: " + page.getClass());
			}
			leftInPage = (int) (endOfPageValueCount - valuesRead);
		}
		// Never read past the end of the current page in one step.
		int num = Math.min(readNumber, leftInPage);
		if (isCurrentPageDictionaryEncoded) {
			// Read and decode dictionary ids.
			runLenDecoder.readDictionaryIds(
					num, dictionaryIds, vector, rowId, maxDefLevel, this.dictionaryIdsDecoder);

			if (vector.hasDictionary() || (rowId == 0 && supportLazyDecode())) {
				// Column vector supports lazy decoding of dictionary values so just set the dictionary.
				// We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e. some
				// non-dictionary encoded values have already been added).
				vector.setDictionary(new ParquetDictionary(dictionary));
			} else {
				readBatchFromDictionaryIds(rowId, num, vector, dictionaryIds);
			}
		} else {
			if (vector.hasDictionary() && rowId != 0) {
				// This batch already has dictionary encoded values but this new page is not. The batch
				// does not support a mix of dictionary and not so we will decode the dictionary.
				readBatchFromDictionaryIds(0, rowId, vector, vector.getDictionaryIds());
			}
			vector.setDictionary(null);
			readBatch(rowId, num, vector);
		}

		// Advance the page cursor, the output row cursor and the remaining count together.
		valuesRead += num;
		rowId += num;
		readNumber -= num;
	}
}
 
Example #29
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the single data page held by this reader.
 *
 * @return the data page given at construction; every call returns the same page
 */
@Override
public DataPage readPage() {
  return this.data;
}
 
Example #30
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a page reader over one dictionary page and one data page.
 *
 * @param dict the dictionary page to store
 * @param data the data page to store
 */
public SingletonPageReader(DictionaryPage dict, DataPage data) {
  this.data = data;
  this.dict = dict;
}