Java Code Examples for org.apache.parquet.column.page.DataPage

The following examples show how to use org.apache.parquet.column.page.DataPage. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: iceberg   Source File: BasePageIterator.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Points this iterator at {@code page} and resets its read position.
 *
 * @param page the data page to read from; rejected by the precondition check when null
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // Dispatch on the concrete page version. The visitor result is never used,
  // so both branches return null.
  DataPage.Visitor<ValuesReader> pageInitializer = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }
  };
  this.page.accept(pageInitializer);

  // Start counting triples from the beginning of the new page.
  this.triplesRead = 0;
  this.hasNext = triplesRead < triplesCount;
}
 
Example 2
Source Project: iceberg   Source File: PageIterator.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Points this iterator at {@code page}, resets the triple counter and advances.
 *
 * @param page the data page to read from; rejected by the precondition check when null
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // Dispatch on the concrete page version. The visitor result is never used,
  // so both branches return null.
  DataPage.Visitor<ValuesReader> pageInitializer = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }
  };
  this.page.accept(pageInitializer);

  this.triplesRead = 0;
  advance();
}
 
Example 3
Source Project: parquet-mr   Source File: ColumnReaderBase.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Fetches the next page from {@code pageReader} and forwards it to the
 * version-specific read routine ({@code readPageV1} or {@code readPageV2}).
 */
private void readPage() {
  LOG.debug("loading page");
  final DataPage nextPage = pageReader.readPage();

  // The visitor exists only for its side effect, hence the Void result.
  final DataPage.Visitor<Void> versionDispatcher = new DataPage.Visitor<Void>() {
    @Override
    public Void visit(DataPageV1 v1Page) {
      readPageV1(v1Page);
      return null;
    }

    @Override
    public Void visit(DataPageV2 v2Page) {
      readPageV2(v2Page);
      return null;
    }
  };
  nextPage.accept(versionDispatcher);
}
 
Example 4
Source Project: parquet-mr   Source File: TestMemPageStore.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes four pages for a single INT64 column into a {@code MemPageStore}, then
 * reads pages back until the summed value counts reach the total reported by
 * the page reader. Pages and the total are printed; nothing is asserted yet.
 */
@Test
public void test() throws IOException {
  MemPageStore store = new MemPageStore(10);
  ColumnDescriptor column = new ColumnDescriptor(path , PrimitiveTypeName.INT64, 2, 2);
  LongStatistics stats = new LongStatistics();
  PageWriter writer = store.getPageWriter(column);

  // Byte lengths of the four pages, in write order; each page declares 209 values.
  final int[] pageByteSizes = {735, 743, 743, 735};
  for (int byteSize : pageByteSizes) {
    writer.writePage(BytesInput.from(new byte[byteSize]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
  }

  PageReader reader = store.getPageReader(column);
  long expectedValueCount = reader.getTotalValueCount();
  System.out.println(expectedValueCount);

  int valuesSeen = 0;
  do {
    DataPage current = reader.readPage();
    valuesSeen += current.getValueCount();
    System.out.println(current);
    // TODO: assert
  } while (valuesSeen < expectedValueCount);
}
 
Example 5
Source Project: parquet-mr   Source File: FileEncodingsIT.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Validates every column of every row group in {@code file} against the
 * matching slice of {@code expectedValues}, checking the pages of each column
 * first-to-last and then last-to-first.
 *
 * @param file the file whose row groups and schema are read
 * @param expectedValues expected values; each row group is compared with the
 *     sub-list starting at the number of rows consumed by earlier row groups
 * @throws IOException declared by this method; NOTE(review): presumably raised while reading the file — confirm
 */
public static void validatePages(Path file, List<?> expectedValues) throws IOException {
  List<PageReadStore> rowGroups = readBlocksFromFile(file);
  MessageType schema = readSchemaFromFile(file);

  int rowGroupID = 0;
  int rowsRead = 0;
  for (PageReadStore rowGroup : rowGroups) {
    // Exclusive end of this row group's slice; also the start of the next one.
    final int rowGroupEnd = (int) (rowsRead + rowGroup.getRowCount());

    for (ColumnDescriptor column : schema.getColumns()) {
      List<DataPage> pages = getPageGroupForColumn(rowGroup, column);
      DictionaryPage dictionary = reusableCopy(getDictionaryPageForColumn(rowGroup, column));

      List<?> expectedRowGroupValues = expectedValues.subList(rowsRead, rowGroupEnd);
      validateFirstToLast(rowGroupID, dictionary, pages, column, expectedRowGroupValues);
      validateLastToFirst(rowGroupID, dictionary, pages, column, expectedRowGroupValues);
    }

    rowsRead = rowGroupEnd;
    rowGroupID++;
  }
}
 
Example 6
Source Project: parquet-mr   Source File: CheckParquet251Command.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Returns the statistics carried by {@code page}, whichever page version it is.
 *
 * @param page the data page to inspect
 * @param <T> the comparable value type the caller expects; the cast to it is unchecked
 * @return the result of {@code getStatistics()} on the concrete page
 */
private static <T extends Comparable<T>>
Statistics<T> getStatisticsFromPageHeader(DataPage page) {
  DataPage.Visitor<Statistics<T>> statsExtractor = new DataPage.Visitor<Statistics<T>>() {
    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV1 v1Page) {
      return (Statistics<T>) v1Page.getStatistics();
    }

    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV2 v2Page) {
      return (Statistics<T>) v2Page.getStatistics();
    }
  };
  return page.accept(statsExtractor);
}
 
Example 7
Source Project: iceberg   Source File: BaseColumnIterator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Loads further pages into the page iterator once the triples read so far have
 * reached {@code advanceNextPageCount}. Pages are pulled from {@code pageSource}
 * until the iterator has a next value or the source returns null.
 */
protected void advance() {
  if (triplesRead < advanceNextPageCount) {
    // Threshold not reached yet; nothing to load.
    return;
  }

  BasePageIterator pageIterator = pageIterator();
  while (!pageIterator.hasNext()) {
    DataPage nextPage = pageSource.readPage();
    if (nextPage == null) {
      // No more pages available from the source.
      return;
    }
    pageIterator.setPage(nextPage);
    this.advanceNextPageCount += pageIterator.currentPageCount();
  }
}
 
Example 8
Source Project: iceberg   Source File: ColumnIterator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Loads further pages into {@code pageIterator} once the triples read so far
 * have reached {@code advanceNextPageCount}. Pages are pulled from
 * {@code pageSource} until the iterator has a next value or the source returns null.
 */
private void advance() {
  if (triplesRead < advanceNextPageCount) {
    // Threshold not reached yet; nothing to load.
    return;
  }

  while (!pageIterator.hasNext()) {
    DataPage nextPage = pageSource.readPage();
    if (nextPage == null) {
      // No more pages available from the source.
      return;
    }
    pageIterator.setPage(nextPage);
    this.advanceNextPageCount += pageIterator.currentPageCount();
  }
}
 
Example 9
Source Project: dremio-oss   Source File: ColumnChunkIncReadStore.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Seeks {@code in} to {@code lastPosition}, reads one page through the
 * superclass, and stores the stream position reached afterwards back into
 * {@code lastPosition}.
 *
 * @return the page produced by {@code super.readPage()}
 * @throws RuntimeException wrapping any {@link IOException} raised while seeking or reading
 */
@Override
public DataPage readPage() {
  try {
    in.seek(lastPosition);
    final DataPage pageRead = super.readPage();
    // Record where this read stopped so the next call seeks back to it.
    lastPosition = in.getPos();
    return pageRead;
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
}
 
Example 10
Source Project: parquet-mr   Source File: SynchronizingColumnReader.java    License: Apache License 2.0 5 votes vote down vote up
@Override
protected void newPageInitialized(DataPage page) {
  long firstRowIndex = page.getFirstRowIndex()
      .orElseThrow(() -> new IllegalArgumentException("Missing firstRowIndex for synchronizing values"));
  int rowCount = page.getIndexRowCount()
      .orElseThrow(() -> new IllegalArgumentException("Missing rowCount for synchronizing values"));
  currentRow = firstRowIndex - 1;
  lastRowInPage = firstRowIndex + rowCount - 1;
  valuesReadFromPage = 0;
}
 
Example 11
Source Project: parquet-mr   Source File: TestColumnReaderImpl.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes {@code rows} binary values to a required column with a V2 column
 * writer, flushing a page after every 1000 values, and checks that the written
 * pages account for {@code rows} rows and values. The pages are then read back
 * through {@code ColumnReaderImpl}, expecting repetition and definition level 0
 * for every value and {@code rows} values delivered to the converter.
 */
@Test
public void test() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  MemPageWriter pageWriter = new MemPageWriter();
  ParquetProperties writerProps = ParquetProperties.builder()
      .withDictionaryPageSize(1024)
      .withWriterVersion(PARQUET_2_0)
      .withPageSize(2048)
      .build();
  ColumnWriterV2 writer = new ColumnWriterV2(col, pageWriter, writerProps);

  for (int row = 0; row < rows; row++) {
    writer.write(Binary.fromString("bar" + row % 10), 0, 0);
    boolean pageBoundary = (row + 1) % 1000 == 0;
    if (pageBoundary) {
      writer.writePage();
    }
  }
  // Flush whatever is left after the last boundary, then close the chunk.
  writer.writePage();
  writer.finalizeColumnChunk();

  List<DataPage> pages = pageWriter.getPages();
  int totalValues = 0;
  int totalRows = 0;
  for (DataPage written : pages) {
    totalValues += written.getValueCount();
    totalRows += ((DataPageV2)written).getRowCount();
  }
  assertEquals(rows, totalRows);
  assertEquals(rows, totalValues);

  MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader columnReader = new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  int remaining = rows;
  while (remaining-- > 0) {
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    columnReader.writeCurrentValueToConverter();
    columnReader.consume();
  }
  assertEquals(rows, converter.count);
}
 
Example 12
Source Project: parquet-mr   Source File: TestColumnReaderImpl.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes {@code rows} nulls to an optional binary column with a V2 column
 * writer, flushing a page after every 1000 entries, and checks that the written
 * pages account for {@code rows} rows and values. Reading the pages back must
 * report repetition and definition level 0 for every entry and deliver no
 * values to the converter.
 */
@Test
public void testOptional() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { optional binary foo; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  MemPageWriter pageWriter = new MemPageWriter();
  ParquetProperties writerProps = ParquetProperties.builder()
      .withDictionaryPageSize(1024)
      .withWriterVersion(PARQUET_2_0)
      .withPageSize(2048)
      .build();
  ColumnWriterV2 writer = new ColumnWriterV2(col, pageWriter, writerProps);

  for (int row = 0; row < rows; row++) {
    writer.writeNull(0, 0);
    boolean pageBoundary = (row + 1) % 1000 == 0;
    if (pageBoundary) {
      writer.writePage();
    }
  }
  // Flush whatever is left after the last boundary, then close the chunk.
  writer.writePage();
  writer.finalizeColumnChunk();

  List<DataPage> pages = pageWriter.getPages();
  int totalValues = 0;
  int totalRows = 0;
  for (DataPage written : pages) {
    totalValues += written.getValueCount();
    totalRows += ((DataPageV2)written).getRowCount();
  }
  assertEquals(rows, totalRows);
  assertEquals(rows, totalValues);

  MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader columnReader = new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  int remaining = rows;
  while (remaining-- > 0) {
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    columnReader.consume();
  }
  assertEquals(0, converter.count);
}
 
Example 13
Source Project: parquet-mr   Source File: MemPageReader.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public DataPage readPage() {
  if (pages.hasNext()) {
    DataPage next = pages.next();
    LOG.debug("read page {}", next);
    return next;
  } else {
    throw new ParquetDecodingException("after last page");
  }
}
 
Example 14
Source Project: parquet-mr   Source File: MemPageStore.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds a page reader over a snapshot of the pages written for {@code descriptor}.
 *
 * @param descriptor the column whose pages should be read
 * @return a {@code MemPageReader} iterating over a copy of the writer's page list
 * @throws UnknownColumnException if no page writer is registered for {@code descriptor}
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  final MemPageWriter writer = pageWriters.get(descriptor);
  if (writer == null) {
    throw new UnknownColumnException(descriptor);
  }

  // Copy the list so the reader iterates over its own snapshot.
  final List<DataPage> snapshot = new ArrayList<>(writer.getPages());
  LOG.debug("initialize page reader with {} values and {} pages", writer.getTotalValueCount(), snapshot.size());
  return new MemPageReader(
      writer.getTotalValueCount(),
      snapshot.iterator(),
      writer.getDictionaryPage());
}
 
Example 15
Source Project: parquet-mr   Source File: ColumnChunkPageReadStore.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a reader over the given compressed pages of one column chunk.
 *
 * @param decompressor stored for later use by this reader
 * @param compressedPages data pages, copied into an internal deque; their value counts are summed into {@code valueCount}
 * @param compressedDictionaryPage stored as-is
 * @param offsetIndex stored as-is
 * @param rowCount stored as-is
 */
ColumnChunkPageReader(BytesInputDecompressor decompressor, List<DataPage> compressedPages,
    DictionaryPage compressedDictionaryPage, OffsetIndex offsetIndex, long rowCount) {
  this.decompressor = decompressor;
  this.compressedPages = new ArrayDeque<DataPage>(compressedPages);
  this.compressedDictionaryPage = compressedDictionaryPage;
  // Total number of values across all supplied pages, accumulated as a long.
  this.valueCount = compressedPages.stream()
      .mapToLong(DataPage::getValueCount)
      .sum();
  this.offsetIndex = offsetIndex;
  this.rowCount = rowCount;
}
 
Example 16
Source Project: parquet-mr   Source File: FileEncodingsIT.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Validates the pages of {@code pageGroup} in list order, pairing each page
 * with the consecutive slice of {@code expectedValues} whose length equals the
 * page's value count.
 */
private static void validateFirstToLast(int rowGroupID, DictionaryPage dictPage, List<DataPage> pageGroup, ColumnDescriptor desc, List<?> expectedValues) {
  int sliceStart = 0;
  int pageID = 0;
  for (DataPage page : pageGroup) {
    final int sliceEnd = sliceStart + page.getValueCount();
    List<?> expectedPageValues = expectedValues.subList(sliceStart, sliceEnd);
    PageValuesValidator.validateValuesForPage(rowGroupID, pageID, dictPage, page, desc, expectedPageValues);
    sliceStart = sliceEnd;
    pageID++;
  }
}
 
Example 17
Source Project: parquet-mr   Source File: FileEncodingsIT.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Validates the pages of {@code pageGroup} from the last page to the first,
 * pairing each page with the slice of {@code expectedValues} that ends where
 * the previously validated page's slice began.
 */
private static void validateLastToFirst(int rowGroupID, DictionaryPage dictPage, List<DataPage> pageGroup, ColumnDescriptor desc, List<?> expectedValues) {
  // Exclusive end of the slice belonging to the page currently being validated.
  int sliceEnd = expectedValues.size();
  int pageID = pageGroup.size() - 1;
  while (pageID >= 0) {
    DataPage page = pageGroup.get(pageID);
    final int sliceStart = sliceEnd - page.getValueCount();
    List<?> expectedPageValues = expectedValues.subList(sliceStart, sliceEnd);
    PageValuesValidator.validateValuesForPage(rowGroupID, pageID, dictPage, page, desc, expectedPageValues);
    sliceEnd = sliceStart;
    pageID--;
  }
}
 
Example 18
Source Project: parquet-mr   Source File: FileEncodingsIT.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Collects reusable copies of every data page the store yields for a column.
 *
 * @param pageReadStore the store providing the column's page reader
 * @param columnDescriptor the column to read
 * @return the copied pages in read order; reading stops at the first null page
 */
private static List<DataPage> getPageGroupForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  final PageReader reader = pageReadStore.getPageReader(columnDescriptor);
  final List<DataPage> collected = new ArrayList<DataPage>();
  for (DataPage current = reader.readPage(); current != null; current = reader.readPage()) {
    collected.add(reusableCopy(current));
  }
  return collected;
}
 
Example 19
Source Project: parquet-mr   Source File: FileEncodingsIT.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads every value of a single page through a {@code ColumnReaderImpl},
 * passing each one to a converter built from {@code expectedValues}.
 * NOTE(review): the comparison itself presumably happens inside that converter — confirm in {@code getConverter}.
 */
public static void validateValuesForPage(int rowGroupID, int pageID, DictionaryPage dictPage, DataPage page, ColumnDescriptor columnDesc, List<?> expectedValues) {
  TestStatistics.SingletonPageReader singlePageReader = new TestStatistics.SingletonPageReader(dictPage, page);
  PrimitiveConverter checkingConverter = getConverter(rowGroupID, pageID, columnDesc.getType(), expectedValues);
  ColumnReaderImpl column = new ColumnReaderImpl(columnDesc, singlePageReader, checkingConverter, null);

  int consumed = 0;
  while (consumed < singlePageReader.getTotalValueCount()) {
    column.writeCurrentValueToConverter();
    column.consume();
    consumed += 1;
  }
}
 
Example 20
Source Project: parquet-mr   Source File: TestStatistics.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns the statistics carried by {@code page}, whichever page version it is.
 *
 * @param page the data page to inspect
 * @param <T> the comparable value type the caller expects; the cast to it is unchecked
 * @return the result of {@code getStatistics()} on the concrete page
 */
private static <T extends Comparable<T>> Statistics<T> getStatisticsFromPageHeader(DataPage page) {
  DataPage.Visitor<Statistics<T>> statsExtractor = new DataPage.Visitor<Statistics<T>>() {
    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV1 v1Page) {
      return (Statistics<T>) v1Page.getStatistics();
    }

    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV2 v2Page) {
      return (Statistics<T>) v2Page.getStatistics();
    }
  };
  return page.accept(statsExtractor);
}
 
Example 21
Source Project: parquet-mr   Source File: TestStatistics.java    License: Apache License 2.0 5 votes vote down vote up
public StatsValidator(DataPage page) {
  Statistics<T> stats = getStatisticsFromPageHeader(page);
  this.comparator = stats.comparator();
  this.hasNonNull = stats.hasNonNullValue();
  if (hasNonNull) {
    this.min = stats.genericGetMin();
    this.max = stats.genericGetMax();
  } else {
    this.min = null;
    this.max = null;
  }
}
 
Example 22
Source Project: parquet-mr   Source File: TestStatistics.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Validates the statistics of every data page of every column in {@code schema}.
 *
 * @param schema supplies the columns to visit
 * @param store supplies a page reader per column; each reader is drained until it returns a null page
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    final PageReader reader = store.getPageReader(column);
    // The dictionary page is read once and shared by all data pages of the column.
    final DictionaryPage dictionary = reader.readDictionaryPage();
    for (DataPage current = reader.readPage(); current != null; current = reader.readPage()) {
      validateStatsForPage(current, dictionary, column);
    }
  }
}
 
Example 23
Source Project: parquet-mr   Source File: TestStatistics.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks one page's statistics: the comparator class must match the column's
 * primitive type, and unless the statistics are empty, the null count recorded
 * in them must equal the number of entries whose definition level is below the
 * column maximum.
 */
private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDescriptor desc) {
  SingletonPageReader singlePageReader = new SingletonPageReader(dict, page);
  PrimitiveConverter validatingConverter = getValidatingConverter(page, desc.getType());
  Statistics<?> pageStats = getStatisticsFromPageHeader(page);

  assertEquals("Statistics does not use the proper comparator",
      desc.getPrimitiveType().comparator().getClass(),
      pageStats.comparator().getClass());

  // stats are empty if num nulls = 0 and there are no non-null values
  // this happens if stats are not written (e.g., when stats are too big)
  if (pageStats.isEmpty()) {
    return;
  }

  ColumnReaderImpl column = new ColumnReaderImpl(desc, singlePageReader, validatingConverter, null);
  long nullsSeen = 0;
  int position = 0;
  while (position < singlePageReader.getTotalValueCount()) {
    boolean isDefined = column.getCurrentDefinitionLevel() >= desc.getMaxDefinitionLevel();
    if (isDefined) {
      column.writeCurrentValueToConverter();
    } else {
      nullsSeen += 1;
    }
    column.consume();
    position += 1;
  }

  Assert.assertEquals(nullsSeen, pageStats.getNumNulls());
}
 
Example 24
Source Project: parquet-mr   Source File: CheckParquet251Command.java    License: Apache License 2.0 5 votes vote down vote up
public StatsValidator(DataPage page) {
  Statistics<T> stats = getStatisticsFromPageHeader(page);
  this.comparator = stats.comparator();
  this.hasNonNull = stats.hasNonNullValue();
  if (hasNonNull) {
    this.min = stats.genericGetMin();
    this.max = stats.genericGetMax();
  } else {
    this.min = null;
    this.max = null;
  }
}
 
Example 25
Source Project: parquet-mr   Source File: CheckParquet251Command.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads every entry of a single page, counts the entries whose definition
 * level is below the column maximum, and compares that count with the null
 * count recorded in the page statistics.
 *
 * @param page the data page to validate
 * @param dict the dictionary page handed to the single-page reader together with {@code page}
 * @param desc descriptor of the column the page belongs to
 * @throws BadStatsException if the counted nulls differ from the statistics' null count
 */
private void validateStatsForPage(DataPage page, DictionaryPage dict,
                                  ColumnDescriptor desc) {
  SingletonPageReader reader = new SingletonPageReader(dict, page);
  PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
  // Wildcard instead of the raw type: only type-agnostic accessors are used below.
  Statistics<?> stats = getStatisticsFromPageHeader(page);

  long numNulls = 0;

  ColumnReader column = COL_READER_CTOR.newInstance(desc, reader, converter, null);
  for (int i = 0; i < reader.getTotalValueCount(); i += 1) {
    if (column.getCurrentDefinitionLevel() >= desc.getMaxDefinitionLevel()) {
      // Defined value: forward it to the validating converter.
      column.writeCurrentValueToConverter();
    } else {
      numNulls += 1;
    }
    column.consume();
  }

  if (numNulls != stats.getNumNulls()) {
    throw new BadStatsException("Number of nulls doesn't match.");
  }

  console.debug(String.format(
      "Validated stats min=%s max=%s nulls=%d for page=%s col=%s",
      stats.minAsString(),
      stats.maxAsString(), stats.getNumNulls(), page,
      Arrays.toString(desc.getPath())));
}
 
Example 26
Source Project: parquet-mr   Source File: ShowPagesCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Formats {@code page} and then increments {@code pageNum}.
 *
 * @param page a dictionary page, a data page, or any other page
 * @return the dictionary or data page rendering, or an empty string for any other page type
 */
String format(Page page) {
  final String rendered;
  if (page instanceof DictionaryPage) {
    rendered = printDictionaryPage((DictionaryPage) page);
  } else if (page instanceof DataPage) {
    // Data pages are rendered by this object acting as the page visitor.
    rendered = ((DataPage) page).accept(this);
  } else {
    rendered = "";
  }
  // The counter moves on for every call, whatever the page type.
  pageNum += 1;
  return rendered;
}
 
Example 27
Source Project: iceberg   Source File: PageIterator.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Delegates to the superclass to install {@code page}, then calls
 * {@code advance()}.
 *
 * @param page the data page to read from
 */
@Override
public void setPage(DataPage page) {
  super.setPage(page);
  // Runs only after the superclass has finished installing the page.
  advance();
}
 
Example 28
Source Project: flink   Source File: AbstractColumnReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Reads {@code readNumber} values from this column reader into {@code vector}.
 *
 * <p>Values are consumed in chunks bounded by what is left in the current page;
 * a new page is fetched from {@code pageReader} whenever the current one is
 * exhausted. Each chunk is decoded either through dictionary ids or through
 * {@code readBatch}, depending on {@code isCurrentPageDictionaryEncoded}.
 *
 * @param readNumber number of values to read; the loop runs until it has counted down to zero
 * @param vector destination vector; written starting at row 0
 * @throws IOException declared by this method; NOTE(review): presumably raised by the page or batch reads — confirm
 */
@Override
public final void readToVector(int readNumber, VECTOR vector) throws IOException {
	// Next row of the vector to fill.
	int rowId = 0;
	// Stays null when this reader has no dictionary.
	WritableIntVector dictionaryIds = null;
	if (dictionary != null) {
		dictionaryIds = vector.reserveDictionaryIds(readNumber);
	}
	while (readNumber > 0) {
		// Compute the number of values we want to read in this page.
		int leftInPage = (int) (endOfPageValueCount - valuesRead);
		if (leftInPage == 0) {
			// Current page is exhausted: load the next one and dispatch on its version.
			DataPage page = pageReader.readPage();
			if (page instanceof DataPageV1) {
				readPageV1((DataPageV1) page);
			} else if (page instanceof DataPageV2) {
				readPageV2((DataPageV2) page);
			} else {
				// NOTE(review): a null page would fail here with a NullPointerException from
				// page.getClass() rather than this message — confirm readPage() never returns null.
				throw new RuntimeException("Unsupported page type: " + page.getClass());
			}
			// Recomputed after loading the page, from the same two counters as above.
			leftInPage = (int) (endOfPageValueCount - valuesRead);
		}
		// Never read past the end of the page or beyond what the caller asked for.
		int num = Math.min(readNumber, leftInPage);
		if (isCurrentPageDictionaryEncoded) {
			// Read and decode dictionary ids.
			runLenDecoder.readDictionaryIds(
					num, dictionaryIds, vector, rowId, maxDefLevel, this.dictionaryIdsDecoder);

			if (vector.hasDictionary() || (rowId == 0 && supportLazyDecode())) {
				// Column vector supports lazy decoding of dictionary values so just set the dictionary.
				// We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e. some
				// non-dictionary encoded values have already been added).
				vector.setDictionary(new ParquetDictionary(dictionary));
			} else {
				readBatchFromDictionaryIds(rowId, num, vector, dictionaryIds);
			}
		} else {
			if (vector.hasDictionary() && rowId != 0) {
				// This batch already has dictionary encoded values but this new page is not. The batch
				// does not support a mix of dictionary and not so we will decode the dictionary.
				readBatchFromDictionaryIds(0, rowId, vector, vector.getDictionaryIds());
			}
			vector.setDictionary(null);
			readBatch(rowId, num, vector);
		}

		// Account for the chunk just read in all three counters.
		valuesRead += num;
		rowId += num;
		readNumber -= num;
	}
}
 
Example 29
Source Project: parquet-mr   Source File: ColumnReaderImpl.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * No-op override: this implementation does nothing with the supplied page.
 *
 * @param page ignored
 */
@Override
void newPageInitialized(DataPage page) {
  // No work is performed here.
}
 
Example 30
Source Project: parquet-mr   Source File: MemPageWriter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the data pages held by this writer.
 *
 * @return the {@code pages} list itself, not a copy
 */
public List<DataPage> getPages() {
  return this.pages;
}