org.apache.parquet.column.page.DictionaryPage Java Examples

The following examples show how to use org.apache.parquet.column.page.DictionaryPage. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PageReader.java    From Bats with Apache License 2.0 7 votes vote down vote up
/**
 * Reads the dictionary page described by {@code pageHeader} and stores the decoded
 * dictionary in {@code this.dictionary}.
 *
 * @param pageHeader   header of the dictionary page; supplies sizes, value count and encoding
 * @param parentStatus column reader whose column descriptor is used to initialise the dictionary
 * @throws IOException if reading the page or initialising the dictionary fails
 */
private void readDictionaryPage(final PageHeader pageHeader,
                                final ColumnReader<?> parentStatus) throws IOException {
  final int compressedLength = pageHeader.getCompressed_page_size();
  final int uncompressedLength = pageHeader.getUncompressed_page_size();

  // The buffer is registered with the other dictionary buffers
  // (presumably so it can be released together with them later).
  final DrillBuf pageBuffer = readPage(pageHeader, compressedLength, uncompressedLength);
  allocatedDictionaryBuffers.add(pageBuffer);

  final DictionaryPage dictPage = new DictionaryPage(
      asBytesInput(pageBuffer, 0, uncompressedLength),
      pageHeader.uncompressed_page_size,
      pageHeader.dictionary_page_header.num_values,
      valueOf(pageHeader.dictionary_page_header.encoding.name()));

  this.dictionary = dictPage.getEncoding().initDictionary(parentStatus.columnDescriptor, dictPage);
}
 
Example #2
Source File: AsyncPageReader.java    From Bats with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the dictionary from the page carried by {@code readStatus} and adds the time
 * spent decoding it to {@code stats.timeDictPageDecode}.
 *
 * @param readStatus   read result providing the page header and page data
 * @param parentStatus column reader whose column descriptor is used to initialise the dictionary
 * @throws UserException declared by this method; any exception raised while decoding is
 *                       passed to {@code handleAndThrowException}
 */
private void readDictionaryPageData(final ReadStatus readStatus, final ColumnReader<?> parentStatus)
    throws UserException {
  try {
    pageHeader = readStatus.getPageHeader();
    final int pageLength = pageHeader.getUncompressed_page_size();
    final DrillBuf pageBuffer = getDecompressedPageData(readStatus);

    // The timer starts only after the decompressed data has been obtained.
    final Stopwatch decodeTimer = Stopwatch.createStarted();
    allocatedDictionaryBuffers.add(pageBuffer);

    final DictionaryPage dictPage = new DictionaryPage(
        asBytesInput(pageBuffer, 0, pageLength),
        pageHeader.uncompressed_page_size,
        pageHeader.dictionary_page_header.num_values,
        valueOf(pageHeader.dictionary_page_header.encoding.name()));
    this.dictionary = dictPage.getEncoding().initDictionary(parentStatus.columnDescriptor, dictPage);

    stats.timeDictPageDecode.addAndGet(decodeTimer.elapsed(TimeUnit.NANOSECONDS));
  } catch (Exception e) {
    handleAndThrowException(e, "Error decoding dictionary page.");
  }
}
 
Example #3
Source File: ColumnWriterBase.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Finalizes the column chunk, writing any extra pages that are still pending:
 * the dictionary page (if the data column produced one) and the bloom filter
 * (if both a writer and a filter are set).
 * Is called right after writePage.
 */
void finalizeColumnChunk() {
  final DictionaryPage dictPage = dataColumn.toDictPageAndClose();
  if (dictPage != null) {
    if (DEBUG) {
      LOG.debug("write dictionary");
    }
    try {
      pageWriter.writeDictionaryPage(dictPage);
    } catch (IOException e) {
      throw new ParquetEncodingException("could not write dictionary page for " + path, e);
    }
    // Reset only after the page has been handed to the page writer.
    dataColumn.resetDictionary();
  }

  final boolean hasBloomFilter = bloomFilterWriter != null && bloomFilter != null;
  if (hasBloomFilter) {
    bloomFilterWriter.writeBloomFilter(bloomFilter);
  }
}
 
Example #4
Source File: ColumnReaderBase.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a reader for triplets.
 * <p>
 * The dictionary page, if the page reader has one, is decoded here and handed to the
 * converter when the converter reports dictionary support.
 *
 * @param path the descriptor for the corresponding column; must not be null
 * @param pageReader the underlying store to read from; must not be null
 * @param converter a converter that materializes the values in this column in the current record;
 *                  must not be null
 * @param writerVersion writer version string from the Parquet file being read
 * @throws ParquetDecodingException if the dictionary cannot be decoded or the total
 *                                  value count is not positive
 */
ColumnReaderBase(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) {
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.pageReader = Objects.requireNonNull(pageReader, "pageReader cannot be null");
  this.converter = Objects.requireNonNull(converter, "converter cannot be null");
  this.writerVersion = writerVersion;
  this.maxDefinitionLevel = path.getMaxDefinitionLevel();

  final DictionaryPage dictPage = pageReader.readDictionaryPage();
  if (dictPage == null) {
    this.dictionary = null;
  } else {
    try {
      this.dictionary = dictPage.getEncoding().initDictionary(path, dictPage);
      if (converter.hasDictionarySupport()) {
        converter.setDictionary(dictionary);
      }
    } catch (IOException e) {
      throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
    }
  }

  this.totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount <= 0) {
    throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0");
  }
}
 
Example #5
Source File: ColumnChunkPageReadStore.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Returns a decompressed copy of the stored dictionary page, carrying over its CRC when
 * one is present.
 *
 * @return the decompressed dictionary page, or {@code null} if there is no dictionary page
 * @throws ParquetDecodingException if decompression fails with an {@link IOException}
 */
@Override
public DictionaryPage readDictionaryPage() {
  if (compressedDictionaryPage == null) {
    return null;
  }
  try {
    final DictionaryPage result = new DictionaryPage(
      decompressor.decompress(
          compressedDictionaryPage.getBytes(),
          compressedDictionaryPage.getUncompressedSize()),
      compressedDictionaryPage.getDictionarySize(),
      compressedDictionaryPage.getEncoding());
    // Propagate the checksum only when the compressed page has one.
    compressedDictionaryPage.getCrc().ifPresent(result::setCrc);
    return result;
  } catch (IOException e) {
    throw new ParquetDecodingException("Could not decompress dictionary page", e);
  }
}
 
Example #6
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Reads and decompresses a dictionary page for the given column chunk.
 *
 * Returns null if the given column chunk has no dictionary page: either its encodings
 * contain neither PLAIN_DICTIONARY nor RLE_DICTIONARY, or the first page header at the
 * chunk's starting position is not a dictionary page header.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  final boolean dictionaryEncoded =
      meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY)
          || meta.getEncodings().contains(Encoding.RLE_DICTIONARY);
  if (!dictionaryEncoded) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  final long chunkStart = meta.getStartingPos();
  if (f.getPos() != chunkStart) {
    f.seek(chunkStart);
  }

  final PageHeader header = Util.readPageHeader(f);
  if (!header.isSetDictionary_page_header()) {
    return null; // TODO: should this complain?
  }

  final DictionaryPage compressed = readCompressedDictionary(header, f);
  final BytesInputDecompressor decompressor =
      options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      decompressor.decompress(compressed.getBytes(), compressed.getUncompressedSize()),
      compressed.getDictionarySize(),
      compressed.getEncoding());
}
 
Example #7
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the raw dictionary page bytes that follow {@code pageHeader} on {@code fin}
 * and wraps them in a {@link DictionaryPage} without decompressing them.
 *
 * @param pageHeader header of the dictionary page, already read from the stream
 * @param fin        stream to read the page bytes from
 * @return a dictionary page holding the bytes exactly as read from the stream
 * @throws IOException if the page bytes cannot be fully read
 */
private DictionaryPage readCompressedDictionary(
    PageHeader pageHeader, SeekableInputStream fin) throws IOException {
  final DictionaryPageHeader header = pageHeader.getDictionary_page_header();
  final int uncompressedLength = pageHeader.getUncompressed_page_size();

  // Exactly compressed_page_size bytes are consumed from the stream.
  final byte[] payload = new byte[pageHeader.getCompressed_page_size()];
  fin.readFully(payload);

  return new DictionaryPage(
      BytesInput.from(payload),
      uncompressedLength,
      header.getNum_values(),
      converter.getEncoding(header.getEncoding()));
}
 
Example #8
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Validates every column of every row group in {@code file} against the expected values,
 * once walking the pages first-to-last and once last-to-first.
 *
 * @param file           the file whose pages are validated
 * @param expectedValues expected values for the whole file; each row group is checked against
 *                       the slice that starts at the number of rows already read
 * @throws IOException if the file's blocks or schema cannot be read
 */
public static void validatePages(Path file, List<?> expectedValues) throws IOException {
  List<PageReadStore> rowGroups = readBlocksFromFile(file);
  MessageType schema = readSchemaFromFile(file);
  int rowGroupID = 0;
  int rowsRead = 0;
  for (PageReadStore rowGroup : rowGroups) {
    for (ColumnDescriptor column : schema.getColumns()) {
      List<DataPage> pages = getPageGroupForColumn(rowGroup, column);
      DictionaryPage dictPage = reusableCopy(getDictionaryPageForColumn(rowGroup, column));

      int endExclusive = (int) (rowsRead + rowGroup.getRowCount());
      List<?> expectedForRowGroup = expectedValues.subList(rowsRead, endExclusive);
      validateFirstToLast(rowGroupID, dictPage, pages, column, expectedForRowGroup);
      validateLastToFirst(rowGroupID, dictPage, pages, column, expectedForRowGroup);
    }

    rowsRead += rowGroup.getRowCount();
    rowGroupID++;
  }
}
 
Example #9
Source File: ShowPagesCommand.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Formats a one-line summary of a dictionary page: row group, page label, codec,
 * encoding, entry count, size per value and total size.
 * <p>
 * The row omits the page number while {@code pageNum} is 0 and includes it otherwise.
 * The per-value size is a float division, so a dictionary with zero entries does not
 * throw here.
 *
 * @param dict the dictionary page to describe
 * @return the formatted summary line
 */
private String printDictionaryPage(DictionaryPage dict) {
  // TODO: the compressed size of a dictionary page is lost in Parquet
  long totalSize = dict.getCompressedSize();
  int count = dict.getDictionarySize();
  float perValue = ((float) totalSize) / count;
  String enc = encodingAsString(dict.getEncoding(), true);
  if (pageNum == 0) {
    return String.format("%3d-D    %-5s %s %-2s %-7d %-10s %-10s",
        rowGroupNum, "dict", shortCodec, enc, count, humanReadable(perValue),
        humanReadable(totalSize));
  } else {
    return String.format("%3d-%-3d  %-5s %s %-2s %-7d %-10s %-10s",
        rowGroupNum, pageNum, "dict", shortCodec, enc, count, humanReadable(perValue),
        humanReadable(totalSize));
  }
}
 
Example #10
Source File: DictionaryPageReader.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the dictionary page for the given column, or {@code null} if it has none.
 * <p>
 * When a row group is set, the request is delegated to it. Otherwise the dictionary is
 * loaded through the file reader and cached per dotted column path.
 *
 * @param descriptor the column whose dictionary page is requested
 * @return the dictionary page, or {@code null} when the column has no dictionary
 * @throws ParquetDecodingException if the column is unknown or reading the dictionary fails
 */
@Override
public DictionaryPage readDictionaryPage(ColumnDescriptor descriptor) {
  if (rowGroup != null) {
    // if the row group has already been read, use that dictionary
    return rowGroup.readDictionaryPage(descriptor);
  }

  final String columnKey = String.join(".", descriptor.getPath());
  final ColumnChunkMetaData chunk = columns.get(columnKey);
  if (chunk == null) {
    throw new ParquetDecodingException(
        "Failed to load dictionary, unknown column: " + columnKey);
  }

  return dictionaryPageCache.computeIfAbsent(columnKey, unused -> {
    try {
      final DictionaryPage loaded =
          hasDictionaryPage(chunk) ? reader.readDictionary(chunk) : null;
      if (loaded == null) {
        // An empty Optional is cached as well, so misses are remembered too.
        return Optional.empty();
      }
      // Copy the dictionary to ensure it can be reused if it is returned
      // more than once. This can happen when a DictionaryFilter has two or
      // more predicates for the same column.
      return Optional.of(reusableCopy(loaded));
    } catch (IOException e) {
      throw new ParquetDecodingException("Failed to read dictionary", e);
    }
  }).orElse(null);
}
 
Example #11
Source File: PlainValuesDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes every entry of the dictionary page into {@code intDictionaryContent}.
 *
 * @param dictionaryPage a dictionary page of encoded integer values
 * @throws IOException if there is an exception while decoding the dictionary page
 */
public PlainIntegerDictionary(DictionaryPage dictionaryPage) throws IOException {
  super(dictionaryPage);
  final ByteBufferInputStream pageStream = dictionaryPage.getBytes().toInputStream();
  final int entryCount = dictionaryPage.getDictionarySize();
  intDictionaryContent = new int[entryCount];

  final IntegerPlainValuesReader valueReader = new IntegerPlainValuesReader();
  valueReader.initFromPage(entryCount, pageStream);
  for (int index = 0; index < intDictionaryContent.length; index++) {
    intDictionaryContent[index] = valueReader.readInteger();
  }
}
 
Example #12
Source File: DictionaryValuesWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Encodes the used part of the dictionary with a fixed-length plain writer.
 *
 * @return the dictionary page, or {@code null} if the dictionary was never used
 */
@Override
public DictionaryPage toDictPageAndClose() {
  // Produce a dictionary only if it was actually used.
  if (lastUsedDictionarySize <= 0) {
    return null;
  }

  FixedLenByteArrayPlainValuesWriter encoder = new FixedLenByteArrayPlainValuesWriter(
      length, lastUsedDictionaryByteSize, maxDictionaryByteSize, allocator);
  Iterator<Binary> entries = binaryDictionaryContent.keySet().iterator();
  // Only the first lastUsedDictionarySize keys are written: the part that was used.
  for (int written = 0; written < lastUsedDictionarySize; written++) {
    encoder.writeBytes(entries.next());
  }
  return dictPage(encoder);
}
 
Example #13
Source File: DictionaryValuesWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Encodes the used part of the dictionary with a plain values writer.
 *
 * @return the dictionary page, or {@code null} if the dictionary was never used
 */
@Override
public DictionaryPage toDictPageAndClose() {
  // Produce a dictionary only if it was actually used.
  if (lastUsedDictionarySize <= 0) {
    return null;
  }

  PlainValuesWriter encoder =
      new PlainValuesWriter(lastUsedDictionaryByteSize, maxDictionaryByteSize, allocator);
  Iterator<Binary> entries = binaryDictionaryContent.keySet().iterator();
  // Only the first lastUsedDictionarySize keys are written: the part that was used.
  for (int written = 0; written < lastUsedDictionarySize; written++) {
    encoder.writeBytes(entries.next());
  }
  return dictPage(encoder);
}
 
Example #14
Source File: ColumnIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the dictionary page from {@code pageSource} and decodes it for the given column.
 *
 * @param desc       descriptor of the column the dictionary belongs to
 * @param pageSource page reader to take the dictionary page from
 * @return the decoded dictionary, or {@code null} if the page reader has no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary fails with an {@link IOException}
 */
private static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  final DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }
  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}
 
Example #15
Source File: PlainValuesDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes every entry of the dictionary page into {@code floatDictionaryContent}.
 *
 * @param dictionaryPage a dictionary page of encoded float values
 * @throws IOException if there is an exception while decoding the dictionary page
 */
public PlainFloatDictionary(DictionaryPage dictionaryPage) throws IOException {
  super(dictionaryPage);
  final ByteBufferInputStream pageStream = dictionaryPage.getBytes().toInputStream();
  final int entryCount = dictionaryPage.getDictionarySize();
  floatDictionaryContent = new float[entryCount];

  final FloatPlainValuesReader valueReader = new FloatPlainValuesReader();
  valueReader.initFromPage(entryCount, pageStream);
  for (int index = 0; index < floatDictionaryContent.length; index++) {
    floatDictionaryContent[index] = valueReader.readFloat();
  }
}
 
Example #16
Source File: FallbackValuesWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the initial writer when {@code initialUsedAndHadDictionary} is set,
 * otherwise to the current writer.
 *
 * @return whatever the selected writer returns from {@code toDictPageAndClose()}
 */
@Override
public DictionaryPage toDictPageAndClose() {
  return initialUsedAndHadDictionary
      ? initialWriter.toDictPageAndClose()
      : currentWriter.toDictPageAndClose();
}
 
Example #17
Source File: MemPageWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Stores a copy of the dictionary page and adds its byte size to {@code memSize}.
 *
 * @param dictionaryPage the dictionary page to keep
 * @throws IOException declared by the overridden method
 * @throws ParquetEncodingException if a dictionary page has already been written
 */
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page per block");
  }
  final long pageBytes = dictionaryPage.getBytes().size();
  this.memSize += pageBytes;
  this.dictionaryPage = dictionaryPage.copy();
  LOG.debug("dictionary page written for {} bytes and {} records",
      pageBytes, dictionaryPage.getDictionarySize());
}
 
Example #18
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a dictionary values reader from the dictionary page produced by {@code cw}.
 *
 * @param cw   writer whose dictionary page (a copy of it) backs the reader
 * @param type primitive type used for the synthetic "foo" column descriptor
 * @return a reader over the PLAIN-initialised dictionary
 * @throws IOException if copying the page or initialising the dictionary fails
 */
private DictionaryValuesReader initDicReader(ValuesWriter cw, PrimitiveTypeName type)
    throws IOException {
  DictionaryPage pageCopy = cw.toDictPageAndClose().copy();
  ColumnDescriptor column = new ColumnDescriptor(new String[] {"foo"}, type, 0, 0);
  return new DictionaryValuesReader(PLAIN.initDictionary(column, pageCopy));
}
 
Example #19
Source File: ShowPagesCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Formats a page as a summary line and advances the page counter.
 *
 * @param page the page to format
 * @return the formatted line; an empty string if the page is neither a dictionary page
 *         nor a data page
 */
String format(Page page) {
  final String line;
  if (page instanceof DictionaryPage) {
    line = printDictionaryPage((DictionaryPage) page);
  } else if (page instanceof DataPage) {
    line = ((DataPage) page).accept(this);
  } else {
    line = "";
  }
  // Incremented only after formatting: printDictionaryPage reads pageNum.
  pageNum += 1;
  return line;
}
 
Example #20
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Validates the pages of one column in reverse order, matching each page against the
 * slice of {@code expectedValues} that ends where the previously validated page began.
 *
 * @param rowGroupID     identifier of the row group, passed through to the validator
 * @param dictPage       dictionary page for the column
 * @param pageGroup      data pages of the column, in file order
 * @param desc           descriptor of the column
 * @param expectedValues expected values for all pages of this column in the row group
 */
private static void validateLastToFirst(int rowGroupID, DictionaryPage dictPage, List<DataPage> pageGroup, ColumnDescriptor desc, List<?> expectedValues) {
  int sliceEnd = expectedValues.size();
  for (int pageID = pageGroup.size() - 1; pageID >= 0; pageID--) {
    DataPage page = pageGroup.get(pageID);
    int sliceStart = sliceEnd - page.getValueCount();
    List<?> expectedForPage = expectedValues.subList(sliceStart, sliceEnd);
    PageValuesValidator.validateValuesForPage(rowGroupID, pageID, dictPage, page, desc, expectedForPage);
    sliceEnd = sliceStart;
  }
}
 
Example #21
Source File: ColumnChunkPageReadStore.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a page reader over the compressed pages of one column chunk.
 * The total value count is the sum of the value counts of all given data pages.
 *
 * @param decompressor             decompressor stored for later use by this reader
 * @param compressedPages          data pages of the chunk; copied into a new deque
 * @param compressedDictionaryPage dictionary page of the chunk (the read path checks it for null)
 * @param offsetIndex              offset index stored on this reader
 * @param rowCount                 row count stored on this reader
 */
ColumnChunkPageReader(BytesInputDecompressor decompressor, List<DataPage> compressedPages,
    DictionaryPage compressedDictionaryPage, OffsetIndex offsetIndex, long rowCount) {
  this.decompressor = decompressor;
  this.compressedPages = new ArrayDeque<DataPage>(compressedPages);
  this.compressedDictionaryPage = compressedDictionaryPage;
  this.offsetIndex = offsetIndex;
  this.rowCount = rowCount;

  long totalValues = 0;
  for (DataPage dataPage : compressedPages) {
    totalValues += dataPage.getValueCount();
  }
  this.valueCount = totalValues;
}
 
Example #22
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Replays a single data page through a column reader and checks that the number of
 * nulls encountered equals the null count recorded in the page's statistics.
 * Non-null values are passed to the validating converter.
 *
 * @param page the data page to validate
 * @param dict the dictionary page given to the page reader together with {@code page}
 * @param desc descriptor of the column the page belongs to
 * @throws BadStatsException if the counted nulls differ from the statistics' null count
 */
private void validateStatsForPage(DataPage page, DictionaryPage dict,
                                  ColumnDescriptor desc) {
  SingletonPageReader reader = new SingletonPageReader(dict, page);
  PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
  // Wildcard rather than a raw type: only type-agnostic accessors are used below.
  Statistics<?> stats = getStatisticsFromPageHeader(page);

  long numNulls = 0;

  ColumnReader column = COL_READER_CTOR.newInstance(desc, reader, converter, null);
  for (int i = 0; i < reader.getTotalValueCount(); i += 1) {
    // A definition level below the column's maximum is counted as a null.
    if (column.getCurrentDefinitionLevel() >= desc.getMaxDefinitionLevel()) {
      column.writeCurrentValueToConverter();
    } else {
      numNulls += 1;
    }
    column.consume();
  }

  if (numNulls != stats.getNumNulls()) {
    throw new BadStatsException("Number of nulls doesn't match.");
  }

  console.debug(String.format(
      "Validated stats min=%s max=%s nulls=%d for page=%s col=%s",
      stats.minAsString(),
      stats.maxAsString(), stats.getNumNulls(), page,
      Arrays.toString(desc.getPath())));
}
 
Example #23
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that a page's statistics use the column's comparator and that the null count
 * in the statistics equals the number of nulls found when replaying the page.
 * Returns early, without counting nulls, when the statistics are empty.
 *
 * @param page the data page to validate
 * @param dict the dictionary page given to the page reader together with {@code page}
 * @param desc descriptor of the column the page belongs to
 */
private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDescriptor desc) {
  SingletonPageReader pageReader = new SingletonPageReader(dict, page);
  PrimitiveConverter validatingConverter = getValidatingConverter(page, desc.getType());
  Statistics<?> pageStats = getStatisticsFromPageHeader(page);

  assertEquals("Statistics does not use the proper comparator",
      desc.getPrimitiveType().comparator().getClass(),
      pageStats.comparator().getClass());

  if (pageStats.isEmpty()) {
    // stats are empty if num nulls = 0 and there are no non-null values
    // this happens if stats are not written (e.g., when stats are too big)
    return;
  }

  ColumnReaderImpl columnReader = new ColumnReaderImpl(desc, pageReader, validatingConverter, null);
  long nullCount = 0;
  for (int i = 0; i < pageReader.getTotalValueCount(); i += 1) {
    // A definition level below the column's maximum is counted as a null.
    boolean isNull = columnReader.getCurrentDefinitionLevel() < desc.getMaxDefinitionLevel();
    if (isNull) {
      nullCount += 1;
    } else {
      columnReader.writeCurrentValueToConverter();
    }
    columnReader.consume();
  }

  Assert.assertEquals(nullCount, pageStats.getNumNulls());
}
 
Example #24
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Validates the statistics of every data page of every column in the schema.
 * The dictionary page is read once per column and reused for each of its data pages.
 *
 * @param schema schema whose columns are validated
 * @param store  page store providing a page reader per column
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    PageReader pages = store.getPageReader(column);
    DictionaryPage dictionary = pages.readDictionaryPage();
    // readPage() returning null marks the end of the column's pages.
    for (DataPage page = pages.readPage(); page != null; page = pages.readPage()) {
      validateStatsForPage(page, dictionary, column);
    }
  }
}
 
Example #25
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a dictionary page: first its page header (with a CRC when page checksums are
 * enabled), then the page bytes. Also updates the chunk's length counters and encodings.
 *
 * @param dictionaryPage the dictionary page
 * @throws IOException if there is an error while writing
 * @throws ArithmeticException if the page's byte size does not fit in an {@code int}
 */
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  state = state.write();
  LOG.debug("{}: write dictionary page: {} values", out.getPos(), dictionaryPage.getDictionarySize());
  currentChunkDictionaryPageOffset = out.getPos();
  int uncompressedSize = dictionaryPage.getUncompressedSize();
  // Fail loudly instead of letting a narrowing cast put a wrapped size into the header.
  int compressedPageSize = Math.toIntExact(dictionaryPage.getBytes().size());
  if (pageWriteChecksumEnabled) {
    crc.reset();
    crc.update(dictionaryPage.getBytes().toByteArray());
    metadataConverter.writeDictionaryPageHeader(
      uncompressedSize,
      compressedPageSize,
      dictionaryPage.getDictionarySize(),
      dictionaryPage.getEncoding(),
      (int) crc.getValue(),
      out);
  } else {
    metadataConverter.writeDictionaryPageHeader(
      uncompressedSize,
      compressedPageSize,
      dictionaryPage.getDictionarySize(),
      dictionaryPage.getEncoding(),
      out);
  }
  // The header size is the distance the output advanced while the header was written.
  long headerSize = out.getPos() - currentChunkDictionaryPageOffset;
  this.uncompressedLength += uncompressedSize + headerSize;
  this.compressedLength += compressedPageSize + headerSize;
  LOG.debug("{}: write dictionary page content {}", out.getPos(), compressedPageSize);
  dictionaryPage.getBytes().writeAllTo(out);
  encodingStatsBuilder.addDictEncoding(dictionaryPage.getEncoding());
  currentEncodings.add(dictionaryPage.getEncoding());
}
 
Example #26
Source File: ColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Compresses the given dictionary page and keeps the compressed copy on this writer.
 * The stored page records the size of the bytes before compression, along with the
 * original dictionary size and encoding.
 *
 * @param dictionaryPage the uncompressed dictionary page
 * @throws IOException if compressing or copying the bytes fails
 * @throws ParquetEncodingException if a dictionary page has already been written
 * @throws ArithmeticException if the uncompressed byte size does not fit in an {@code int}
 */
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page is allowed");
  }
  BytesInput dictionaryBytes = dictionaryPage.getBytes();
  // Fail loudly instead of letting a narrowing cast record a wrapped size.
  int uncompressedSize = Math.toIntExact(dictionaryBytes.size());
  BytesInput compressedBytes = compressor.compress(dictionaryBytes);
  this.dictionaryPage = new DictionaryPage(
      BytesInput.copy(compressedBytes),
      uncompressedSize,
      dictionaryPage.getDictionarySize(),
      dictionaryPage.getEncoding());
}
 
Example #27
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the same dictionary-encoded sample file twice, once without and once with page
 * checksums, and checks that the checksummed pages carry a correct CRC and the same content
 * as the reference pages.
 */
@Test
public void testDictionaryEncoding() throws IOException {
  Configuration conf = new Configuration();

  // Reference pass: write the dictionary encoded sample file via the non-checksum code
  // path and extract the raw bytes to calculate the reference crc with.
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  Path referenceFile = writeNestedWithNullsSampleParquetFile(conf, true, CompressionCodecName.SNAPPY);

  try (ParquetFileReader referenceReader =
    getParquetFileReader(referenceFile, conf, Collections.singletonList(colDValDesc))) {
    PageReadStore referenceRowGroup = referenceReader.readNextRowGroup();
    // Read (decompressed) dictionary page, then the first data page of the column
    byte[] expectedDictBytes = readDictPage(colDValDesc, referenceRowGroup).getBytes().toByteArray();
    byte[] expectedDataBytes = readNextPage(colDValDesc, referenceRowGroup).getBytes().toByteArray();

    // Checksum pass: write the sample file again with checksums written and verified
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    Path checksummedFile = writeNestedWithNullsSampleParquetFile(conf, true, CompressionCodecName.SNAPPY);

    try (ParquetFileReader checksummedReader =
      getParquetFileReader(checksummedFile, conf, Collections.singletonList(colDValDesc))) {
      PageReadStore rowGroup = checksummedReader.readNextRowGroup();

      DictionaryPage dictPage = readDictPage(colDValDesc, rowGroup);
      assertCrcSetAndCorrect(dictPage, snappy(expectedDictBytes));
      assertCorrectContent(dictPage.getBytes().toByteArray(), expectedDictBytes);

      DataPageV1 dataPage = readNextPage(colDValDesc, rowGroup);
      assertCrcSetAndCorrect(dataPage, snappy(expectedDataBytes));
      assertCorrectContent(dataPage.getBytes().toByteArray(), expectedDataBytes);
    }
  }
}
 
Example #28
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Replays one data page through a column reader, sending every value to a converter
 * built from the expected values.
 *
 * @param rowGroupID     identifier of the row group, passed to the converter factory
 * @param pageID         identifier of the page, passed to the converter factory
 * @param dictPage       dictionary page given to the page reader together with {@code page}
 * @param page           the data page to replay
 * @param columnDesc     descriptor of the column the page belongs to
 * @param expectedValues values the converter is built from
 */
public static void validateValuesForPage(int rowGroupID, int pageID, DictionaryPage dictPage, DataPage page, ColumnDescriptor columnDesc, List<?> expectedValues) {
  TestStatistics.SingletonPageReader singlePageReader = new TestStatistics.SingletonPageReader(dictPage, page);
  PrimitiveConverter checkingConverter = getConverter(rowGroupID, pageID, columnDesc.getType(), expectedValues);
  ColumnReaderImpl columnReader = new ColumnReaderImpl(columnDesc, singlePageReader, checkingConverter, null);

  int consumed = 0;
  while (consumed < singlePageReader.getTotalValueCount()) {
    columnReader.writeCurrentValueToConverter();
    columnReader.consume();
    consumed += 1;
  }
}
 
Example #29
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Copies a dictionary page into a new page backed by a byte array.
 *
 * @param dict the page to copy; may be {@code null}
 * @return a new page with the same dictionary size and encoding, or {@code null} when
 *         {@code dict} is {@code null}
 * @throws ParquetDecodingException if the page bytes cannot be read
 */
private static DictionaryPage reusableCopy(DictionaryPage dict) {
  if (dict == null) {
    return null;
  }
  try {
    final byte[] materialized = dict.getBytes().toByteArray();
    return new DictionaryPage(
        BytesInput.from(materialized),
        dict.getDictionarySize(),
        dict.getEncoding());
  } catch (IOException e) {
    throw new ParquetDecodingException("Cannot read dictionary", e);
  }
}
 
Example #30
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Validates the pages of one column in file order, matching each page against the
 * slice of {@code expectedValues} that starts where the previous page ended.
 *
 * @param rowGroupID     identifier of the row group, passed through to the validator
 * @param dictPage       dictionary page for the column
 * @param pageGroup      data pages of the column, in file order
 * @param desc           descriptor of the column
 * @param expectedValues expected values for all pages of this column in the row group
 */
private static void validateFirstToLast(int rowGroupID, DictionaryPage dictPage, List<DataPage> pageGroup, ColumnDescriptor desc, List<?> expectedValues) {
  int sliceStart = 0;
  for (int pageID = 0; pageID < pageGroup.size(); pageID++) {
    DataPage page = pageGroup.get(pageID);
    int sliceEnd = sliceStart + page.getValueCount();
    List<?> expectedForPage = expectedValues.subList(sliceStart, sliceEnd);
    PageValuesValidator.validateValuesForPage(rowGroupID, pageID, dictPage, page, desc, expectedForPage);
    sliceStart = sliceEnd;
  }
}