org.apache.parquet.format.PageType Java Examples

The following examples show how to use org.apache.parquet.format.PageType. Each example is taken from an open-source project; the source file, project, and license are noted above each listing.
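Before the project examples, here is a minimal self-contained sketch of the pattern most of them share: deserialize a PageHeader from a stream with Util.readPageHeader and branch on its PageType. The class and method names (PageTypeDispatch, handlePage) are illustrative only, and the stream is assumed to be positioned at a page boundary.

import java.io.IOException;
import java.io.InputStream;

import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.PageType;
import org.apache.parquet.format.Util;

public class PageTypeDispatch {
  static void handlePage(InputStream in) throws IOException {
    // Util.readPageHeader deserializes the Thrift-encoded header at the current position
    PageHeader header = Util.readPageHeader(in);
    switch (header.getType()) {
      case DICTIONARY_PAGE:
        // at most one dictionary page per column chunk, written before its data pages
        break;
      case DATA_PAGE:
        // v1 data page: levels and values share one (possibly compressed) buffer
        break;
      case DATA_PAGE_V2:
        // v2 data page: level byte lengths are recorded in the header itself
        break;
      case INDEX_PAGE:
        // defined by the format but rarely written in practice
        break;
    }
  }
}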
Example #1
Source File: PredicateUtils.java    From presto with Apache License 2.0
private static Optional<DictionaryPage> readDictionaryPage(byte[] data, CompressionCodecName codecName)
{
    try {
        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
        PageHeader pageHeader = Util.readPageHeader(inputStream);

        if (pageHeader.type != PageType.DICTIONARY_PAGE) {
            return Optional.empty();
        }

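        // readPageHeader consumed (data.length - inputStream.available()) bytes; the compressed page data begins at that offset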
        Slice compressedData = wrappedBuffer(data, data.length - inputStream.available(), pageHeader.getCompressed_page_size());
        DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
        ParquetEncoding encoding = getParquetEncoding(Encoding.valueOf(dicHeader.getEncoding().name()));
        int dictionarySize = dicHeader.getNum_values();

        return Optional.of(new DictionaryPage(decompress(codecName, compressedData, pageHeader.getUncompressed_page_size()), dictionarySize, encoding));
    }
    catch (IOException ignored) {
        return Optional.empty();
    }
}
 
Example #2
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private PageHeader newDataPageHeader(
    int uncompressedSize, int compressedSize,
    int valueCount,
    org.apache.parquet.column.Encoding rlEncoding,
    org.apache.parquet.column.Encoding dlEncoding,
    org.apache.parquet.column.Encoding valuesEncoding,
    int crc) {
  PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize);
  pageHeader.setCrc(crc);
  pageHeader.setData_page_header(new DataPageHeader(
      valueCount,
      getEncoding(valuesEncoding),
      getEncoding(dlEncoding),
      getEncoding(rlEncoding)));
  return pageHeader;
}
 
Example #3
Source File: PageReader.java    From Bats with Apache License 2.0
/**
 * Get the page header and the pageData (uncompressed) for the next page
 */
protected void nextInternal() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  // next, we need to decompress the bytes
  // TODO - figure out whether we need to handle multiple dictionary pages; I believe it is limited to one.
  // If there can be multiple dictionary pages, we may be clobbering parts of the dictionary.
  do {
    long start = dataReader.getPos();
    timer.start();
    pageHeader = Util.readPageHeader(dataReader);
    long timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    long pageHeaderBytes = dataReader.getPos() - start;
    this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
    logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}","Page Header Read","",
        this.parentColumnReader.parentReader.hadoopPath,
        this.parentColumnReader.columnDescriptor.toString(), start, 0, 0, timeToRead);
    timer.reset();
    if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {
      readDictionaryPage(pageHeader, parentColumnReader);
    }
  } while (pageHeader.getType() == PageType.DICTIONARY_PAGE);

  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  pageData = readPage(pageHeader, compressedSize, uncompressedSize);
}
 
Example #4
Source File: PageReader.java    From Bats with Apache License 2.0
protected void updateStats(PageHeader pageHeader, String op, long start, long time, long bytesin, long bytesout) {
  String pageType = "Data Page";
  if (pageHeader.type == PageType.DICTIONARY_PAGE) {
    pageType = "Dictionary Page";
  }
  logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}", op, pageType,
      this.parentColumnReader.parentReader.hadoopPath,
      this.parentColumnReader.columnDescriptor.toString(), start, bytesin, bytesout, time);

  if (pageHeader.type != PageType.DICTIONARY_PAGE) {
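    // equal in/out byte counts indicate the page bytes were read without decompression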
    if (bytesin == bytesout) {
      this.stats.timeDataPageLoads.addAndGet(time);
      this.stats.numDataPageLoads.incrementAndGet();
      this.stats.totalDataPageReadBytes.addAndGet(bytesin);
    } else {
      this.stats.timeDataPagesDecompressed.addAndGet(time);
      this.stats.numDataPagesDecompressed.incrementAndGet();
      this.stats.totalDataDecompressedBytes.addAndGet(bytesin);
    }
  } else {
    if (bytesin == bytesout) {
      this.stats.timeDictPageLoads.addAndGet(time);
      this.stats.numDictPageLoads.incrementAndGet();
      this.stats.totalDictPageReadBytes.addAndGet(bytesin);
    } else {
      this.stats.timeDictPagesDecompressed.addAndGet(time);
      this.stats.numDictPagesDecompressed.incrementAndGet();
      this.stats.totalDictDecompressedBytes.addAndGet(bytesin);
    }
  }
}
 
Example #5
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0
/**
 * Return the dictionary for each binary column in the given parquet file, which must contain a single row group.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @param codecFactory factory used to obtain decompressors for the columns' compression codecs
 * @return pair of the dictionaries found for binary fields and the set of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CompressionCodecFactory codecFactory) throws IOException {
  // Passing the max footer length is not required in this case as the parquet reader would already have failed.
  final ParquetMetadata parquetMetadata = SingletonParquetFooterCache.readFooter(fs, filePath, ParquetMetadataConverter.NO_FILTER,
    ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();

  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try (final FSInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
 
Example #6
Source File: PageReader.java    From dremio-oss with Apache License 2.0
private void updateStats(PageHeader pageHeader, String op, long start, long time, long bytesin, long bytesout) {
  String pageType = "Data Page";
  if (pageHeader.type == PageType.DICTIONARY_PAGE) {
    pageType = "Dictionary Page";
  }
  logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}", op, pageType.toString(),
      this.parentColumnReader.parentReader.fsPath,
      this.parentColumnReader.columnDescriptor.toString(), start, bytesin, bytesout, time);
  if (pageHeader.type != PageType.DICTIONARY_PAGE) {
    if (bytesin == bytesout) {
      this.stats.timePageLoads += time;
      this.stats.numPageLoads++;
      this.stats.totalPageReadBytes += bytesin;
    } else {
      this.stats.timePagesDecompressed += time;
      this.stats.numPagesDecompressed++;
      this.stats.totalDecompressedBytes += bytesin;
    }
  } else {
    if (bytesin == bytesout) {
      this.stats.timeDictPageLoads += time;
      this.stats.numDictPageLoads++;
      this.stats.totalDictPageReadBytes += bytesin;
    } else {
      this.stats.timeDictPagesDecompressed += time;
      this.stats.numDictPagesDecompressed++;
      this.stats.totalDictDecompressedBytes += bytesin;
    }
  }
}
 
Example #7
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private PageHeader newDataPageHeader(
  int uncompressedSize, int compressedSize,
  int valueCount,
  org.apache.parquet.column.Encoding rlEncoding,
  org.apache.parquet.column.Encoding dlEncoding,
  org.apache.parquet.column.Encoding valuesEncoding) {
  PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize);
  pageHeader.setData_page_header(new DataPageHeader(
    valueCount,
    getEncoding(valuesEncoding),
    getEncoding(dlEncoding),
    getEncoding(rlEncoding)));
  return pageHeader;
}
 
Example #8
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private PageHeader newDataPageV2Header(
    int uncompressedSize, int compressedSize,
    int valueCount, int nullCount, int rowCount,
    org.apache.parquet.column.Encoding dataEncoding,
    int rlByteLength, int dlByteLength) {
  // TODO: pageHeader.crc = ...;
  DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2(
      valueCount, nullCount, rowCount,
      getEncoding(dataEncoding),
      dlByteLength, rlByteLength);
  PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize);
  pageHeader.setData_page_header_v2(dataPageHeaderV2);
  return pageHeader;
}
 
Example #9
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public void writeDictionaryPageHeader(
  int uncompressedSize, int compressedSize, int valueCount,
  org.apache.parquet.column.Encoding valuesEncoding, OutputStream to) throws IOException {
  PageHeader pageHeader = new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize);
  pageHeader.setDictionary_page_header(new DictionaryPageHeader(valueCount, getEncoding(valuesEncoding)));
  writePageHeader(pageHeader, to);
}
 
Example #10
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public void writeDictionaryPageHeader(
    int uncompressedSize, int compressedSize, int valueCount,
    org.apache.parquet.column.Encoding valuesEncoding, int crc, OutputStream to) throws IOException {
  PageHeader pageHeader = new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize);
  pageHeader.setCrc(crc);
  pageHeader.setDictionary_page_header(new DictionaryPageHeader(valueCount, getEncoding(valuesEncoding)));
  writePageHeader(pageHeader, to);
}
 
Example #11
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testPageHeader() throws IOException {
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  PageType type = PageType.DATA_PAGE;
  int compSize = 10;
  int uncSize = 20;
  PageHeader pageHeader = new PageHeader(type, uncSize, compSize);
  writePageHeader(pageHeader, out);
  PageHeader readPageHeader = readPageHeader(new ByteArrayInputStream(out.toByteArray()));
  assertEquals(pageHeader, readPageHeader);
}
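The same round trip can be sketched for a dictionary page header using only the PageHeader, DictionaryPageHeader, and Util calls already shown in the examples above; the sizes and value count below are arbitrary, and Encoding is org.apache.parquet.format.Encoding.

// Sketch: write a DICTIONARY_PAGE header and read it back, mirroring the test above.
ByteArrayOutputStream out = new ByteArrayOutputStream();
PageHeader written = new PageHeader(PageType.DICTIONARY_PAGE, 20, 10);
written.setDictionary_page_header(new DictionaryPageHeader(100, Encoding.PLAIN));
Util.writePageHeader(written, out);

PageHeader read = Util.readPageHeader(new ByteArrayInputStream(out.toByteArray()));
assertEquals(PageType.DICTIONARY_PAGE, read.getType());
assertEquals(100, read.getDictionary_page_header().getNum_values());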
 
Example #12
Source File: PageReader.java    From dremio-oss with Apache License 2.0
/**
 * Grab the next page.
 *
 * @return true if another page was present
 * @throws java.io.IOException
 */
public boolean next() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  currentPageCount = -1;
  valuesRead = 0;
  valuesReadyToRead = 0;

  // TODO - the metadata for total size appears to be incorrect for impala generated files, need to find cause
  // and submit a bug report
  if (!dataReader.hasRemainder() || parentColumnReader.totalValuesRead == parentColumnReader.columnChunkMetaData.getValueCount()) {
    return false;
  }
  clearBuffers();

  // next, we need to decompress the bytes
  // TODO - figure out whether we need to handle multiple dictionary pages; I believe it is limited to one.
  // If there can be multiple dictionary pages, we may be clobbering parts of the dictionary.
  do {
    long start = inputStream.getPos();
    timer.start();
    pageHeader = dataReader.readPageHeader();
    long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
    this.updateStats(pageHeader, "Page Header Read", start, timeToRead, 0, 0);
    logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}","Page Header Read","",
        this.parentColumnReader.parentReader.fsPath,
        this.parentColumnReader.columnDescriptor.toString(), start, 0, 0, timeToRead);
    timer.reset();
    if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {
      readDictionaryPage(pageHeader, parentColumnReader);
    }
  } while (pageHeader.getType() == PageType.DICTIONARY_PAGE);

  // TODO: Handle buffer allocation exception

  allocatePageData(pageHeader.getUncompressed_page_size());
  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  readPage(pageHeader, compressedSize, uncompressedSize, pageData);

  currentPageCount = pageHeader.data_page_header.num_values;

  final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);

  final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
  final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);

  byteLength = pageHeader.uncompressed_page_size;

  final ByteBuffer pageDataBuffer = pageData.nioBuffer(0, LargeMemoryUtil.checkedCastToInt(pageData.capacity()));

  readPosInBytes = 0;
  if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
    repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
    repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    // The first repetition level is always 0, and at the end of each list of repeated values we hit another 0
    // indicating a new record. We don't know a record's length until we hit that 0 (this is a one-way stream of
    // integers), so we read the first zero here to simplify the reading process and start reading the first value
    // the same way as all of the rest. Effectively we are 'reading' the non-existent value in front of the first
    // record, allowing direct access to the first list of repetition levels.
    readPosInBytes = repetitionLevels.getNextOffset();
    repetitionLevels.readInteger();
  }
  if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
    parentColumnReader.currDefLevel = -1;
    definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
    definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    readPosInBytes = definitionLevels.getNextOffset();
    if (!valueEncoding.usesDictionary()) {
      valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
      valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    }
  }
  if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
    valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
    valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
  }
  if (valueEncoding.usesDictionary()) {
    // initialize two dictionary readers: one determines the lengths of each value, the other actually copies
    // the values out into the vectors
    dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
    dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    dictionaryValueReader = new DictionaryValuesReader(dictionary);
    dictionaryValueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    parentColumnReader.usingDictionary = true;
  } else {
    parentColumnReader.usingDictionary = false;
  }
  // readPosInBytes is used for actually reading the values after we determine how many will fit in the vector
  // readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that will
  // fit one record at a time, such as for variable length data. Both operations must start in the same location after the
  // definition and repetition level data which is stored alongside the page data itself
  readyToReadPosInBytes = readPosInBytes;
  return true;
}