org.apache.parquet.format.Util Java Examples

The following examples show how to use org.apache.parquet.format.Util, the low-level helper that serializes and deserializes Parquet's Thrift metadata structures (file metadata, page headers, column and offset indexes, and Bloom filter headers). The examples are extracted from open source projects; the source file and license are noted above each one.
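All of the examples share one pattern: Util pairs a write* method with a matching read* method for each Thrift structure. As a minimal, self-contained sketch of that round trip (the class name UtilRoundTrip and the header field values are illustrative, not taken from any example below):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.parquet.format.DataPageHeader;
import org.apache.parquet.format.Encoding;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.PageType;
import org.apache.parquet.format.Util;

public class UtilRoundTrip {
  public static void main(String[] args) throws IOException {
    // Build a minimal data page header (required Thrift fields only).
    PageHeader header = new PageHeader(PageType.DATA_PAGE, 100, 100);
    header.setData_page_header(
        new DataPageHeader(10, Encoding.PLAIN, Encoding.RLE, Encoding.RLE));

    // Serialize with Util, then read the same bytes back.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Util.writePageHeader(header, out);
    PageHeader roundTripped =
        Util.readPageHeader(new ByteArrayInputStream(out.toByteArray()));

    System.out.println(roundTripped.getType()); // DATA_PAGE
  }
}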
Example #1
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testParquetMetadataConverterWithoutDictionary()
  throws IOException {
  ParquetMetadata parquetMetaData =
    createParquetMetaData(null, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // The dictionary_page_offset flag should not be set when no dictionary encoding is used
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertFalse(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
    new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata pmd2 = converter.fromParquetMetadata(fmd2);

  long dicOffsetConverted =
    pmd2.getBlocks().get(0).getColumns().get(0).getDictionaryPageOffset();

  Assert.assertEquals(0, dicOffsetConverted);
}
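Because no dictionary encoding is used, dictionary_page_offset is never set on the serialized metadata, so the Thrift default of 0 survives the write/read round trip; that default is what the final assertion checks.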
 
Example #2
Source File: PredicateUtils.java    From presto with Apache License 2.0
private static Optional<DictionaryPage> readDictionaryPage(byte[] data, CompressionCodecName codecName)
{
    try {
        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
        PageHeader pageHeader = Util.readPageHeader(inputStream);

        if (pageHeader.type != PageType.DICTIONARY_PAGE) {
            return Optional.empty();
        }

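        // data.length - inputStream.available() is the number of bytes the
        // page header read consumed, so the compressed page bytes start right after it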
        Slice compressedData = wrappedBuffer(data, data.length - inputStream.available(), pageHeader.getCompressed_page_size());
        DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
        ParquetEncoding encoding = getParquetEncoding(Encoding.valueOf(dicHeader.getEncoding().name()));
        int dictionarySize = dicHeader.getNum_values();

        return Optional.of(new DictionaryPage(decompress(codecName, compressedData, pageHeader.getUncompressed_page_size()), dictionarySize, encoding));
    }
    catch (IOException ignored) {
        return Optional.empty();
    }
}
 
Example #3
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeBloomFilters(
  List<Map<String, BloomFilter>> bloomFilters,
  List<BlockMetaData> blocks,
  PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }

      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
 
Example #4
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeOffsetIndexes(
    List<List<OffsetIndex>> offsetIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: offset indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
      if (offsetIndex == null) {
        continue;
      }
      ColumnChunkMetaData column = columns.get(cIndex);
      long offset = out.getPos();
      Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out);
      column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
 
Example #5
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
          .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
      if (columnIndex == null) {
        continue;
      }
      long offset = out.getPos();
      Util.writeColumnIndex(columnIndex, out);
      column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
 
Example #6
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads and decompresses a dictionary page for the given column chunk.
 *
 * Returns null if the given column chunk has no dictionary page.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) &&
      !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  if (f.getPos() != meta.getStartingPos()) {
    f.seek(meta.getStartingPos());
  }

  PageHeader pageHeader = Util.readPageHeader(f);
  if (!pageHeader.isSetDictionary_page_header()) {
    return null; // TODO: should this complain?
  }

  DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
  BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
      compressedPage.getDictionarySize(),
      compressedPage.getEncoding());
}
 
Example #7
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
protected PageHeader readPageHeader() throws IOException {
  PageHeader pageHeader;
  stream.mark(8192); // headers should not be larger than 8k
  try {
    pageHeader = Util.readPageHeader(stream);
  } catch (IOException e) {
    // Workaround for a writer bug where the compressedLength of the chunk
    // is missing the size of the dictionary page header; this fallback is
    // needed to read such older files (which use a dictionary).
    // Usually 13 to 19 bytes are missing; if the last page is smaller than
    // that, the page header itself is truncated in the buffer.
    stream.reset(); // reset the buffer to the position before the failed read
    LOG.info("completing the column chunk to read the page header");
    pageHeader = Util.readPageHeader(new SequenceInputStream(stream, f)); // retry from the buffer plus the remainder of the stream
  }
  return pageHeader;
}
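The mark/reset pair is what makes the fallback safe: the bytes consumed by the failed attempt are replayed from the buffer, and SequenceInputStream chains them with the remainder of the underlying file stream so the truncated header can be read to completion.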
 
Example #8
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param column
 *          the column chunk which the column index is to be returned for
 * @return the column index for the specified column chunk or {@code null} if there is no index
 * @throws IOException
 *           if any I/O error occurs during reading the file
 */
@Private
public ColumnIndex readColumnIndex(ColumnChunkMetaData column) throws IOException {
  IndexReference ref = column.getColumnIndexReference();
  if (ref == null) {
    return null;
  }
  f.seek(ref.getOffset());
  return ParquetMetadataConverter.fromParquetColumnIndex(column.getPrimitiveType(), Util.readColumnIndex(f));
}
 
Example #9
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testParquetMetadataConverterWithDictionary()
  throws IOException {
  ParquetMetadata parquetMetaData =
    createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // The dictionary_page_offset flag should be set when dictionary encoding is used
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertTrue(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
    new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata parquetMetaDataConverted =
    converter.fromParquetMetadata(fmd2);

  long dicOffsetOriginal =
    parquetMetaData.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();
  long dicOffsetConverted =
    parquetMetaDataConverted.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();

  Assert.assertEquals(dicOffsetOriginal, dicOffsetConverted);
}
 
Example #10
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param column
 *          the column chunk which the offset index is to be returned for
 * @return the offset index for the specified column chunk or {@code null} if there is no index
 * @throws IOException
 *           if any I/O error occurs during reading the file
 */
@Private
public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException {
  IndexReference ref = column.getOffsetIndexReference();
  if (ref == null) {
    return null;
  }
  f.seek(ref.getOffset());
  return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f));
}
 
Example #11
Source File: PageReader.java    From Bats with Apache License 2.0
/**
 * Get the page header and the pageData (uncompressed) for the next page
 */
protected void nextInternal() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  // read page headers, decoding any dictionary page inline, until we reach the next data page
  // TODO - figure out if we need multiple dictionary pages, I believe it may be limited to one
  // I think we are clobbering parts of the dictionary if there can be multiple pages of dictionary
  do {
    long start = dataReader.getPos();
    timer.start();
    pageHeader = Util.readPageHeader(dataReader);
    long timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    long pageHeaderBytes = dataReader.getPos() - start;
    this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
    logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}","Page Header Read","",
        this.parentColumnReader.parentReader.hadoopPath,
        this.parentColumnReader.columnDescriptor.toString(), start, 0, 0, timeToRead);
    timer.reset();
    if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {
      readDictionaryPage(pageHeader, parentColumnReader);
    }
  } while (pageHeader.getType() == PageType.DICTIONARY_PAGE);

  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  pageData = readPage(pageHeader, compressedSize, uncompressedSize);

}
 
Example #12
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads Bloom filter data for the given column chunk.
 *
 * @param meta a column's ColumnChunkMetaData to read the Bloom filter from
 * @return a BloomFilter object, or null if the filter cannot be read
 * @throws IOException if there is an error while reading the Bloom filter.
 */
public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException {
  long bloomFilterOffset = meta.getBloomFilterOffset();
  f.seek(bloomFilterOffset);
  BloomFilterHeader bloomFilterHeader;

  // Read Bloom filter data header.
  try {
    bloomFilterHeader = Util.readBloomFilterHeader(f);
  } catch (IOException e) {
    LOG.warn("read no bloom filter");
    return null;
  }

  int numBytes = bloomFilterHeader.getNumBytes();
  if (numBytes <= 0 || numBytes > BlockSplitBloomFilter.UPPER_BOUND_BYTES) {
    LOG.warn("the read bloom filter size is wrong, size is {}", bloomFilterHeader.getNumBytes());
    return null;
  }

  if (!bloomFilterHeader.getHash().isSetXXHASH() || !bloomFilterHeader.getAlgorithm().isSetBLOCK()
    || !bloomFilterHeader.getCompression().isSetUNCOMPRESSED()) {
    LOG.warn("the read bloom filter is not supported yet,  algorithm = {}, hash = {}, compression = {}",
      bloomFilterHeader.getAlgorithm(), bloomFilterHeader.getHash(), bloomFilterHeader.getCompression());
    return null;
  }

  byte[] bitset = new byte[numBytes];
  f.readFully(bitset);
  return new BlockSplitBloomFilter(bitset);
}
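As a hedged usage sketch of the method above: canSkipRowGroup, reader, columnMeta, and the string value are all illustrative names, while hash(Binary) and findHash(long) come from parquet-mr's BloomFilter interface (org.apache.parquet.column.values.bloomfilter.BloomFilter) and Binary from org.apache.parquet.io.api.

static boolean canSkipRowGroup(ParquetFileReader reader, ColumnChunkMetaData columnMeta)
    throws IOException {
  BloomFilter bloomFilter = reader.readBloomFilter(columnMeta);
  if (bloomFilter == null) {
    return false; // no filter available, cannot prove absence
  }
  long hash = bloomFilter.hash(Binary.fromString("some-value"));
  // Bloom filters yield false positives but never false negatives,
  // so findHash == false proves the value is absent from this chunk.
  return !bloomFilter.findHash(hash);
}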
 
Example #13
Source File: ColumnChunkIncReadStore.java    From dremio-oss with Apache License 2.0
@Override
public DictionaryPage readDictionaryPage() {
  if (dictionaryPage == null) {
    PageHeader pageHeader = new PageHeader();
    long pos = 0;
    try {
      pos = in.getPos();
      pageHeader = Util.readPageHeader(in.asSeekableInputStream());
      if (pageHeader.getDictionary_page_header() == null) {
        in.seek(pos);
        return null;
      }
      dictionaryPage = readDictionaryPageHelper(pageHeader);
    } catch (Exception e) {
      throw new RuntimeException("Error reading dictionary page." +
        "\nFile path: " + path.toURI().getPath() +
        "\nRow count: " + rowCount +
        "\nColumn Chunk Metadata: " + metaData +
        "\nPage Header: " + pageHeader +
        "\nFile offset: " + fileOffset +
        "\nSize: " + size +
        "\nValue read so far: " + valueReadSoFar +
        "\nPosition: " + pos, e);
    }
  }
  return dictionaryPage;
}
 
Example #14
Source File: ParquetWriter.java    From presto with Apache License 2.0
static Slice getFooter(List<RowGroup> rowGroups, MessageType messageType)
        throws IOException
{
    FileMetaData fileMetaData = new FileMetaData();
    fileMetaData.setVersion(1);
    fileMetaData.setSchema(MessageTypeConverter.toParquetSchema(messageType));
    long totalRows = rowGroups.stream().mapToLong(RowGroup::getNum_rows).sum();
    fileMetaData.setNum_rows(totalRows);
    fileMetaData.setRow_groups(ImmutableList.copyOf(rowGroups));

    DynamicSliceOutput dynamicSliceOutput = new DynamicSliceOutput(40);
    Util.writeFileMetaData(fileMetaData, dynamicSliceOutput);
    return dynamicSliceOutput.slice();
}
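The slice returned here holds only the serialized FileMetaData. Per the Parquet file format, the on-disk footer is that blob followed by its length as a 4-byte little-endian integer and the "PAR1" magic; a minimal sketch of that framing with plain Java streams, fileMetaData built as above (this framing code is not part of the example):

ByteArrayOutputStream out = new ByteArrayOutputStream();
Util.writeFileMetaData(fileMetaData, out);    // thrift-compact FileMetaData bytes
int metadataLength = out.size();
out.write(metadataLength & 0xFF);             // int32 length of the metadata, little-endian
out.write((metadataLength >>> 8) & 0xFF);
out.write((metadataLength >>> 16) & 0xFF);
out.write((metadataLength >>> 24) & 0xFF);
out.write(new byte[] {'P', 'A', 'R', '1'});   // trailing "PAR1" magic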
 
Example #15
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
protected PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(stream);
}
 
Example #16
Source File: ColumnDataReader.java    From dremio-oss with Apache License 2.0
public PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(input);
}
 
Example #17
Source File: ParquetColumnChunk.java    From presto with Apache License 2.0
protected PageHeader readPageHeader()
        throws IOException
{
    return Util.readPageHeader(input);
}
 
Example #18
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
public PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(f);
}
 
Example #19
Source File: ColumnDataReader.java    From Bats with Apache License 2.0
public PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(input);
}