org.apache.parquet.format.Util Java Examples
The following examples show how to use org.apache.parquet.format.Util.
The examples are drawn from open-source projects; each one notes the source file, project, and license it was taken from.
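Most of the examples below reduce to the same pattern: Util pairs a write method with a read method for each Thrift structure in a Parquet file (file metadata, page headers, column and offset indexes, Bloom filter headers). As a minimal sketch of that round trip, assuming only parquet-format on the classpath (the class name and the toy one-element schema are illustrative, not taken from any project below):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Collections;
import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.SchemaElement;
import org.apache.parquet.format.Util;

public class UtilRoundTrip {
  public static void main(String[] args) throws IOException {
    // Build the smallest footer Thrift will serialize: all required fields set.
    FileMetaData fmd = new FileMetaData();
    fmd.setVersion(1);
    fmd.setSchema(Collections.singletonList(new SchemaElement("root")));
    fmd.setNum_rows(0);
    fmd.setRow_groups(Collections.emptyList());

    // Util.writeFileMetaData serializes to any OutputStream...
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Util.writeFileMetaData(fmd, out);

    // ...and Util.readFileMetaData deserializes the same bytes.
    FileMetaData readBack = Util.readFileMetaData(new ByteArrayInputStream(out.toByteArray()));
    System.out.println(readBack.getVersion()); // prints 1
  }
}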
Example #1
Source File: TestParquetMetadataConverter.java, from parquet-mr (Apache License 2.0)
@Test
public void testParquetMetadataConverterWithoutDictionary() throws IOException {
  ParquetMetadata parquetMetaData = createParquetMetaData(null, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // Flag should be false
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertFalse(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
      new ByteArrayInputStream(metaDataOutputStream.toByteArray());

  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata pmd2 = converter.fromParquetMetadata(fmd2);

  long dicOffsetConverted = pmd2.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();

  Assert.assertEquals(0, dicOffsetConverted);
}
Example #2
Source File: PredicateUtils.java, from presto (Apache License 2.0)
private static Optional<DictionaryPage> readDictionaryPage(byte[] data, CompressionCodecName codecName) {
  try {
    ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
    PageHeader pageHeader = Util.readPageHeader(inputStream);

    if (pageHeader.type != PageType.DICTIONARY_PAGE) {
      return Optional.empty();
    }

    Slice compressedData = wrappedBuffer(data, data.length - inputStream.available(),
        pageHeader.getCompressed_page_size());
    DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
    ParquetEncoding encoding = getParquetEncoding(Encoding.valueOf(dicHeader.getEncoding().name()));
    int dictionarySize = dicHeader.getNum_values();

    return Optional.of(new DictionaryPage(
        decompress(codecName, compressedData, pageHeader.getUncompressed_page_size()),
        dictionarySize, encoding));
  } catch (IOException ignored) {
    return Optional.empty();
  }
}
Example #3
Source File: ParquetFileWriter.java, from parquet-mr (Apache License 2.0)
private static void serializeBloomFilters(
    List<Map<String, BloomFilter>> bloomFilters,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) {
      continue;
    }
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }
      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
Example #4
Source File: ParquetFileWriter.java, from parquet-mr (Apache License 2.0)
private static void serializeOffsetIndexes(
    List<List<OffsetIndex>> offsetIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: offset indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
      if (offsetIndex == null) {
        continue;
      }
      ColumnChunkMetaData column = columns.get(cIndex);
      long offset = out.getPos();
      Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out);
      column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
Example #5
Source File: ParquetFileWriter.java, from parquet-mr (Apache License 2.0)
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
          .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
      if (columnIndex == null) {
        continue;
      }
      long offset = out.getPos();
      Util.writeColumnIndex(columnIndex, out);
      column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
Example #6
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)
/**
 * Reads and decompresses a dictionary page for the given column chunk.
 *
 * Returns null if the given column chunk has no dictionary page.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY)
      && !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  if (f.getPos() != meta.getStartingPos()) {
    f.seek(meta.getStartingPos());
  }

  PageHeader pageHeader = Util.readPageHeader(f);
  if (!pageHeader.isSetDictionary_page_header()) {
    return null; // TODO: should this complain?
  }

  DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
  BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
      compressedPage.getDictionarySize(),
      compressedPage.getEncoding());
}
Example #7
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)
protected PageHeader readPageHeader() throws IOException {
  PageHeader pageHeader;
  stream.mark(8192); // headers should not be larger than 8k
  try {
    pageHeader = Util.readPageHeader(stream);
  } catch (IOException e) {
    // this is to workaround a bug where the compressedLength of the chunk
    // is missing the size of the header of the dictionary.
    // to allow reading older files (using a dictionary) we need this.
    // usually 13 to 19 bytes are missing.
    // if the last page is smaller than this, the page header itself is truncated in the buffer.
    stream.reset(); // resetting the buffer to the position before we got the error
    LOG.info("completing the column chunk to read the page header");
    // trying again from the buffer + remainder of the stream
    pageHeader = Util.readPageHeader(new SequenceInputStream(stream, f));
  }
  return pageHeader;
}
Example #8
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)
/**
 * @param column the column chunk which the column index is to be returned for
 * @return the column index for the specified column chunk or {@code null} if there is no index
 * @throws IOException if any I/O error occurs during reading the file
 */
@Private
public ColumnIndex readColumnIndex(ColumnChunkMetaData column) throws IOException {
  IndexReference ref = column.getColumnIndexReference();
  if (ref == null) {
    return null;
  }
  f.seek(ref.getOffset());
  return ParquetMetadataConverter.fromParquetColumnIndex(column.getPrimitiveType(),
      Util.readColumnIndex(f));
}
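For illustration, a hypothetical caller of this method might look like the sketch below (the file path and class name are made up; also note the @Private annotation above, which marks the method as accessible but not a stable API). The same pattern applies to readOffsetIndex in Example #10.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;

public class ColumnIndexProbe {
  public static void main(String[] args) throws Exception {
    // Hypothetical path; any Parquet file written with column indexes will do.
    try (ParquetFileReader reader = ParquetFileReader.open(
        HadoopInputFile.fromPath(new Path("/tmp/example.parquet"), new Configuration()))) {
      // Read the index for the first column chunk of the first row group.
      ColumnChunkMetaData column =
          reader.getFooter().getBlocks().get(0).getColumns().get(0);
      ColumnIndex index = reader.readColumnIndex(column); // null when no index was written
      if (index != null) {
        System.out.println("null counts per page: " + index.getNullCounts());
      }
    }
  }
}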
Example #9
Source File: TestParquetMetadataConverter.java, from parquet-mr (Apache License 2.0)
@Test
public void testParquetMetadataConverterWithDictionary() throws IOException {
  ParquetMetadata parquetMetaData = createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // Flag should be true
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertTrue(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
      new ByteArrayInputStream(metaDataOutputStream.toByteArray());

  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata parquetMetaDataConverted = converter.fromParquetMetadata(fmd2);

  long dicOffsetOriginal = parquetMetaData.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();
  long dicOffsetConverted = parquetMetaDataConverted.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();

  Assert.assertEquals(dicOffsetOriginal, dicOffsetConverted);
}
Example #10
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)
/**
 * @param column the column chunk which the offset index is to be returned for
 * @return the offset index for the specified column chunk or {@code null} if there is no index
 * @throws IOException if any I/O error occurs during reading the file
 */
@Private
public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException {
  IndexReference ref = column.getOffsetIndexReference();
  if (ref == null) {
    return null;
  }
  f.seek(ref.getOffset());
  return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f));
}
Example #11
Source File: PageReader.java, from Bats (Apache License 2.0)
/**
 * Get the page header and the pageData (uncompressed) for the next page
 */
protected void nextInternal() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  // next, we need to decompress the bytes
  // TODO - figure out if we need multiple dictionary pages, I believe it may be limited to one
  // I think we are clobbering parts of the dictionary if there can be multiple pages of dictionary
  do {
    long start = dataReader.getPos();
    timer.start();
    pageHeader = Util.readPageHeader(dataReader);
    long timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    long pageHeaderBytes = dataReader.getPos() - start;
    this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
    logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}", "Page Header Read", "",
        this.parentColumnReader.parentReader.hadoopPath,
        this.parentColumnReader.columnDescriptor.toString(), start, 0, 0, timeToRead);
    timer.reset();
    if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {
      readDictionaryPage(pageHeader, parentColumnReader);
    }
  } while (pageHeader.getType() == PageType.DICTIONARY_PAGE);

  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  pageData = readPage(pageHeader, compressedSize, uncompressedSize);
}
Example #12
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)
/**
 * Reads Bloom filter data for the given column chunk.
 *
 * @param meta a column's ColumnChunkMetaData to read the Bloom filter from
 * @return a BloomFilter object
 * @throws IOException if there is an error while reading the Bloom filter
 */
public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException {
  long bloomFilterOffset = meta.getBloomFilterOffset();
  f.seek(bloomFilterOffset);
  BloomFilterHeader bloomFilterHeader;

  // Read the Bloom filter data header.
  try {
    bloomFilterHeader = Util.readBloomFilterHeader(f);
  } catch (IOException e) {
    LOG.warn("read no bloom filter");
    return null;
  }

  int numBytes = bloomFilterHeader.getNumBytes();
  if (numBytes <= 0 || numBytes > BlockSplitBloomFilter.UPPER_BOUND_BYTES) {
    LOG.warn("the read bloom filter size is wrong, size is {}", bloomFilterHeader.getNumBytes());
    return null;
  }

  if (!bloomFilterHeader.getHash().isSetXXHASH()
      || !bloomFilterHeader.getAlgorithm().isSetBLOCK()
      || !bloomFilterHeader.getCompression().isSetUNCOMPRESSED()) {
    LOG.warn("the read bloom filter is not supported yet, algorithm = {}, hash = {}, compression = {}",
        bloomFilterHeader.getAlgorithm(), bloomFilterHeader.getHash(),
        bloomFilterHeader.getCompression());
    return null;
  }

  byte[] bitset = new byte[numBytes];
  f.readFully(bitset);
  return new BlockSplitBloomFilter(bitset);
}
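As a hedged usage sketch (the file path, class name, and probe value are made up; note that this version of readBloomFilter seeks to the filter offset without validating it, so the guard below assumes a negative getBloomFilterOffset() means no filter was written):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.api.Binary;

public class BloomFilterProbe {
  public static void main(String[] args) throws Exception {
    try (ParquetFileReader reader = ParquetFileReader.open(
        HadoopInputFile.fromPath(new Path("/tmp/example.parquet"), new Configuration()))) {
      ColumnChunkMetaData column =
          reader.getFooter().getBlocks().get(0).getColumns().get(0);
      // Assumption: a negative offset means no Bloom filter was written for this chunk.
      if (column.getBloomFilterOffset() < 0) {
        return;
      }
      BloomFilter bloom = reader.readBloomFilter(column); // null if unreadable or unsupported
      if (bloom != null) {
        // A Bloom filter can only prove absence: false means definitely not present.
        boolean maybePresent = bloom.findHash(bloom.hash(Binary.fromString("some-value")));
        System.out.println("value possibly present: " + maybePresent);
      }
    }
  }
}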
Example #13
Source File: ColumnChunkIncReadStore.java, from dremio-oss (Apache License 2.0)
@Override
public DictionaryPage readDictionaryPage() {
  if (dictionaryPage == null) {
    PageHeader pageHeader = new PageHeader();
    long pos = 0;
    try {
      pos = in.getPos();
      pageHeader = Util.readPageHeader(in.asSeekableInputStream());
      if (pageHeader.getDictionary_page_header() == null) {
        in.seek(pos);
        return null;
      }
      dictionaryPage = readDictionaryPageHelper(pageHeader);
    } catch (Exception e) {
      throw new RuntimeException("Error reading dictionary page." +
          "\nFile path: " + path.toURI().getPath() +
          "\nRow count: " + rowCount +
          "\nColumn Chunk Metadata: " + metaData +
          "\nPage Header: " + pageHeader +
          "\nFile offset: " + fileOffset +
          "\nSize: " + size +
          "\nValue read so far: " + valueReadSoFar +
          "\nPosition: " + pos, e);
    }
  }
  return dictionaryPage;
}
Example #14
Source File: ParquetWriter.java, from presto (Apache License 2.0)
static Slice getFooter(List<RowGroup> rowGroups, MessageType messageType) throws IOException {
  FileMetaData fileMetaData = new FileMetaData();
  fileMetaData.setVersion(1);
  fileMetaData.setSchema(MessageTypeConverter.toParquetSchema(messageType));

  long totalRows = rowGroups.stream().mapToLong(RowGroup::getNum_rows).sum();
  fileMetaData.setNum_rows(totalRows);
  fileMetaData.setRow_groups(ImmutableList.copyOf(rowGroups));

  DynamicSliceOutput dynamicSliceOutput = new DynamicSliceOutput(40);
  Util.writeFileMetaData(fileMetaData, dynamicSliceOutput);
  return dynamicSliceOutput.slice();
}
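Note that getFooter returns only the Thrift-serialized FileMetaData. A complete Parquet file frames this between a leading "PAR1" magic, the data pages, a 4-byte little-endian footer length, and a trailing "PAR1". The helper below is a sketch of that tail framing, not Presto's actual writer code (the method and class names are illustrative):

import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import io.airlift.slice.Slice;

public class FooterFraming {
  // Sketch: append a serialized footer to the end of a Parquet output stream.
  static void writeFileTail(OutputStream out, Slice footer) throws IOException {
    out.write(footer.getBytes());
    // The footer length counts only the serialized FileMetaData bytes and is
    // written as a 4-byte little-endian int just before the closing magic.
    out.write(ByteBuffer.allocate(4)
        .order(ByteOrder.LITTLE_ENDIAN)
        .putInt(footer.length())
        .array());
    out.write("PAR1".getBytes(StandardCharsets.US_ASCII));
  }
}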
Example #15
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)
protected PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(stream);
}
Example #16
Source File: ColumnDataReader.java, from dremio-oss (Apache License 2.0)
public PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(input);
}
Example #17
Source File: ParquetColumnChunk.java, from presto (Apache License 2.0)
protected PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(input);
}
Example #18
Source File: CompressionConverter.java, from parquet-mr (Apache License 2.0)
public PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(f);
}
Example #19
Source File: ColumnDataReader.java, from Bats (Apache License 2.0)
public PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(input);
}