Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData#setRowCount()

The following examples show how to use org.apache.parquet.hadoop.metadata.BlockMetaData#setRowCount(). Each example is taken from an open source project; the original project and source file are noted above the code.
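Before the project examples, a minimal sketch of the basic pattern (the values are made up): BlockMetaData is the mutable holder for one row group's metadata, and setRowCount(long) records how many rows that row group contains.

import org.apache.parquet.hadoop.metadata.BlockMetaData;

public class RowCountSketch {
  public static void main(String[] args) {
    // Hypothetical values: one row group holding 1,000 rows and ~64 KiB of data
    BlockMetaData block = new BlockMetaData();
    block.setRowCount(1_000L);           // rows in this row group
    block.setTotalByteSize(64 * 1024L);  // total uncompressed byte size
    System.out.println("rows = " + block.getRowCount());
  }
}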
Example 1
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@Test
public void testZeroRecordFileParquet() {
  Assume.assumeTrue(format == FileFormat.PARQUET);
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = shouldReadParquet(expr, true, parquetSchema, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
 
Example 2
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@Test
public void testZeroRecordFile() {
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, expr)
        .shouldRead(PARQUET_SCHEMA, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
 
Example 3
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0
public static BlockMetaData makeBlockFromStats(IntStatistics stats, long valueCount) {
  BlockMetaData blockMetaData = new BlockMetaData();

  // Single INT32 column "foo" carrying the given statistics; the 100L arguments are
  // placeholder values for the page offsets and compressed/uncompressed sizes
  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      100L, 100L, valueCount, 100L, 100L);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(200L);
  blockMetaData.setRowCount(valueCount);
  return blockMetaData;
}
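A hypothetical call site for the helper above (fragment; names and values are illustrative, and the usual parquet-mr imports such as org.apache.parquet.column.statistics.IntStatistics are assumed):

// Hypothetical usage: build statistics for an INT32 column, then wrap them in a row group
IntStatistics stats = new IntStatistics();
stats.updateStats(10);
stats.updateStats(90);
BlockMetaData block = makeBlockFromStats(stats, 100L);
// the block now reports 100 rows and carries the "foo" column with the given stats
assert block.getRowCount() == 100L;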
 
Example 4
Source File: MetadataReader.java    From presto with Apache License 2.0
public static ParquetMetadata readFooter(FSDataInputStream inputStream, Path file, long fileSize)
        throws IOException
{
    // Parquet File Layout:
    //
    // MAGIC
    // variable: Data
    // variable: Metadata
    // 4 bytes: MetadataLength
    // MAGIC

    validateParquet(fileSize >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
    long metadataLengthIndex = fileSize - PARQUET_METADATA_LENGTH - MAGIC.length;

    InputStream footerStream = readFully(inputStream, metadataLengthIndex, PARQUET_METADATA_LENGTH + MAGIC.length);
    int metadataLength = readIntLittleEndian(footerStream);

    byte[] magic = new byte[MAGIC.length];
    footerStream.read(magic);
    validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));

    long metadataIndex = metadataLengthIndex - metadataLength;
    validateParquet(
            metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
            "Corrupted Parquet file: %s metadata index: %s out of range",
            file,
            metadataIndex);
    InputStream metadataStream = readFully(inputStream, metadataIndex, metadataLength);
    FileMetaData fileMetaData = readFileMetaData(metadataStream);
    List<SchemaElement> schema = fileMetaData.getSchema();
    validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

    MessageType messageType = readParquetSchema(schema);
    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                validateParquet(
                        (filePath == null && columnChunk.getFile_path() == null)
                                || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                        "all column chunks of the same row group must be in the same file");
                ColumnMetaData metaData = columnChunk.meta_data;
                String[] path = metaData.path_in_schema.stream()
                        .map(value -> value.toLowerCase(Locale.ENGLISH))
                        .toArray(String[]::new);
                ColumnPath columnPath = ColumnPath.get(path);
                PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType();
                ColumnChunkMetaData column = ColumnChunkMetaData.get(
                        columnPath,
                        primitiveType,
                        CompressionCodecName.fromParquet(metaData.codec),
                        PARQUET_METADATA_CONVERTER.convertEncodingStats(metaData.encoding_stats),
                        readEncodings(metaData.encodings),
                        readStats(Optional.ofNullable(fileMetaData.getCreated_by()), Optional.ofNullable(metaData.statistics), primitiveType),
                        metaData.data_page_offset,
                        metaData.dictionary_page_offset,
                        metaData.num_values,
                        metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }

    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
    if (keyValueList != null) {
        for (KeyValue keyValue : keyValueList) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    return new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
}
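A hypothetical caller of readFooter (fragment; the path is made up, and it assumes the returned ParquetMetadata is the standard org.apache.parquet.hadoop.metadata.ParquetMetadata, so getBlocks() exposes the row groups built above):

// Hypothetical usage: open a Parquet file via a Hadoop FileSystem and sum the row counts
Configuration conf = new Configuration();
Path file = new Path("/tmp/example.parquet");   // hypothetical path
FileSystem fs = file.getFileSystem(conf);
long fileSize = fs.getFileStatus(file).getLen();
try (FSDataInputStream inputStream = fs.open(file)) {
    ParquetMetadata metadata = readFooter(inputStream, file, fileSize);
    long totalRows = 0;
    for (BlockMetaData block : metadata.getBlocks()) {
        totalRows += block.getRowCount();   // set per row group by setRowCount(...) above
    }
    System.out.println("total rows: " + totalRows);
}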
 
Example 5
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
  MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  List<RowGroup> row_groups = parquetMetadata.getRow_groups();
  if (row_groups != null) {
    for (RowGroup rowGroup : row_groups) {
      BlockMetaData blockMetaData = new BlockMetaData();
      blockMetaData.setRowCount(rowGroup.getNum_rows());
      blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
      List<ColumnChunk> columns = rowGroup.getColumns();
      String filePath = columns.get(0).getFile_path();
      for (ColumnChunk columnChunk : columns) {
        if ((filePath == null && columnChunk.getFile_path() != null)
            || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
          throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
        }
        ColumnMetaData metaData = columnChunk.meta_data;
        ColumnPath path = getPath(metaData);
        ColumnChunkMetaData column = ColumnChunkMetaData.get(
            path,
            messageType.getType(path.toArray()).asPrimitiveType(),
            fromFormatCodec(metaData.codec),
            convertEncodingStats(metaData.getEncoding_stats()),
            fromFormatEncodings(metaData.encodings),
            fromParquetStatistics(
                parquetMetadata.getCreated_by(),
                metaData.statistics,
                messageType.getType(path.toArray()).asPrimitiveType()),
            metaData.data_page_offset,
            metaData.dictionary_page_offset,
            metaData.num_values,
            metaData.total_compressed_size,
            metaData.total_uncompressed_size);
        column.setColumnIndexReference(toColumnIndexReference(columnChunk));
        column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
        column.setBloomFilterOffset(metaData.bloom_filter_offset);
        // TODO
        // index_page_offset
        // key_value_metadata
        blockMetaData.addColumn(column);
      }
      blockMetaData.setPath(filePath);
      blocks.add(blockMetaData);
    }
  }
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
  if (key_value_metadata != null) {
    for (KeyValue keyValue : key_value_metadata) {
      keyValueMetaData.put(keyValue.key, keyValue.value);
    }
  }
  return new ParquetMetadata(
      new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by()),
      blocks);
}
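A hypothetical round trip through the converter (fragment; the path is made up, and it assumes a raw thrift-encoded footer is available and is deserialized with org.apache.parquet.format.Util):

// Hypothetical usage: deserialize a thrift FileMetaData, then convert it to parquet-mr metadata
try (InputStream in = Files.newInputStream(Paths.get("/tmp/footer.thrift"))) {  // hypothetical footer bytes
  org.apache.parquet.format.FileMetaData thriftMetadata = Util.readFileMetaData(in);
  ParquetMetadata metadata = new ParquetMetadataConverter().fromParquetMetadata(thriftMetadata);
  for (BlockMetaData block : metadata.getBlocks()) {
    System.out.println("row group rows: " + block.getRowCount());  // populated by setRowCount above
  }
}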