Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData#addColumn()

The following examples show how to use org.apache.parquet.hadoop.metadata.BlockMetaData#addColumn(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.
Example 1
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a block containing a single GZIP-compressed, PLAIN-encoded INT32
 * column chunk at path "foo" that carries the supplied statistics.
 *
 * @param stats statistics attached to the column chunk
 * @param valueCount passed as the chunk's third numeric argument and also
 *        used as the block's row count
 * @return a block with one column, a total byte size of 200 and
 *         {@code valueCount} rows
 */
public static BlockMetaData makeBlockFromStats(IntStatistics stats, long valueCount) {
  HashSet<Encoding> plainOnly = new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN));

  // The remaining numeric arguments are fixed placeholder values of 100.
  ColumnChunkMetaData fooChunk = ColumnChunkMetaData.get(
      ColumnPath.get("foo"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      plainOnly,
      stats,
      100L, 100L, valueCount, 100L, 100L);

  BlockMetaData block = new BlockMetaData();
  block.addColumn(fooChunk);
  block.setTotalByteSize(200L);
  block.setRowCount(valueCount);
  return block;
}
 
Example 2
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a block containing a single GZIP-compressed, PLAIN-encoded BINARY
 * column chunk at path "foo" with empty statistics.
 *
 * @param start passed as the chunk's first numeric argument
 * @param compressedBlockSize compressed size of the chunk; the uncompressed
 *        size is taken to be twice this value
 * @return a block with one column whose total byte size is the uncompressed size
 */
private BlockMetaData newBlock(long start, long compressedBlockSize) {
  // Assume a fixed compression ratio of 2.
  long uncompressedSize = compressedBlockSize * 2;

  HashSet<Encoding> plainOnly = new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN));
  ColumnChunkMetaData fooChunk = ColumnChunkMetaData.get(
      ColumnPath.get("foo"),
      PrimitiveTypeName.BINARY,
      CompressionCodecName.GZIP,
      plainOnly,
      new BinaryStatistics(),
      start, 0L, 0L, compressedBlockSize, uncompressedSize);

  BlockMetaData block = new BlockMetaData();
  block.addColumn(fooChunk);
  block.setTotalByteSize(uncompressedSize);
  return block;
}
 
Example 3
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@link ParquetMetadata} holding one block with a single
 * uncompressed INT32 column chunk at path "col", whose encoding statistics
 * are populated from the given encodings.
 *
 * @param dicEncoding dictionary-page encoding to record, or {@code null} to
 *        record none
 * @param dataEncoding data-page encoding to record
 * @return metadata for the schema
 *         {@code message schema { optional int32 col (INT_32); }} with one block
 */
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding,
  Encoding dataEncoding) {
  MessageType schema =
    parseMessageType("message schema { optional int32 col (INT_32); }");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
    new org.apache.parquet.hadoop.metadata.FileMetaData(schema,
      new HashMap<String, String>(), null);

  // Accumulate the encoding statistics; build() is called exactly once, after
  // every encoding has been added. (The original also called build() right
  // after addDictEncoding and threw the result away.)
  EncodingStats.Builder builder = new EncodingStats.Builder();
  if (dicEncoding != null) {
    builder.addDictEncoding(dicEncoding);
  }
  builder.addDataEncoding(dataEncoding);
  EncodingStats es = builder.build();

  // The plain encodings set is intentionally left empty; only the
  // EncodingStats carry encoding information.
  Set<org.apache.parquet.column.Encoding> e =
    new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.INT32;
  ColumnPath p = ColumnPath.get("col");
  CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md =
    ColumnChunkMetaData.get(p, t, c, es, e, s, 20, 30, 0, 0, 0);

  BlockMetaData blockMetaData = new BlockMetaData();
  blockMetaData.addColumn(md);
  List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
  blockMetaDataList.add(blockMetaData);
  return new ParquetMetadata(fileMetaData, blockMetaDataList);
}
 
Example 4
Source File: MetadataReader.java    From presto with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the footer at the end of a Parquet file and converts it into a
 * {@link ParquetMetadata}.
 *
 * <p>The trailing length field and magic bytes are read first and validated,
 * then the metadata section they point at is read and its row groups, column
 * chunks and key/value pairs are converted. Column path segments are
 * lower-cased with {@link Locale#ENGLISH} before being looked up in the schema.
 *
 * @param inputStream stream the footer bytes are read from
 * @param file path of the file, used only in validation messages
 * @param fileSize total size of the file in bytes
 * @return the converted schema, blocks, key/value metadata and created-by string
 * @throws IOException declared by the signature; presumably raised by the
 *         underlying reads
 */
public static ParquetMetadata readFooter(FSDataInputStream inputStream, Path file, long fileSize)
        throws IOException
{
    // Parquet file layout, front to back:
    //   MAGIC | Data (variable) | Metadata (variable) | MetadataLength (4 bytes) | MAGIC

    validateParquet(fileSize >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
    long lengthFieldOffset = fileSize - PARQUET_METADATA_LENGTH - MAGIC.length;

    // Read the length field and the trailing magic in one go.
    InputStream trailer = readFully(inputStream, lengthFieldOffset, PARQUET_METADATA_LENGTH + MAGIC.length);
    int footerLength = readIntLittleEndian(trailer);

    byte[] trailingMagic = new byte[MAGIC.length];
    trailer.read(trailingMagic);
    validateParquet(Arrays.equals(MAGIC, trailingMagic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(trailingMagic));

    // The metadata must start after the leading magic and before the length field.
    long footerOffset = lengthFieldOffset - footerLength;
    boolean footerOffsetInRange = footerOffset >= MAGIC.length && footerOffset < lengthFieldOffset;
    validateParquet(
            footerOffsetInRange,
            "Corrupted Parquet file: %s metadata index: %s out of range",
            file,
            footerOffset);

    FileMetaData fileMetaData = readFileMetaData(readFully(inputStream, footerOffset, footerLength));
    List<SchemaElement> schemaElements = fileMetaData.getSchema();
    validateParquet(!schemaElements.isEmpty(), "Empty Parquet schema in file: %s", file);
    MessageType messageType = readParquetSchema(schemaElements);

    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData block = new BlockMetaData();
            block.setRowCount(rowGroup.getNum_rows());
            block.setTotalByteSize(rowGroup.getTotal_byte_size());

            List<ColumnChunk> chunks = rowGroup.getColumns();
            validateParquet(!chunks.isEmpty(), "No columns in row group: %s", rowGroup);

            // Every chunk must report the same file path as the first one.
            String filePath = chunks.get(0).getFile_path();
            for (ColumnChunk chunk : chunks) {
                String chunkFilePath = chunk.getFile_path();
                boolean sameFile = (filePath == null)
                        ? chunkFilePath == null
                        : filePath.equals(chunkFilePath);
                validateParquet(sameFile, "all column chunks of the same row group must be in the same file");

                ColumnMetaData metaData = chunk.meta_data;

                // Lower-case each path segment before resolving it against the schema.
                List<String> rawSegments = metaData.path_in_schema;
                String[] segments = new String[rawSegments.size()];
                int next = 0;
                for (String segment : rawSegments) {
                    segments[next++] = segment.toLowerCase(Locale.ENGLISH);
                }
                ColumnPath columnPath = ColumnPath.get(segments);
                PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType();

                ColumnChunkMetaData column = ColumnChunkMetaData.get(
                        columnPath,
                        primitiveType,
                        CompressionCodecName.fromParquet(metaData.codec),
                        PARQUET_METADATA_CONVERTER.convertEncodingStats(metaData.encoding_stats),
                        readEncodings(metaData.encodings),
                        readStats(Optional.ofNullable(fileMetaData.getCreated_by()), Optional.ofNullable(metaData.statistics), primitiveType),
                        metaData.data_page_offset,
                        metaData.dictionary_page_offset,
                        metaData.num_values,
                        metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                block.addColumn(column);
            }
            block.setPath(filePath);
            blocks.add(block);
        }
    }

    // Flatten the optional key/value list into a map.
    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValues = fileMetaData.getKey_value_metadata();
    if (keyValues != null) {
        keyValues.forEach(entry -> keyValueMetaData.put(entry.key, entry.value));
    }

    org.apache.parquet.hadoop.metadata.FileMetaData convertedFileMetaData =
            new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by());
    return new ParquetMetadata(convertedFileMetaData, blocks);
}
 
Example 5
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Converts a format-level {@code FileMetaData} into a {@link ParquetMetadata}.
 *
 * <p>Each row group becomes one {@link BlockMetaData}; each of its column
 * chunks becomes a {@link ColumnChunkMetaData} carrying its column-index
 * reference, offset-index reference and bloom filter offset. The optional
 * key/value list is copied into a map.
 *
 * @param parquetMetadata the format-level file metadata to convert
 * @return the converted schema, blocks, key/value metadata and created-by string
 * @throws IOException declared by the signature
 * @throws ParquetDecodingException if the column chunks of a row group do not
 *         all report the same file path
 */
public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
  MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());

  List<BlockMetaData> blocks = new ArrayList<>();
  List<RowGroup> rowGroups = parquetMetadata.getRow_groups();
  if (rowGroups != null) {
    for (RowGroup rowGroup : rowGroups) {
      BlockMetaData block = new BlockMetaData();
      block.setRowCount(rowGroup.getNum_rows());
      block.setTotalByteSize(rowGroup.getTotal_byte_size());

      List<ColumnChunk> chunks = rowGroup.getColumns();
      // The first chunk's file path is the reference every other chunk must match.
      String filePath = chunks.get(0).getFile_path();
      for (ColumnChunk chunk : chunks) {
        String chunkFilePath = chunk.getFile_path();
        boolean sameFile = (filePath == null)
            ? chunkFilePath == null
            : filePath.equals(chunkFilePath);
        if (!sameFile) {
          throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
        }

        ColumnMetaData metaData = chunk.meta_data;
        ColumnPath path = getPath(metaData);
        ColumnChunkMetaData column = ColumnChunkMetaData.get(
            path,
            messageType.getType(path.toArray()).asPrimitiveType(),
            fromFormatCodec(metaData.codec),
            convertEncodingStats(metaData.getEncoding_stats()),
            fromFormatEncodings(metaData.encodings),
            fromParquetStatistics(
                parquetMetadata.getCreated_by(),
                metaData.statistics,
                messageType.getType(path.toArray()).asPrimitiveType()),
            metaData.data_page_offset,
            metaData.dictionary_page_offset,
            metaData.num_values,
            metaData.total_compressed_size,
            metaData.total_uncompressed_size);
        column.setColumnIndexReference(toColumnIndexReference(chunk));
        column.setOffsetIndexReference(toOffsetIndexReference(chunk));
        column.setBloomFilterOffset(metaData.bloom_filter_offset);
        // TODO: not yet converted
        //   index_page_offset
        //   key_value_metadata
        block.addColumn(column);
      }
      block.setPath(filePath);
      blocks.add(block);
    }
  }

  // Flatten the optional key/value list into a map.
  Map<String, String> keyValueMetaData = new HashMap<>();
  List<KeyValue> keyValues = parquetMetadata.getKey_value_metadata();
  if (keyValues != null) {
    for (KeyValue entry : keyValues) {
      keyValueMetaData.put(entry.key, entry.value);
    }
  }

  org.apache.parquet.hadoop.metadata.FileMetaData convertedFileMetaData =
      new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by());
  return new ParquetMetadata(convertedFileMetaData, blocks);
}