parquet.hadoop.metadata.ColumnChunkMetaData Java Examples

The following examples show how to use parquet.hadoop.metadata.ColumnChunkMetaData. Each example is taken from an open-source project; the original source file and its license are noted above the code.
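Before the examples, here is a minimal sketch of where ColumnChunkMetaData instances usually come from: the file footer exposes one per column per row group. This is an illustration only, assuming the pre-relocation parquet-mr API (package parquet.*); readFooter's exact signature varies across versions, and the file path is a placeholder.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class ColumnChunkMetaDataDemo {
    public static void main(String[] args) throws IOException {
        // Read only the footer; no row data is touched. The path is a placeholder.
        ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), new Path(args[0]));
        // Each row group (block) carries one ColumnChunkMetaData per column.
        for (BlockMetaData block : footer.getBlocks()) {
            for (ColumnChunkMetaData column : block.getColumns()) {
                System.out.println(column.getPath() + ": " + column.getValueCount()
                        + " values, " + column.getTotalSize() + " compressed bytes");
            }
        }
    }
}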
Example #1
Source File: ParquetReader.java    From paraflow with Apache License 2.0
public Block readPrimitive(ColumnDescriptor columnDescriptor, Type type, IntList offsets)
        throws IOException
{
    ParquetColumnReader columnReader = columnReadersMap.get(columnDescriptor);
    if (columnReader.getPageReader() == null) {
        // Lazily initialize the page reader: the column chunk is read in full,
        // into a single byte[], the first time this column is requested.
        validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");
        ColumnChunkMetaData metadata = getColumnChunkMetaData(columnDescriptor);
        long startingPosition = metadata.getStartingPos();
        int totalSize = checkedCast(metadata.getTotalSize());
        byte[] buffer = new byte[totalSize];
        dataSource.readFully(startingPosition, buffer);
        // Decode every page in the chunk and hand them to the column reader.
        ParquetColumnChunkDescriptor descriptor = new ParquetColumnChunkDescriptor(columnDescriptor, metadata, totalSize);
        ParquetColumnChunk columnChunk = new ParquetColumnChunk(descriptor, buffer, 0);
        columnReader.setPageReader(columnChunk.readAllPages());
    }
    return columnReader.readPrimitive(type, offsets);
}
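Only two pieces of ColumnChunkMetaData are needed to locate the raw bytes here: getStartingPos() gives the chunk's offset in the file, and getTotalSize() gives how much to read. Everything else (page headers, encodings) is decoded from the buffer itself by readAllPages().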
 
Example #2
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0
public static void showDetails(PrettyPrintWriter out, List<ColumnChunkMetaData> ccmeta) {
  // Group the flat list of column chunks into a tree keyed by column path,
  // so nested fields print under their parent group.
  Map<String,Object> chunks = new LinkedHashMap<String,Object>();
  for (ColumnChunkMetaData cmeta : ccmeta) {
    String[] path = cmeta.getPath().toArray();

    // Walk (creating as needed) one nested map per path segment, stopping
    // just before the leaf.
    Map<String,Object> current = chunks;
    for (int i = 0; i < path.length - 1; ++i) {
      String next = path[i];
      if (!current.containsKey(next)) {
        current.put(next, new LinkedHashMap<String,Object>());
      }

      current = (Map<String,Object>)current.get(next);
    }

    // The leaf maps the final path segment to its metadata.
    current.put(path[path.length - 1], cmeta);
  }

  showColumnChunkDetails(out, chunks, 0);
}
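The resulting tree holds either nested Map values (intermediate schema groups) or ColumnChunkMetaData leaves; showColumnChunkDetails in Example #5 tells the two apart with an instanceof check. To drive these helpers, parquet-tools builds a PrettyPrintWriter via its builder and feeds it each row group's chunks. A hedged sketch, assuming the builder API used by parquet-tools' commands (method names may differ in your version):

PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
        .withAutoColumn()
        .withColumnPadding(1)
        .build();
for (BlockMetaData block : footer.getBlocks()) {
    MetadataUtils.showDetails(out, block.getColumns());
}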
 
Example #3
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta, boolean name) {
  long doff = meta.getDictionaryPageOffset();
  long foff = meta.getFirstDataPageOffset();
  long tsize = meta.getTotalSize();
  long usize = meta.getTotalUncompressedSize();
  long count = meta.getValueCount();
  // Compression ratio: uncompressed bytes per compressed byte.
  double ratio = usize / (double)tsize;
  String encodings = Joiner.on(',').skipNulls().join(meta.getEncodings());

  if (name) {
    String path = Joiner.on('.').skipNulls().join(meta.getPath());
    out.format("%s: ", path);
  }

  out.format(" %s", meta.getType());
  out.format(" %s", meta.getCodec());
  out.format(" DO:%d", doff);
  out.format(" FPO:%d", foff);
  out.format(" SZ:%d/%d/%.2f", tsize, usize, ratio);
  out.format(" VC:%d", count);
  if (!encodings.isEmpty()) out.format(" ENC:%s", encodings);
  out.println();
}
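Each chunk prints on one line, and the abbreviations map directly to the getters above: DO is the dictionary page offset, FPO the first data page offset, SZ the compressed size, uncompressed size, and compression ratio, VC the value count, and ENC the comma-separated encodings. With illustrative values only, a line might look like:

id:  INT64 SNAPPY DO:4 FPO:120 SZ:1234/5678/4.60 VC:1000 ENC:PLAIN,RLE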
 
Example #4
Source File: ParquetReader.java    From paraflow with Apache License 2.0
private ColumnChunkMetaData getColumnChunkMetaData(ColumnDescriptor columnDescriptor)
        throws IOException
{
    // Linear scan over the row group's column chunks, matching on the dotted column path.
    for (ColumnChunkMetaData metadata : currentBlockMetadata.getColumns()) {
        if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
            return metadata;
        }
    }
    throw new ParquetCorruptionException("Metadata is missing for column: %s", columnDescriptor);
}
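Note that this lookup scans the row group's columns on every call; a caller resolving many columns per row group could instead build a Map<ColumnPath, ColumnChunkMetaData> once, though for typical schema widths the scan is cheap.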
 
Example #5
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0
private static void showColumnChunkDetails(PrettyPrintWriter out, Map<String,Object> current, int depth) {
  for (Map.Entry<String,Object> entry : current.entrySet()) {
    // Indent nested fields with one leading dot per level of depth.
    String name = Strings.repeat(".", depth) + entry.getKey();
    Object value = entry.getValue();

    if (value instanceof Map) {
      // Intermediate group: print its name and recurse into its children.
      out.println(name + ": ");
      showColumnChunkDetails(out, (Map<String,Object>)value, depth + 1);
    } else {
      // Leaf: print the chunk's details on the same line (see Example #3).
      out.print(name + ": ");
      showDetails(out, (ColumnChunkMetaData)value, false);
    }
  }
}
 
Example #6
Source File: ParquetMetadataReader.java    From paraflow with Apache License 2.0
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file)
        throws IOException
{
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC

        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;

        // The 4-byte metadata length sits immediately before the trailing magic.
        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);

        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));

        // The footer metadata starts metadataLength bytes before its length field.
        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(
                metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
                "Corrupted Parquet file: %s metadata index: %s out of range",
                file,
                metadataIndex);
        inputStream.seek(metadataIndex);
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            // Convert each Thrift RowGroup into parquet-mr's BlockMetaData,
            // building one ColumnChunkMetaData per column chunk below.
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }

        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
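The arithmetic at the top follows the trailer layout in the comment: MAGIC is the 4-byte PAR1 marker and PARQUET_METADATA_LENGTH is the 4-byte length field. For an illustrative 1,000-byte file whose length field reads 200, metadataLengthIndex = 1000 - 4 - 4 = 992 and metadataIndex = 992 - 200 = 792.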
 
Example #7
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0
public static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta) {
  showDetails(out, meta, true);
}