org.apache.parquet.format.FileMetaData Java Examples

The following examples show how to use org.apache.parquet.format.FileMetaData, the Thrift-generated class that represents a Parquet file footer. Each example notes the original project and source file it was taken from.
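For orientation, here is a minimal, self-contained sketch that constructs the Thrift-generated FileMetaData directly and serializes it with Util.writeFileMetaData, the same call several of the examples below rely on. The field values are placeholders chosen for illustration, not taken from any of the projects listed here.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.RowGroup;
import org.apache.parquet.format.SchemaElement;
import org.apache.parquet.format.Util;

public class FileMetaDataSketch {
  public static void main(String[] args) throws IOException {
    // Required Thrift fields, in constructor order: version, schema, num_rows, row_groups
    // (the same constructor used in Examples #2 and #4 below).
    List<SchemaElement> schema = new ArrayList<>();
    schema.add(new SchemaElement("root"));         // the flattened schema starts with a root element
    List<RowGroup> rowGroups = new ArrayList<>();  // empty file: no row groups

    FileMetaData fileMetaData = new FileMetaData(1, schema, 0L, rowGroups);
    fileMetaData.setCreated_by("filemetadata-sketch");

    // Thrift-serialize the footer, as in Examples #5 and #13.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Util.writeFileMetaData(fileMetaData, out);
    System.out.println("serialized footer bytes: " + out.size());
  }
}
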
Example #1
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
/**
 * Verifies that the splits form a partition of the row groups:
 * every row group appears in exactly one split.
 * @param md the file metadata whose row groups are filtered
 * @param splitWidth the width of each split range
 */
private void verifyAllFilters(FileMetaData md, long splitWidth) {
  Set<Long> offsetsFound = new TreeSet<Long>();
  for (long start = 0; start < fileSize(md); start += splitWidth) {
    FileMetaData filtered = filter(md, start, start + splitWidth);
    for (RowGroup rg : filtered.getRow_groups()) {
      long o = getOffset(rg);
      if (offsetsFound.contains(o)) {
        fail("found the offset twice: " + o);
      } else {
        offsetsFound.add(o);
      }
    }
  }
  if (offsetsFound.size() != md.row_groups.size()) {
    fail("missing row groups, "
        + "found: " + offsetsFound
        + "\nexpected " + md.getRow_groups());
  }
}
 
Example #2
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());

  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));

  return fileMetaData;
}
 
Example #3
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
static FileMetaData filterFileMetaDataByMidpoint(FileMetaData metaData, RangeMetadataFilter filter) {
  List<RowGroup> rowGroups = metaData.getRow_groups();
  List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
  for (RowGroup rowGroup : rowGroups) {
    long totalSize = 0;
    long startIndex = getOffset(rowGroup.getColumns().get(0));
    for (ColumnChunk col : rowGroup.getColumns()) {
      totalSize += col.getMeta_data().getTotal_compressed_size();
    }
    long midPoint = startIndex + totalSize / 2;
    if (filter.contains(midPoint)) {
      newRowGroups.add(rowGroup);
    }
  }
  metaData.setRow_groups(newRowGroups);
  return metaData;
}
 
Example #4
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private FileMetaData metadata(long... sizes) {
  List<SchemaElement> schema = emptyList();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long offset = 0;
  for (long size : sizes) {
    ColumnChunk columnChunk = new ColumnChunk(offset);
    columnChunk.setMeta_data(new ColumnMetaData(
        INT32,
        Collections.<org.apache.parquet.format.Encoding>emptyList(),
        Collections.<String>emptyList(),
      UNCOMPRESSED, 10L, size * 2, size, offset));
    rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
    offset += size;
  }
  return new FileMetaData(1, schema, sizes.length, rowGroups);
}
 
Example #5
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testParquetMetadataConverterWithoutDictionary()
  throws IOException {
  ParquetMetadata parquetMetaData =
    createParquetMetaData(null, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // dictionary_page_offset must not be set when no dictionary encoding is used
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertFalse(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
    new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata pmd2 = converter.fromParquetMetadata(fmd2);

  long dicOffsetConverted =
    pmd2.getBlocks().get(0).getColumns().get(0).getDictionaryPageOffset();

  Assert.assertEquals(0, dicOffsetConverted);
}
 
Example #6
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0
/**
 * Map full schema paths in format `a`.`b`.`c` to respective SchemaElement objects.
 *
 * @param footer Parquet file metadata
 * @return       schema full path to SchemaElement map
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  Iterator<SchemaElement> iter = fileMetaData.getSchema().iterator();

  // First element in collection is default `root` element. We skip it to maintain key in `a` format instead of `root`.`a`,
  // and thus to avoid the need to cut it out again when comparing with SchemaPath string representation
  if (iter.hasNext()) {
    iter.next();
  }
  while (iter.hasNext()) {
    addSchemaElementMapping(iter, new StringBuilder(), schemaElements);
  }
  return schemaElements;
}
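
A hypothetical call site for the mapping above (the method and variable names here are illustrative, and the code is assumed to live alongside getColNameToSchemaElementMapping in the same class):

// Illustrative only: look up one nested column's SchemaElement by its full
// back-quoted path; the footer is assumed to have been read elsewhere.
public static SchemaElement findSchemaElement(ParquetMetadata footer, String fullPath) {
  Map<String, SchemaElement> elements = getColNameToSchemaElementMapping(footer);
  return elements.get(fullPath);   // e.g. "`a`.`b`.`c`"; the root element is not part of the key
}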
 
Example #7
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testParquetMetadataConverterWithDictionary()
  throws IOException {
  ParquetMetadata parquetMetaData =
    createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // dictionary_page_offset must be set when dictionary encoding is used
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertTrue(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
    new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata parquetMetaDataConverted =
    converter.fromParquetMetadata(fmd2);

  long dicOffsetOriginal =
    parquetMetaData.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();
  long dicOffsetConverted =
    parquetMetaDataConverted.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();

  Assert.assertEquals(dicOffsetOriginal, dicOffsetConverted);
}
 
Example #8
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding,
  Encoding dataEncoding) {
  MessageType schema =
    parseMessageType("message schema { optional int32 col (INT_32); }");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
    new org.apache.parquet.hadoop.metadata.FileMetaData(schema,
      new HashMap<String, String>(), null);
  List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
  BlockMetaData blockMetaData = new BlockMetaData();
  EncodingStats.Builder builder = new EncodingStats.Builder();
  if (dicEncoding != null) {
    builder.addDictEncoding(dicEncoding);
  }
  builder.addDataEncoding(dataEncoding);
  EncodingStats es = builder.build();
  Set<org.apache.parquet.column.Encoding> e =
    new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.INT32;
  ColumnPath p = ColumnPath.get("col");
  CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md =
    ColumnChunkMetaData.get(p, t, c, es, e, s, 20, 30, 0, 0, 0);
  blockMetaData.addColumn(md);
  blockMetaDataList.add(blockMetaData);
  return new ParquetMetadata(fileMetaData, blockMetaDataList);
}
 
Example #9
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private long fileSize(FileMetaData md) {
  long size = 0;
  for (RowGroup rg : md.getRow_groups()) {
    size += rg.total_byte_size;
  }
  return size;
}
 
Example #10
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private void verifyMD(FileMetaData md, long... offsets) {
  assertEquals(offsets.length, md.row_groups.size());
  for (int i = 0; i < offsets.length; i++) {
    long offset = offsets[i];
    RowGroup rowGroup = md.getRow_groups().get(i);
    assertEquals(offset, getOffset(rowGroup));
  }
}
 
Example #11
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
static FileMetaData filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) {
  List<RowGroup> rowGroups = metaData.getRow_groups();
  List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
  for (RowGroup rowGroup : rowGroups) {
    long startIndex = getOffset(rowGroup.getColumns().get(0));
    if (filter.contains(startIndex)) {
      newRowGroups.add(rowGroup);
    }
  }
  metaData.setRow_groups(newRowGroups);
  return metaData;
}
 
Example #12
Source File: ParquetReaderUtility.java    From dremio-oss with Apache License 2.0
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  HashMap<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (SchemaElement se : fileMetaData.getSchema()) {
    schemaElements.put(se.getName(), se);
  }
  return schemaElements;
}
 
Example #13
Source File: ParquetWriter.java    From presto with Apache License 2.0
static Slice getFooter(List<RowGroup> rowGroups, MessageType messageType)
        throws IOException
{
    FileMetaData fileMetaData = new FileMetaData();
    fileMetaData.setVersion(1);
    fileMetaData.setSchema(MessageTypeConverter.toParquetSchema(messageType));
    long totalRows = rowGroups.stream().mapToLong(RowGroup::getNum_rows).sum();
    fileMetaData.setNum_rows(totalRows);
    fileMetaData.setRow_groups(ImmutableList.copyOf(rowGroups));

    DynamicSliceOutput dynamicSliceOutput = new DynamicSliceOutput(40);
    Util.writeFileMetaData(fileMetaData, dynamicSliceOutput);
    return dynamicSliceOutput.slice();
}
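
Example #19 below reads this footer back from the end of the file, where it is framed as the Thrift-serialized FileMetaData followed by its 4-byte little-endian length and the PAR1 magic. A rough sketch of that framing, with the class, helper, and constant names chosen for illustration rather than taken from Presto:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;

import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.Util;

public class FooterFraming {
  private static final byte[] MAGIC = "PAR1".getBytes(StandardCharsets.US_ASCII);

  // Returns the bytes that terminate a Parquet file:
  // serialized FileMetaData, 4-byte little-endian metadata length, magic.
  static byte[] trailer(FileMetaData fileMetaData) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Util.writeFileMetaData(fileMetaData, out);
    int metadataLength = out.size();
    out.write(ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN).putInt(metadataLength).array());
    out.write(MAGIC);
    return out.toByteArray();
  }
}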
 
Example #14
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
  MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  List<RowGroup> row_groups = parquetMetadata.getRow_groups();
  if (row_groups != null) {
    for (RowGroup rowGroup : row_groups) {
      BlockMetaData blockMetaData = new BlockMetaData();
      blockMetaData.setRowCount(rowGroup.getNum_rows());
      blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
      List<ColumnChunk> columns = rowGroup.getColumns();
      String filePath = columns.get(0).getFile_path();
      for (ColumnChunk columnChunk : columns) {
        if ((filePath == null && columnChunk.getFile_path() != null)
            || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
          throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
        }
        ColumnMetaData metaData = columnChunk.meta_data;
        ColumnPath path = getPath(metaData);
        ColumnChunkMetaData column = ColumnChunkMetaData.get(
            path,
            messageType.getType(path.toArray()).asPrimitiveType(),
            fromFormatCodec(metaData.codec),
            convertEncodingStats(metaData.getEncoding_stats()),
            fromFormatEncodings(metaData.encodings),
            fromParquetStatistics(
                parquetMetadata.getCreated_by(),
                metaData.statistics,
                messageType.getType(path.toArray()).asPrimitiveType()),
            metaData.data_page_offset,
            metaData.dictionary_page_offset,
            metaData.num_values,
            metaData.total_compressed_size,
            metaData.total_uncompressed_size);
        column.setColumnIndexReference(toColumnIndexReference(columnChunk));
        column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
        column.setBloomFilterOffset(metaData.bloom_filter_offset);
        // TODO
        // index_page_offset
        // key_value_metadata
        blockMetaData.addColumn(column);
      }
      blockMetaData.setPath(filePath);
      blocks.add(blockMetaData);
    }
  }
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
  if (key_value_metadata != null) {
    for (KeyValue keyValue : key_value_metadata) {
      keyValueMetaData.put(keyValue.key, keyValue.value);
    }
  }
  return new ParquetMetadata(
      new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by()),
      blocks);
}
 
Example #15
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private FileMetaData filter(FileMetaData md, long start, long end) {
  return filterFileMetaDataByMidpoint(new FileMetaData(md),
      new ParquetMetadataConverter.RangeMetadataFilter(start, end));
}
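
Combined with metadata() from Example #4 and verifyMD() from Example #10, a sketch of how these private helpers might be exercised inside the same test class (the assertions are illustrative, not copied from the upstream test):

@Test
public void testRangeFilterSketch() {
  // Three row groups of sizes 10, 20 and 30 start at offsets 0, 10 and 30,
  // so their midpoints are 5, 20 and 45.
  FileMetaData md = metadata(10, 20, 30);
  verifyMD(filter(md, 0, 15), 0);         // only midpoint 5 falls in [0, 15)
  verifyMD(filter(md, 15, 60), 10, 30);   // midpoints 20 and 45 fall in [15, 60)
}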
 
Example #16
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private FileMetaData find(FileMetaData md, Long... blockStart) {
  return filterFileMetaDataByStart(new FileMetaData(md),
      new ParquetMetadataConverter.OffsetMetadataFilter(
          Sets.newHashSet((Long[]) blockStart)));
}
 
Example #17
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private FileMetaData find(FileMetaData md, long blockStart) {
  return filterFileMetaDataByStart(new FileMetaData(md),
      new ParquetMetadataConverter.OffsetMetadataFilter(
          Sets.newHashSet(blockStart)));
}
 
Example #18
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private static void addKeyValue(FileMetaData fileMetaData, String key, String value) {
  KeyValue keyValue = new KeyValue(key);
  keyValue.value = value;
  fileMetaData.addToKey_value_metadata(keyValue);
}
 
Example #19
Source File: MetadataReader.java    From presto with Apache License 2.0
public static ParquetMetadata readFooter(FSDataInputStream inputStream, Path file, long fileSize)
        throws IOException
{
    // Parquet File Layout:
    //
    // MAGIC
    // variable: Data
    // variable: Metadata
    // 4 bytes: MetadataLength
    // MAGIC

    validateParquet(fileSize >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
    long metadataLengthIndex = fileSize - PARQUET_METADATA_LENGTH - MAGIC.length;

    InputStream footerStream = readFully(inputStream, metadataLengthIndex, PARQUET_METADATA_LENGTH + MAGIC.length);
    int metadataLength = readIntLittleEndian(footerStream);

    byte[] magic = new byte[MAGIC.length];
    footerStream.read(magic);
    validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));

    long metadataIndex = metadataLengthIndex - metadataLength;
    validateParquet(
            metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
            "Corrupted Parquet file: %s metadata index: %s out of range",
            file,
            metadataIndex);
    InputStream metadataStream = readFully(inputStream, metadataIndex, metadataLength);
    FileMetaData fileMetaData = readFileMetaData(metadataStream);
    List<SchemaElement> schema = fileMetaData.getSchema();
    validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

    MessageType messageType = readParquetSchema(schema);
    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                validateParquet(
                        (filePath == null && columnChunk.getFile_path() == null)
                                || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                        "all column chunks of the same row group must be in the same file");
                ColumnMetaData metaData = columnChunk.meta_data;
                String[] path = metaData.path_in_schema.stream()
                        .map(value -> value.toLowerCase(Locale.ENGLISH))
                        .toArray(String[]::new);
                ColumnPath columnPath = ColumnPath.get(path);
                PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType();
                ColumnChunkMetaData column = ColumnChunkMetaData.get(
                        columnPath,
                        primitiveType,
                        CompressionCodecName.fromParquet(metaData.codec),
                        PARQUET_METADATA_CONVERTER.convertEncodingStats(metaData.encoding_stats),
                        readEncodings(metaData.encodings),
                        readStats(Optional.ofNullable(fileMetaData.getCreated_by()), Optional.ofNullable(metaData.statistics), primitiveType),
                        metaData.data_page_offset,
                        metaData.dictionary_page_offset,
                        metaData.num_values,
                        metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }

    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
    if (keyValueList != null) {
        for (KeyValue keyValue : keyValueList) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    return new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
}