Java Code Examples for org.apache.parquet.hadoop.metadata.ColumnPath#get()

The following examples show how to use org.apache.parquet.hadoop.metadata.ColumnPath#get(). Each example is taken from an open source project; the source file and the project it comes from are noted above the code.
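Before the project examples, a minimal self-contained sketch of the call itself may help: it builds paths from varargs and from a String[], and reads them back. The class name is invented for illustration; the ColumnPath methods used (get, toDotString, toArray, size, equals) are the ones the examples below rely on.

import org.apache.parquet.hadoop.metadata.ColumnPath;

public class ColumnPathGetSketch {
  public static void main(String[] args) {
    // Build a path from individual segments, e.g. the nested field "address.city".
    ColumnPath nested = ColumnPath.get("address", "city");

    // Build a path from an existing String[] (the common case below, where the array
    // comes from a ColumnDescriptor or from the Thrift path_in_schema field).
    String[] segments = {"user", "id"};
    ColumnPath fromArray = ColumnPath.get(segments);

    System.out.println(nested.toDotString());                    // address.city
    System.out.println(fromArray.size());                        // 2
    System.out.println(String.join("/", fromArray.toArray()));   // user/id

    // Equal paths compare equal, so ColumnPath works as a key in sets and maps,
    // which is how most of the examples below use it.
    System.out.println(ColumnPath.get("address", "city").equals(nested)); // true
  }
}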
Example 1
Source File: PruneColumnsCommand.java    From parquet-mr with Apache License 2.0
private Type pruneColumnsInField(Type field, List<String> currentPath, Set<ColumnPath> prunePaths) {
  String fieldName = field.getName();
  currentPath.add(fieldName);
  ColumnPath path = ColumnPath.get(currentPath.toArray(new String[0]));
  Type prunedField = null;
  if (!prunePaths.contains(path)) {
    if (field.isPrimitive()) {
      prunedField = field;
    } else {
      List<Type> childFields = ((GroupType) field).getFields();
      List<Type> prunedFields = pruneColumnsInFields(childFields, currentPath, prunePaths);
      if (prunedFields.size() > 0) {
        prunedField = ((GroupType) field).withNewFields(prunedFields);
      }
    } 
  }

  currentPath.remove(fieldName);
  return prunedField;
}
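The method above delegates to pruneColumnsInFields, which is not part of the snippet. A plausible minimal sketch of that companion method, assuming it simply recurses into every child and drops the subtrees that were pruned away entirely, is:

private List<Type> pruneColumnsInFields(List<Type> fields, List<String> currentPath, Set<ColumnPath> prunePaths) {
  List<Type> prunedFields = new ArrayList<>();
  for (Type childField : fields) {
    // Recurse into each child; a null result means the whole subtree was pruned.
    Type prunedChild = pruneColumnsInField(childField, currentPath, prunePaths);
    if (prunedChild != null) {
      prunedFields.add(prunedChild);
    }
  }
  return prunedFields;
}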
 
Example 2
Source File: FilteringGroupConverter.java    From parquet-mr with Apache License 2.0
@Override
public Converter getConverter(int fieldIndex) {

  // get the real converter from the delegate
  Converter delegateConverter = Objects.requireNonNull(delegate.getConverter(fieldIndex), "delegate converter cannot be null");

  // determine the indexFieldPath for the converter proxy we're about to make, which is
  // this converter's path + the requested fieldIndex
  List<Integer> newIndexFieldPath = new ArrayList<>(indexFieldPath.size() + 1);
  newIndexFieldPath.addAll(indexFieldPath);
  newIndexFieldPath.add(fieldIndex);

  if (delegateConverter.isPrimitive()) {
    PrimitiveColumnIO columnIO = getColumnIO(newIndexFieldPath);
    ColumnPath columnPath = ColumnPath.get(columnIO.getColumnDescriptor().getPath());
    ValueInspector[] valueInspectors = getValueInspectors(columnPath);
    return new FilteringPrimitiveConverter(delegateConverter.asPrimitiveConverter(), valueInspectors);
  } else {
    return new FilteringGroupConverter(delegateConverter.asGroupConverter(), newIndexFieldPath, valueInspectorsByColumn, columnIOsByIndexFieldPath);
  }

}
 
Example 3
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * start a column inside a block
 * @param descriptor the column descriptor
 * @param valueCount the value count in this column
 * @param compressionCodecName a compression codec name
 * @throws IOException if there is an error while writing
 */
public void startColumn(ColumnDescriptor descriptor,
                        long valueCount,
                        CompressionCodecName compressionCodecName) throws IOException {
  state = state.startColumn();
  encodingStatsBuilder.clear();
  currentEncodings = new HashSet<Encoding>();
  currentChunkPath = ColumnPath.get(descriptor.getPath());
  currentChunkType = descriptor.getPrimitiveType();
  currentChunkCodec = compressionCodecName;
  currentChunkValueCount = valueCount;
  currentChunkFirstDataPage = out.getPos();
  compressedLength = 0;
  uncompressedLength = 0;
  // The statistics will be copied from the first one added at writeDataPage(s) so we have the correct typed one
  currentStatistics = null;

  columnIndexBuilder = ColumnIndexBuilder.getBuilder(currentChunkType, columnIndexTruncateLength);
  offsetIndexBuilder = OffsetIndexBuilder.getBuilder();
  firstPageOffset = -1;
}
 
Example 4
Source File: SchemaCompatibilityValidator.java    From parquet-mr with Apache License 2.0
private SchemaCompatibilityValidator(MessageType schema) {
  for (ColumnDescriptor cd : schema.getColumns()) {
    ColumnPath columnPath = ColumnPath.get(cd.getPath());
    columnsAccordingToSchema.put(columnPath, cd);
  }
}
 
Example 5
Source File: IncrementallyUpdatedFilterPredicateBuilderBase.java    From parquet-mr with Apache License 2.0
public IncrementallyUpdatedFilterPredicateBuilderBase(List<PrimitiveColumnIO> leaves) {
  for (PrimitiveColumnIO leaf : leaves) {
    ColumnDescriptor descriptor = leaf.getColumnDescriptor();
    ColumnPath path = ColumnPath.get(descriptor.getPath());
    PrimitiveComparator<?> comparator = descriptor.getPrimitiveType().comparator();
    comparatorsByColumn.put(path, comparator);
  }
}
 
Example 6
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private ColumnChunkMetaData createColumnChunkMetaData() {
  Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.BINARY;
  ColumnPath p = ColumnPath.get("foo");
  CompressionCodecName c = CompressionCodecName.GZIP;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, e, s,
          0, 0, 0, 0, 0);
  return md;
}
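Because the chunk metadata is keyed by its ColumnPath, a path built with ColumnPath.get() can also be used to find a chunk again inside a row group. The helper below is an illustrative sketch (the method name is ours); it only uses getters that BlockMetaData and ColumnChunkMetaData expose.

private static ColumnChunkMetaData findColumn(BlockMetaData block, String... pathSegments) {
  ColumnPath wanted = ColumnPath.get(pathSegments);
  for (ColumnChunkMetaData column : block.getColumns()) {
    // ColumnPath compares by value, so a chunk written under the same path matches here.
    if (wanted.equals(column.getPath())) {
      return column;
    }
  }
  return null;
}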
 
Example 7
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding,
  Encoding dataEncoding) {
  MessageType schema =
    parseMessageType("message schema { optional int32 col (INT_32); }");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
    new org.apache.parquet.hadoop.metadata.FileMetaData(schema,
      new HashMap<String, String>(), null);
  List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
  BlockMetaData blockMetaData = new BlockMetaData();
  EncodingStats.Builder builder = new EncodingStats.Builder();
  if (dicEncoding != null) {
    builder.addDictEncoding(dicEncoding);
  }
  builder.addDataEncoding(dataEncoding);
  EncodingStats es = builder.build();
  Set<org.apache.parquet.column.Encoding> e =
    new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.INT32;
  ColumnPath p = ColumnPath.get("col");
  CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md =
    ColumnChunkMetaData.get(p, t, c, es, e, s, 20, 30, 0, 0, 0);
  blockMetaData.addColumn(md);
  blockMetaDataList.add(blockMetaData);
  return new ParquetMetadata(fileMetaData, blockMetaDataList);
}
 
Example 8
Source File: MetadataReader.java    From presto with Apache License 2.0
public static ParquetMetadata readFooter(FSDataInputStream inputStream, Path file, long fileSize)
        throws IOException
{
    // Parquet File Layout:
    //
    // MAGIC
    // variable: Data
    // variable: Metadata
    // 4 bytes: MetadataLength
    // MAGIC

    validateParquet(fileSize >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
    long metadataLengthIndex = fileSize - PARQUET_METADATA_LENGTH - MAGIC.length;

    InputStream footerStream = readFully(inputStream, metadataLengthIndex, PARQUET_METADATA_LENGTH + MAGIC.length);
    int metadataLength = readIntLittleEndian(footerStream);

    byte[] magic = new byte[MAGIC.length];
    footerStream.read(magic);
    validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));

    long metadataIndex = metadataLengthIndex - metadataLength;
    validateParquet(
            metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
            "Corrupted Parquet file: %s metadata index: %s out of range",
            file,
            metadataIndex);
    InputStream metadataStream = readFully(inputStream, metadataIndex, metadataLength);
    FileMetaData fileMetaData = readFileMetaData(metadataStream);
    List<SchemaElement> schema = fileMetaData.getSchema();
    validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

    MessageType messageType = readParquetSchema(schema);
    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                validateParquet(
                        (filePath == null && columnChunk.getFile_path() == null)
                                || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                        "all column chunks of the same row group must be in the same file");
                ColumnMetaData metaData = columnChunk.meta_data;
                String[] path = metaData.path_in_schema.stream()
                        .map(value -> value.toLowerCase(Locale.ENGLISH))
                        .toArray(String[]::new);
                ColumnPath columnPath = ColumnPath.get(path);
                PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType();
                ColumnChunkMetaData column = ColumnChunkMetaData.get(
                        columnPath,
                        primitiveType,
                        CompressionCodecName.fromParquet(metaData.codec),
                        PARQUET_METADATA_CONVERTER.convertEncodingStats(metaData.encoding_stats),
                        readEncodings(metaData.encodings),
                        readStats(Optional.ofNullable(fileMetaData.getCreated_by()), Optional.ofNullable(metaData.statistics), primitiveType),
                        metaData.data_page_offset,
                        metaData.dictionary_page_offset,
                        metaData.num_values,
                        metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }

    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
    if (keyValueList != null) {
        for (KeyValue keyValue : keyValueList) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    return new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
}
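Each ColumnChunkMetaData created this way carries the ColumnPath built from path_in_schema, so the converted footer can be walked by path afterwards. A short usage sketch (the method name is ours):

private static void printColumnChunks(ParquetMetadata metadata) {
  for (BlockMetaData block : metadata.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      // getPath() returns the ColumnPath that was created with ColumnPath.get(path) above.
      System.out.printf("%s codec=%s values=%d%n",
          column.getPath().toDotString(),
          column.getCodec(),
          column.getValueCount());
    }
  }
}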
 
Example 9
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private static ColumnPath getPath(ColumnMetaData metaData) {
  String[] path = metaData.path_in_schema.toArray(new String[0]);
  return ColumnPath.get(path);
}