Java Code Examples for org.apache.parquet.schema.MessageType#getFields()

The following examples show how to use org.apache.parquet.schema.MessageType#getFields(). You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: ParquetSchemaUtil.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Prunes columns from a Parquet file schema that was written without field ids.
 * <p>
 * Files that were written without field ids are read assuming that schema evolution preserved
 * column order. Deleting columns was not allowed.
 * <p>
 * The order of columns in the resulting Parquet schema matches the Parquet file.
 *
 * @param fileSchema schema from a Parquet file that does not have field ids.
 * @param expectedSchema expected schema
 * @return a parquet schema pruned using the expected schema
 */
public static MessageType pruneColumnsFallback(MessageType fileSchema, Schema expectedSchema) {
  // Collect the ids of all columns the caller expects to read.
  Set<Integer> selectedIds = Sets.newHashSet();
  expectedSchema.columns().forEach(column -> selectedIds.add(column.fieldId()));

  MessageTypeBuilder builder = org.apache.parquet.schema.Types.buildMessage();

  // Fallback ids are the 1-based positions of the fields in the file schema.
  List<Type> fileFields = fileSchema.getFields();
  for (int pos = 0; pos < fileFields.size(); pos += 1) {
    int fallbackId = pos + 1;
    if (selectedIds.contains(fallbackId)) {
      builder.addField(fileFields.get(pos).withId(fallbackId));
    }
  }

  return builder.named(fileSchema.getName());
}
 
Example 2
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
@Test
public void testPruneNestedParentColumn() throws Exception {
  // Create the input Parquet file and a destination for the pruned output.
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove the parent column "Links". All of its children will be removed with it.
  String[] cargs = {inputFile, outputFile, "Links"};
  executeCommandLine(cargs);

  // Verify the schema is unchanged for the columns that were not pruned.
  // Note: assertEquals takes the expected value first, then the actual value.
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(3, fields.size());
  assertEquals("DocId", fields.get(0).getName());
  assertEquals("Name", fields.get(1).getName());
  assertEquals("Gender", fields.get(2).getName());

  // Verify the data are unchanged for the columns that were not pruned.
  List<String> prunePaths = Arrays.asList("Links");
  validateColumns(inputFile, prunePaths);
}
 
Example 3
Source File: ParquetSchemaUtil.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Prunes columns from a Parquet file schema that was written without field ids.
 * <p>
 * Files that were written without field ids are read assuming that schema evolution preserved
 * column order. Deleting columns was not allowed.
 * <p>
 * The order of columns in the resulting Parquet schema matches the Parquet file.
 *
 * @param fileSchema schema from a Parquet file that does not have field ids.
 * @param expectedSchema expected schema
 * @return a parquet schema pruned using the expected schema
 */
public static MessageType pruneColumnsFallback(MessageType fileSchema, Schema expectedSchema) {
  // Ids of all columns requested by the expected schema.
  Set<Integer> requestedIds = Sets.newHashSet();
  for (Types.NestedField column : expectedSchema.columns()) {
    requestedIds.add(column.fieldId());
  }

  MessageTypeBuilder prunedMessage = org.apache.parquet.schema.Types.buildMessage();

  // Fallback ids are assigned by field position, starting at 1.
  int nextId = 1;
  for (Type fileField : fileSchema.getFields()) {
    if (requestedIds.contains(nextId)) {
      prunedMessage.addField(fileField.withId(nextId));
    }
    nextId++;
  }

  return prunedMessage.named(fileSchema.getName());
}
 
Example 4
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
@Test
public void testPruneMultiColumns() throws Exception {
  // Create the input Parquet file and a destination for the pruned output.
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove two top-level columns in a single command invocation.
  String[] cargs = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema is unchanged for the columns that were not pruned.
  // Note: assertEquals takes the expected value first, then the actual value.
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(2, fields.size());
  assertEquals("DocId", fields.get(0).getName());
  assertEquals("Links", fields.get(1).getName());
  // The surviving group column keeps all of its children.
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(2, subFields.size());
  assertEquals("Backward", subFields.get(0).getName());
  assertEquals("Forward", subFields.get(1).getName());

  // Verify the data are unchanged for the columns that were not pruned.
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
 
Example 5
Source File: HiveSchemaUtil.java    From hudi with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the equivalent Hive table schema read from a parquet file.
 *
 * @param messageType : Parquet Schema
 * @return : Hive Table schema read from parquet file MAP[String,String]
 */
public static Map<String, String> convertParquetSchemaToHiveSchema(MessageType messageType) throws IOException {
  // LinkedHashMap preserves the Parquet column order in the resulting Hive schema.
  Map<String, String> schema = new LinkedHashMap<>();
  for (Type parquetField : messageType.getFields()) {
    // REPEATED fields map to Hive arrays; everything else converts directly.
    String hiveType;
    if (parquetField.isRepetition(Type.Repetition.REPEATED)) {
      hiveType = String.valueOf(createHiveArray(parquetField, ""));
    } else {
      hiveType = String.valueOf(convertField(parquetField));
    }

    schema.put(hiveCompatibleFieldName(parquetField.getName(), false), hiveType);
  }
  return schema;
}
 
Example 6
Source File: ParquetConverter.java    From pentaho-hadoop-shims with Apache License 2.0 5 votes vote down vote up
/**
 * Converts the primitive (leaf) fields of a Parquet schema into input fields.
 * Group (nested) fields are skipped.
 */
public static List<IParquetInputField> buildInputFields( MessageType schema ) {
  List<IParquetInputField> fields = new ArrayList<>();

  for ( Type field : schema.getFields() ) {
    if ( !field.isPrimitive() ) {
      continue; // only leaf columns are converted
    }
    fields.add( convertField( field ) );
  }

  return fields;
}
 
Example 7
Source File: SchemaConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an Arrow Schema from an Parquet one and returns the mapping
 * @param parquetSchema the provided Parquet Schema
 * @return the mapping between the 2
 */
public SchemaMapping fromParquet(MessageType parquetSchema) {
  // Map every top-level Parquet field, then derive the Arrow fields from the mappings.
  List<TypeMapping> typeMappings = fromParquet(parquetSchema.getFields());
  Schema arrowSchema = new Schema(fields(typeMappings));
  return new SchemaMapping(arrowSchema, parquetSchema, typeMappings);
}
 
Example 8
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
@Test
public void testPruneNestedColumn() throws Exception {
  // Create the input Parquet file and a destination for the pruned output.
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove a single nested column using its dotted path.
  String[] cargs = {inputFile, outputFile, "Links.Backward"};
  executeCommandLine(cargs);

  // Verify the schema is unchanged for the columns that were not pruned.
  // Note: assertEquals takes the expected value first, then the actual value.
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(4, fields.size());
  assertEquals("DocId", fields.get(0).getName());
  assertEquals("Name", fields.get(1).getName());
  assertEquals("Gender", fields.get(2).getName());
  assertEquals("Links", fields.get(3).getName());
  // The parent group survives with only the remaining child.
  List<Type> subFields = fields.get(3).asGroupType().getFields();
  assertEquals(1, subFields.size());
  assertEquals("Forward", subFields.get(0).getName());

  // Verify the data are unchanged for the columns that were not pruned.
  List<String> prunePaths = Arrays.asList("Links.Backward");
  validateColumns(inputFile, prunePaths);
}
 
Example 9
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
@Test
public void testPruneOneColumn() throws Exception {
  // Create the input Parquet file and a destination for the pruned output.
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove a single top-level column.
  String[] cargs = {inputFile, outputFile, "Gender"};
  executeCommandLine(cargs);

  // Verify the schema is unchanged for the columns that were not pruned.
  // Note: assertEquals takes the expected value first, then the actual value.
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(3, fields.size());
  assertEquals("DocId", fields.get(0).getName());
  assertEquals("Name", fields.get(1).getName());
  assertEquals("Links", fields.get(2).getName());
  // The untouched group column keeps all of its children.
  List<Type> subFields = fields.get(2).asGroupType().getFields();
  assertEquals(2, subFields.size());
  assertEquals("Backward", subFields.get(0).getName());
  assertEquals("Forward", subFields.get(1).getName());

  // Verify the data are unchanged for the columns that were not pruned.
  List<String> prunePaths = Arrays.asList("Gender");
  validateColumns(inputFile, prunePaths);
}
 
Example 10
Source File: PruneColumnsCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a new message schema with the given column paths removed.
 * The traversal starts from an empty path at the schema root.
 */
private MessageType pruneColumnsInSchema(MessageType schema, Set<ColumnPath> prunePaths) {
  List<Type> keptFields =
      pruneColumnsInFields(schema.getFields(), new ArrayList<>(), prunePaths);
  return new MessageType(schema.getName(), keptFields);
}
 
Example 11
Source File: ParquetSchemaUtil.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a copy of the file schema where every top-level field carries its
 * 1-based position as a fallback field id.
 */
public static MessageType addFallbackIds(MessageType fileSchema) {
  MessageTypeBuilder builder = org.apache.parquet.schema.Types.buildMessage();

  List<Type> fields = fileSchema.getFields();
  for (int pos = 0; pos < fields.size(); pos++) {
    // ids are assigned starting at 1
    builder.addField(fields.get(pos).withId(pos + 1));
  }

  return builder.named(fileSchema.getName());
}
 
Example 12
Source File: ParquetFileLineFetcher.java    From hugegraph-loader with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the header row from a Parquet schema: one entry per top-level
 * field, in schema order.
 */
private String[] parseHeader(MessageType schema) {
    return schema.getFields().stream()
                 .map(Type::getName)
                 .toArray(String[]::new);
}
 
Example 13
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a read schema containing only the fields whose corresponding
 * tuple-description column is projected, preserving the original order.
 */
private MessageType buildReadSchema(MessageType originalSchema) {
    List<Type> allFields = originalSchema.getFields();
    List<Type> keptFields = new ArrayList<>();
    int columnCount = context.getTupleDescription().size();
    for (int col = 0; col < columnCount; col++) {
        if (context.getTupleDescription().get(col).isProjected()) {
            keptFields.add(allFields.get(col));
        }
    }
    return new MessageType(originalSchema.getName(), keptFields);
}
 
Example 14
Source File: ParquetSchemaUtil.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a copy of the file schema with fallback ids assigned to the
 * top-level fields by position.
 */
public static MessageType addFallbackIds(MessageType fileSchema) {
  MessageTypeBuilder withIds = org.apache.parquet.schema.Types.buildMessage();

  int nextId = 1; // ids are assigned starting at 1
  for (Type field : fileSchema.getFields()) {
    withIds.addField(field.withId(nextId));
    nextId++;
  }

  return withIds.named(fileSchema.getName());
}
 
Example 15
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Prints the details of every top-level field of the schema, starting at
 * depth 0 with an empty column path.
 */
static void showDetails(PrettyPrintWriter out, MessageType type, boolean showOriginalTypes) {
  List<String> columnPath = new ArrayList<>();
  for (Type field : type.getFields()) {
    showDetails(out, field, 0, type, columnPath, showOriginalTypes);
  }
}
 
Example 16
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Prints the details of every top-level field of the schema, starting at
 * depth 0 with an empty column path.
 */
public static void showDetails(PrettyPrintWriter out, MessageType type) {
  List<String> columnPath = new ArrayList<>();
  for (Type field : type.getFields()) {
    showDetails(out, field, 0, type, columnPath);
  }
}
 
Example 17
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Check whether any of columns in the given list is either nested or repetitive.
 *
 * @param footer  Parquet file schema
 * @param columns list of query SchemaPath objects
 * @return true if at least one queried column is nested (a group / non-leaf path)
 *         or repeated (max repetition level above 0); false otherwise
 */
public static boolean containsComplexColumn(ParquetMetadata footer, List<SchemaPath> columns) {

  MessageType schema = footer.getFileMetaData().getSchema();

  if (Utilities.isStarQuery(columns)) {
    // Star query selects everything: complex if any top-level field is a group type...
    for (Type type : schema.getFields()) {
      if (!type.isPrimitive()) {
        return true;
      }
    }
    // ...or if any leaf column is repeated (max repetition level > 0).
    for (ColumnDescriptor col : schema.getColumns()) {
      if (col.getMaxRepetitionLevel() > 0) {
        return true;
      }
    }
    return false;
  } else {
    // Case-insensitive lookups: both maps are keyed by lower-cased column paths.
    Map<String, ColumnDescriptor> colDescMap = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
    Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

    for (SchemaPath schemaPath : columns) {
      // Schema path which is non-leaf is complex column
      if (!schemaPath.isLeaf()) {
        logger.trace("rowGroupScan contains complex column: {}", schemaPath.getUnIndexed().toString());
        return true;
      }

      // following column descriptor lookup failure may mean two cases, depending on subsequent SchemaElement lookup:
      // 1. success: queried column is complex, i.e. GroupType
      // 2. failure: queried column is not in schema and thus is non-complex
      ColumnDescriptor column = colDescMap.get(schemaPath.getUnIndexed().toString().toLowerCase());

      if (column == null) {
        // No leaf descriptor; if a schema element still exists, the column is a group (complex).
        SchemaElement schemaElement = schemaElements.get(schemaPath.getUnIndexed().toString().toLowerCase());
        if (schemaElement != null) {
          return true;
        }
      } else {
        // Leaf column found: complex only when it is repeated.
        if (column.getMaxRepetitionLevel() > 0) {
          logger.trace("rowGroupScan contains repetitive column: {}", schemaPath.getUnIndexed().toString());
          return true;
        }
      }
    }
  }
  return false;
}