Java Code Examples for org.apache.iceberg.Schema#columns()

The following examples show how to use org.apache.iceberg.Schema#columns(), which returns a schema's top-level columns as a List of Types.NestedField. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
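
For orientation, here is a minimal sketch of calling columns() directly. The two-column schema is made up for illustration and does not come from any of the projects below.

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class SchemaColumnsSketch {
  public static void main(String[] args) {
    // Hypothetical schema: the field ids, names, and types are illustrative only.
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get()));

    // columns() returns the schema's top-level fields as a List<Types.NestedField>.
    for (Types.NestedField field : schema.columns()) {
      System.out.println(field.fieldId() + ": " + field.name() + " (" + field.type() + ")");
    }
  }
}
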
Example 1
Source File: SparkBenchmarkUtil.java    From iceberg with Apache License 2.0
public static UnsafeProjection projection(Schema expectedSchema, Schema actualSchema) {
  StructType struct = SparkSchemaUtil.convert(actualSchema);

  List<AttributeReference> refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava();
  List<Attribute> attrs = Lists.newArrayListWithExpectedSize(struct.fields().length);
  List<Expression> exprs = Lists.newArrayListWithExpectedSize(struct.fields().length);

  for (AttributeReference ref : refs) {
    attrs.add(ref.toAttribute());
  }

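  // Pick each expected column's expression by matching its name against the actual (read) schema's struct.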
  for (Types.NestedField field : expectedSchema.columns()) {
    int indexInIterSchema = struct.fieldIndex(field.name());
    exprs.add(refs.get(indexInIterSchema));
  }

  return UnsafeProjection.create(
      JavaConverters.asScalaBufferConverter(exprs).asScala().toSeq(),
      JavaConverters.asScalaBufferConverter(attrs).asScala().toSeq());
}
 
Example 2
Source File: RowDataReader.java    From iceberg with Apache License 2.0
private static UnsafeProjection projection(Schema finalSchema, Schema readSchema) {
  StructType struct = SparkSchemaUtil.convert(readSchema);

  List<AttributeReference> refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava();
  List<Attribute> attrs = Lists.newArrayListWithExpectedSize(struct.fields().length);
  List<org.apache.spark.sql.catalyst.expressions.Expression> exprs =
      Lists.newArrayListWithExpectedSize(struct.fields().length);

  for (AttributeReference ref : refs) {
    attrs.add(ref.toAttribute());
  }

  for (Types.NestedField field : finalSchema.columns()) {
    int indexInReadSchema = struct.fieldIndex(field.name());
    exprs.add(refs.get(indexInReadSchema));
  }

  return UnsafeProjection.create(
      JavaConverters.asScalaBufferConverter(exprs).asScala().toSeq(),
      JavaConverters.asScalaBufferConverter(attrs).asScala().toSeq());
}
 
Example 3
Source File: IcebergStorage.java    From iceberg with Apache License 2.0
@Override
public List<String> getPredicateFields(String location, Job job) throws IOException {
  LOG.info("[{}]: getPredicateFields() -> {}", signature, location);
  Schema schema = load(location, job).schema();

  List<String> result = Lists.newArrayList();

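  // Skip complex types; only primitive top-level columns are returned as usable predicate fields.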
  for (Types.NestedField nf : schema.columns()) {
    switch (nf.type().typeId()) {
      case MAP:
      case LIST:
      case STRUCT:
        continue;
      default:
        result.add(nf.name());
    }
  }

  return result;
}
 
Example 4
Source File: ParquetSchemaUtil.java    From iceberg with Apache License 2.0
/**
 * Prunes columns from a Parquet file schema that was written without field ids.
 * <p>
 * Files that were written without field ids are read assuming that schema evolution preserved
 * column order. Deleting columns was not allowed.
 * <p>
 * The order of columns in the resulting Parquet schema matches the Parquet file.
 *
 * @param fileSchema schema from a Parquet file that does not have field ids.
 * @param expectedSchema expected schema
 * @return a parquet schema pruned using the expected schema
 */
public static MessageType pruneColumnsFallback(MessageType fileSchema, Schema expectedSchema) {
  Set<Integer> selectedIds = Sets.newHashSet();

  for (Types.NestedField field : expectedSchema.columns()) {
    selectedIds.add(field.fieldId());
  }

  MessageTypeBuilder builder = org.apache.parquet.schema.Types.buildMessage();

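  // Field ids in the expected schema are assumed to correspond to 1-based column positions in the file, since no ids were written.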
  int ordinal = 1;
  for (Type type : fileSchema.getFields()) {
    if (selectedIds.contains(ordinal)) {
      builder.addField(type.withId(ordinal));
    }
    ordinal += 1;
  }

  return builder.named(fileSchema.getName());
}
 
Example 5
Source File: HiveTypeConverter.java    From metacat with Apache License 2.0
/**
 * Converts an Iceberg schema to a list of field DTOs.
 *
 * @param schema          schema
 * @param partitionFields partitioned fields
 * @return list of field Info
 */
public List<FieldInfo> icebergeSchemaTofieldDtos(final Schema schema,
                                                 final List<PartitionField> partitionFields) {
    final List<FieldInfo> fields = Lists.newArrayList();
    final List<String> partitionNames =
        partitionFields.stream()
            .map(f -> schema.findField(f.sourceId()).name()).collect(Collectors.toList());

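    // Build a FieldInfo per Iceberg column; a column is a partition key if any partition field sources it.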
    for (Types.NestedField field : schema.columns()) {
        final FieldInfo fieldInfo = new FieldInfo();
        fieldInfo.setName(field.name());
        final org.apache.iceberg.types.Type fieldType = field.type();
        fieldInfo.setSourceType(fieldType.toString());
        fieldInfo.setType(toMetacatType(fromIcebergToHiveType(fieldType)));
        fieldInfo.setIsNullable(field.isOptional());
        fieldInfo.setComment(field.doc());
        fieldInfo.setPartitionKey(partitionNames.contains(field.name()));
        fields.add(fieldInfo);
    }

    return fields;
}
 
Example 6
Source File: FilesTable.java    From presto with Apache License 2.0
private static Map<Integer, Type> getIcebergIdToTypeMapping(Schema schema)
{
    ImmutableMap.Builder<Integer, Type> icebergIdToTypeMapping = ImmutableMap.builder();
    for (Types.NestedField field : schema.columns()) {
        populateIcebergIdToTypeMapping(field, icebergIdToTypeMapping);
    }
    return icebergIdToTypeMapping.build();
}
 
Example 7
Source File: ArrowSchemaUtilTest.java    From iceberg with Apache License 2.0
private void validate(Schema iceberg, org.apache.arrow.vector.types.pojo.Schema arrow) {
  Assert.assertEquals(iceberg.columns().size(), arrow.getFields().size());

  for (Types.NestedField nf : iceberg.columns()) {
    Field field = arrow.findField(nf.name());
    Assert.assertNotNull("Missing filed: " + nf, field);
    validate(nf.type(), field, nf.isOptional());
  }
}
 
Example 8
Source File: TypeToMessageType.java    From iceberg with Apache License 2.0
public MessageType convert(Schema schema, String name) {
  Types.MessageTypeBuilder builder = Types.buildMessage();

  for (NestedField field : schema.columns()) {
    builder.addField(field(field));
  }

  return builder.named(AvroSchemaUtil.makeCompatibleName(name));
}
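
A hedged usage sketch for this converter (the schema and the "events" table name are made up; the Iceberg types are fully qualified because Types in the method above refers to org.apache.parquet.schema.Types):

// Minimal sketch, assuming org.apache.iceberg.parquet.TypeToMessageType and the
// Parquet MessageType class are on the classpath.
Schema icebergSchema = new Schema(
    org.apache.iceberg.types.Types.NestedField.required(1, "id",
        org.apache.iceberg.types.Types.LongType.get()),
    org.apache.iceberg.types.Types.NestedField.optional(2, "data",
        org.apache.iceberg.types.Types.StringType.get()));

// convert() walks icebergSchema.columns() and builds the matching Parquet MessageType.
MessageType parquetSchema = new TypeToMessageType().convert(icebergSchema, "events");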
 
Example 9
Source File: IcebergPigInputFormat.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
private boolean advance() throws IOException {
  if (reader != null) {
    reader.close();
  }

  if (!tasks.hasNext()) {
    return false;
  }

  FileScanTask currentTask = tasks.next();

  Schema tableSchema = (Schema) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_SCHEMA)));
  LOG.debug("[{}]: Task table schema: {}", signature, tableSchema);

  List<String> projectedFields =
      (List<String>) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_PROJECTED_FIELDS)));
  LOG.debug("[{}]: Task projected fields: {}", signature, projectedFields);

  Schema projectedSchema = projectedFields != null ? SchemaUtil.project(tableSchema, projectedFields) : tableSchema;

  PartitionSpec spec = currentTask.asFileScanTask().spec();
  DataFile file = currentTask.file();
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());

  Set<Integer> idColumns = spec.identitySourceIds();

  // schema needed for the projection and filtering
  boolean hasJoinedPartitionColumns = !idColumns.isEmpty();

  switch (file.format()) {
    case PARQUET:
      Map<Integer, Object> partitionValueMap = Maps.newHashMap();

      if (hasJoinedPartitionColumns) {

        Schema readSchema = TypeUtil.selectNot(projectedSchema, idColumns);
        Schema projectedPartitionSchema = TypeUtil.select(projectedSchema, idColumns);

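        // Map each projected identity-partition column (by field id) to the value stored in the data file's partition tuple.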
        Map<String, Integer> partitionSpecFieldIndexMap = Maps.newHashMap();
        for (int i = 0; i < spec.fields().size(); i++) {
          partitionSpecFieldIndexMap.put(spec.fields().get(i).name(), i);
        }

        for (Types.NestedField field : projectedPartitionSchema.columns()) {
          int partitionIndex = partitionSpecFieldIndexMap.get(field.name());

          Object partitionValue = file.partition().get(partitionIndex, Object.class);
          partitionValueMap.put(field.fieldId(), convertPartitionValue(field.type(), partitionValue));
        }

        reader = Parquet.read(inputFile)
            .project(readSchema)
            .split(currentTask.start(), currentTask.length())
            .filter(currentTask.residual())
            .createReaderFunc(
                fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap))
            .build();
      } else {
        reader = Parquet.read(inputFile)
            .project(projectedSchema)
            .split(currentTask.start(), currentTask.length())
            .filter(currentTask.residual())
            .createReaderFunc(
                fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap))
            .build();
      }

      recordIterator = reader.iterator();

      break;
    default:
      throw new UnsupportedOperationException("Unsupported file format: " + file.format());
  }

  return true;
}