Java Code Examples for org.apache.iceberg.types.Types#NestedField

The following examples show how to use org.apache.iceberg.types.Types#NestedField. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: StrictMetricsEvaluator.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Strict evaluation of {@code ref < lit}.
 *
 * <p>Returns {@code ROWS_MUST_MATCH} only when the column cannot contain nulls (per
 * {@code canContainNulls}) and an upper bound is recorded for it that compares strictly below
 * the literal; every other case yields {@code ROWS_MIGHT_NOT_MATCH}.
 *
 * @param ref bound reference identifying the column by field id
 * @param lit literal the column is compared against
 * @return {@code ROWS_MUST_MATCH} or {@code ROWS_MIGHT_NOT_MATCH}
 */
@Override
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
  // Rows must match when: <----------Min----Max---X------->
  Integer fieldId = ref.fieldId();
  Types.NestedField column = struct.field(fieldId);
  Preconditions.checkNotNull(column, "Cannot filter by nested column: %s", schema.findField(fieldId));

  if (canContainNulls(fieldId)) {
    return ROWS_MIGHT_NOT_MATCH;
  }

  // Without a recorded upper bound nothing can be proven about the column's values.
  boolean hasUpperBound = upperBounds != null && upperBounds.containsKey(fieldId);
  if (!hasUpperBound) {
    return ROWS_MIGHT_NOT_MATCH;
  }

  T upper = Conversions.fromByteBuffer(column.type(), upperBounds.get(fieldId));
  if (lit.comparator().compare(upper, lit.value()) < 0) {
    return ROWS_MUST_MATCH;
  }

  return ROWS_MIGHT_NOT_MATCH;
}
 
Example 2
Source File: HiveTypeConverter.java    From metacat with Apache License 2.0 6 votes vote down vote up
/**
 * Builds one {@link FieldInfo} per top-level column of an Iceberg schema.
 *
 * @param schema          Iceberg schema whose columns are converted
 * @param partitionFields partition fields; a column is flagged as a partition key when its name
 *                        equals the name of the source column of one of these fields
 * @return field infos, in the order returned by {@code schema.columns()}
 */
public List<FieldInfo> icebergeSchemaTofieldDtos(final Schema schema,
                                                 final List<PartitionField> partitionFields) {
    // Resolve each partition field to the name of the schema column it is derived from.
    final List<String> partitionColumnNames = partitionFields.stream()
        .map(partitionField -> schema.findField(partitionField.sourceId()).name())
        .collect(Collectors.toList());

    final List<FieldInfo> result = Lists.newArrayList();
    for (Types.NestedField column : schema.columns()) {
        final String columnName = column.name();
        final org.apache.iceberg.types.Type columnType = column.type();

        final FieldInfo info = new FieldInfo();
        info.setName(columnName);
        info.setSourceType(columnType.toString());
        info.setType(toMetacatType(fromIcebergToHiveType(columnType)));
        info.setIsNullable(column.isOptional());
        info.setComment(column.doc());
        info.setPartitionKey(partitionColumnNames.contains(columnName));
        result.add(info);
    }

    return result;
}
 
Example 3
Source File: GenericPartitionFieldSummary.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Used by Avro reflection to instantiate this class when reading manifest files.
 *
 * <p>Maps each field of the projected Avro schema, by field id, to its position in
 * {@code PartitionFieldSummary.getType()} and stores the mapping in {@code fromProjectionPos}.
 *
 * @param avroSchema the (possibly projected) Avro schema being read
 * @throws IllegalArgumentException if a projected field id is not present in the full type
 */
public GenericPartitionFieldSummary(Schema avroSchema) {
  this.avroSchema = avroSchema;

  List<Types.NestedField> projected = AvroSchemaUtil.convert(avroSchema)
      .asNestedType()
      .asStructType()
      .fields();
  List<Types.NestedField> all = PartitionFieldSummary.getType().fields();

  int[] positions = new int[projected.size()];
  for (int projPos = 0; projPos < positions.length; projPos++) {
    int wantedId = projected.get(projPos).fieldId();

    // Scan every candidate without stopping early, so the last field with a matching id wins.
    int matchPos = -1;
    for (int fullPos = 0; fullPos < all.size(); fullPos++) {
      if (wantedId == all.get(fullPos).fieldId()) {
        matchPos = fullPos;
      }
    }

    if (matchPos < 0) {
      throw new IllegalArgumentException("Cannot find projected field: " + projected.get(projPos));
    }
    positions[projPos] = matchPos;
  }

  this.fromProjectionPos = positions;
}
 
Example 4
Source File: SchemaUpdate.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Records a move under the struct that contains the moved field.
 *
 * <p>Fields without an entry in {@code idToParent} are recorded under {@code TABLE_ROOT_ID}.
 * For BEFORE/AFTER moves the reference field must have the same parent as the moved field.
 *
 * @param name field name, used only in error messages
 * @param move the move to record
 */
private void internalMove(String name, Move move) {
  Integer parentId = idToParent.get(move.fieldId());

  if (parentId == null) {
    // No parent entry: the reference field of a relative move must have no parent entry either.
    Move.MoveType rootMoveType = move.type();
    if (rootMoveType == Move.MoveType.AFTER || rootMoveType == Move.MoveType.BEFORE) {
      Preconditions.checkArgument(
          idToParent.get(move.referenceFieldId()) == null,
          "Cannot move field %s to a different struct", name);
    }

    moves.put(TABLE_ROOT_ID, move);
    return;
  }

  Types.NestedField parent = schema.findField(parentId);
  Preconditions.checkArgument(parent.type().isStructType(),
      "Cannot move fields in non-struct type: %s", parent.type());

  Move.MoveType nestedMoveType = move.type();
  if (nestedMoveType == Move.MoveType.AFTER || nestedMoveType == Move.MoveType.BEFORE) {
    Preconditions.checkArgument(
        parentId.equals(idToParent.get(move.referenceFieldId())),
        "Cannot move field %s to a different struct", name);
  }

  moves.put(parentId, move);
}
 
Example 5
Source File: TestHelpers.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts, field by field, that a row holds the same values as the expected record.
 *
 * <p>A position reported as null by the row is compared as {@code null}; otherwise the value is
 * read from the row using the converted field type.
 *
 * @param struct struct type describing the fields, in positional order
 * @param rec    record holding the expected values
 * @param row    row holding the actual values
 */
public static void assertEqualsUnsafe(Types.StructType struct, Record rec, InternalRow row) {
  int pos = 0;
  for (Types.NestedField field : struct.fields()) {
    Type fieldType = field.type();
    Object expectedValue = rec.get(pos);

    Object actualValue;
    if (row.isNullAt(pos)) {
      actualValue = null;
    } else {
      actualValue = row.get(pos, convert(fieldType));
    }

    assertEqualsUnsafe(fieldType, expectedValue, actualValue);
    pos++;
  }
}
 
Example 6
Source File: SchemaParser.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Parses a struct type from its JSON representation.
 *
 * <p>The node must carry an array under {@code FIELDS}; each element must be an object providing
 * {@code ID}, {@code NAME}, {@code TYPE} and {@code REQUIRED}, plus an optional {@code DOC}.
 *
 * @param json JSON node describing the struct
 * @return the struct type built from the parsed fields, in array order
 * @throws IllegalArgumentException if the field list is missing or not an array, or if an element
 *     is not an object
 */
private static Types.StructType structFromJson(JsonNode json) {
  JsonNode fieldArray = json.get(FIELDS);
  // JsonNode.get returns null for an absent key; report that as a parse error, not a bare NPE.
  Preconditions.checkArgument(fieldArray != null && fieldArray.isArray(),
      "Cannot parse struct fields from non-array: %s", fieldArray);

  List<Types.NestedField> fields = Lists.newArrayListWithExpectedSize(fieldArray.size());
  for (JsonNode field : fieldArray) {
    Preconditions.checkArgument(field.isObject(),
        "Cannot parse struct field from non-object: %s", field);

    int id = JsonUtil.getInt(ID, field);
    String name = JsonUtil.getString(NAME, field);
    Type type = typeFromJson(field.get(TYPE));

    String doc = JsonUtil.getStringOrNull(DOC, field);
    boolean isRequired = JsonUtil.getBool(REQUIRED, field);
    if (isRequired) {
      fields.add(Types.NestedField.required(id, name, type, doc));
    } else {
      fields.add(Types.NestedField.optional(id, name, type, doc));
    }
  }

  return Types.StructType.of(fields);
}
 
Example 7
Source File: SchemaConverter.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an Iceberg type to a {@link CompleteType}.
 *
 * <p>Primitive types are delegated to {@code fromIcebergPrimitiveType}; lists and structs are
 * converted recursively. Any other nested type yields {@code null}, as does a list or struct
 * containing something that converts to {@code null}.
 *
 * @param type Iceberg type to convert
 * @return the converted type, or {@code null} if the type (or one of its children) is unsupported
 */
public static CompleteType fromIcebergType(Type type) {
  if (type.isPrimitiveType()) {
    return fromIcebergPrimitiveType(type.asPrimitiveType());
  }

  NestedType nested = type.asNestedType();

  if (nested.isListType()) {
    NestedField element = ((ListType) nested).fields().get(0);
    CompleteType convertedElement = fromIcebergType(element.type());
    if (convertedElement == null) {
      return null;
    }
    return convertedElement.asList();
  }

  if (!nested.isStructType()) {
    // drop map type and all other unknown iceberg column types
    return null;
  }

  List<Field> children = Lists.newArrayList();
  for (Types.NestedField child : ((StructType) nested).fields()) {
    Field convertedChild = fromIcebergColumn(child);
    if (convertedChild == null) {
      // One unsupported child makes the whole struct unsupported.
      return null;
    }
    children.add(convertedChild);
  }
  return CompleteType.struct(children);
}
 
Example 8
Source File: NamedReference.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Binds this named reference to a field of the given struct.
 *
 * @param struct        struct type to look the name up in
 * @param caseSensitive whether the name lookup is case sensitive
 * @return a bound reference for the matching field and its accessor
 * @throws ValidationException if no field with this name exists in the struct
 */
@Override
public BoundReference<T> bind(Types.StructType struct, boolean caseSensitive) {
  Schema structSchema = new Schema(struct.fields());

  Types.NestedField match;
  if (caseSensitive) {
    match = structSchema.findField(name);
  } else {
    match = structSchema.caseInsensitiveFindField(name);
  }

  ValidationException.check(match != null,
      "Cannot find field '%s' in struct: %s", name, structSchema.asStruct());

  return new BoundReference<>(match, structSchema.accessorForField(match.fieldId()));
}
 
Example 9
Source File: PartitionSpec.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Adds a partition field that applies the {@code year} transform to a source column.
 *
 * @param sourceName name of the source column
 * @param targetName name of the resulting partition field
 * @return this builder
 */
public Builder year(String sourceName, String targetName) {
  checkAndAddPartitionName(targetName);

  Types.NestedField source = findSourceColumn(sourceName);
  int sourceId = source.fieldId();
  PartitionField yearField =
      new PartitionField(sourceId, nextFieldId(), targetName, Transforms.year(source.type()));

  checkForRedundantPartitions(yearField);
  fields.add(yearField);
  return this;
}
 
Example 10
Source File: PartitionSpec.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Adds a partition field that applies the {@code identity} transform to a source column.
 *
 * @param sourceName name of the source column
 * @param targetName name of the resulting partition field
 * @return this builder
 */
Builder identity(String sourceName, String targetName) {
  Types.NestedField source = findSourceColumn(sourceName);
  int sourceId = source.fieldId();
  checkAndAddPartitionName(targetName, sourceId);

  PartitionField identityField =
      new PartitionField(sourceId, nextFieldId(), targetName, Transforms.identity(source.type()));
  fields.add(identityField);
  return this;
}
 
Example 11
Source File: TestSparkTableUtilWithInMemoryCatalog.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that, in every row of {@code fileDF}, the lower and upper bound entries for the given
 * field are null exactly when {@code isNull} is true.
 *
 * @param fileDF dataset exposing {@code lower_bounds} and {@code upper_bounds} keyed by field id
 * @param field  field whose bounds are checked
 * @param isNull expected nullness of both bound entries
 */
private void checkFieldMetrics(Dataset<Row> fileDF, Types.NestedField field, boolean isNull) {
  String lowerBoundExpr = String.format("lower_bounds['%d']", field.fieldId());
  String upperBoundExpr = String.format("upper_bounds['%d']", field.fieldId());

  List<Row> boundsPerFile = fileDF.selectExpr(lowerBoundExpr, upperBoundExpr).collectAsList();

  for (Row bounds : boundsPerFile) {
    // Column 0 holds the lower bound, column 1 the upper bound.
    Assert.assertEquals("Invalid metrics for column: " + field.name(), isNull, bounds.isNullAt(0));
    Assert.assertEquals("Invalid metrics for column: " + field.name(), isNull, bounds.isNullAt(1));
  }
}
 
Example 12
Source File: IcebergFilterGenerator.java    From metacat with Apache License 2.0 5 votes vote down vote up
/**
 * Constructor. Indexes the given fields by name; if two fields share a name, the later one
 * replaces the earlier one.
 *
 * @param fields partition fields
 */
public IcebergFilterGenerator(final List<Types.NestedField> fields) {
    this.fieldMap = Maps.newHashMap();
    fields.forEach(field -> this.fieldMap.put(field.name(), field));
}
 
Example 13
Source File: IcebergPartitionData.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Creates partition data with one {@code Object} slot per field of the partition type.
 *
 * @param partitionType struct describing the partition fields; every field must be primitive
 * @throws IllegalArgumentException if any field has a non-primitive type
 */
public IcebergPartitionData(Types.StructType partitionType) {
  // Validate up front: only primitive-typed partition fields are accepted.
  partitionType.fields().forEach(field ->
      Preconditions.checkArgument(field.type().isPrimitiveType(),
        "Partitions cannot contain nested types: %s", field.type()));

  this.partitionType = partitionType;
  this.size = partitionType.fields().size();
  this.data = new Object[this.size];
}
 
Example 14
Source File: IcebergCatalog.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Adds the given columns (name and type only) to the table schema and commits the change.
 *
 * <p>Also reassigns the {@code table} field to a new {@code BaseTable} built for {@code fsPath}.
 *
 * @param columnsToAdd columns to append to the schema
 */
void addColumns(List<Types.NestedField> columnsToAdd) {
  table = new BaseTable(new IcebergTableOperations(fsPath, configuration), fsPath.getName());

  UpdateSchema pendingUpdate = table.updateSchema();
  for (Types.NestedField column : columnsToAdd) {
    pendingUpdate.addColumn(column.name(), column.type());
  }
  pendingUpdate.commit();
}
 
Example 15
Source File: SchemaUtil.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a schema containing only the requested fields, in the requested order.
 *
 * @param schema         schema to look the fields up in
 * @param requiredFields names of the fields to keep
 * @return a new schema built from the results of {@code schema.findField} for each name
 */
public static Schema project(Schema schema, List<String> requiredFields) {
  List<Types.NestedField> projected = Lists.newArrayList();
  requiredFields.forEach(fieldName -> projected.add(schema.findField(fieldName)));
  return new Schema(projected);
}
 
Example 16
Source File: TypeConverter.java    From presto with Apache License 2.0 5 votes vote down vote up
/**
 * Flattens an Iceberg struct into a list of ORC types: the STRUCT type itself first, followed by
 * the ORC types produced for each field, in field order.
 *
 * @param nextFieldTypeIndex column index of the struct itself; its first field gets the next index
 * @param structType         Iceberg struct to convert
 * @param attributes         attributes attached to the STRUCT type
 * @return the struct's ORC type followed by the ORC types of all of its fields
 */
private static List<OrcType> toOrcStructType(int nextFieldTypeIndex, Types.StructType structType, Map<String, String> attributes)
{
    // The struct occupies nextFieldTypeIndex; children start right after it.
    int childIndex = nextFieldTypeIndex + 1;

    List<OrcColumnId> childIds = new ArrayList<>();
    List<String> childNames = new ArrayList<>();
    List<List<OrcType>> childTypes = new ArrayList<>();

    for (Types.NestedField child : structType.fields()) {
        childIds.add(new OrcColumnId(childIndex));
        childNames.add(child.name());

        Map<String, String> childAttributes = ImmutableMap.of(
                ORC_ICEBERG_ID_KEY, Integer.toString(child.fieldId()),
                ORC_ICEBERG_REQUIRED_KEY, Boolean.toString(child.isRequired()));

        List<OrcType> converted = toOrcType(childIndex, child.type(), childAttributes);
        childTypes.add(converted);

        // Advance past every ORC type the child produced.
        childIndex += converted.size();
    }

    OrcType structOrcType = new OrcType(
            OrcType.OrcTypeKind.STRUCT,
            childIds,
            childNames,
            Optional.empty(),
            Optional.empty(),
            Optional.empty(),
            attributes);

    ImmutableList.Builder<OrcType> flattened = ImmutableList.builder();
    flattened.add(structOrcType);
    for (List<OrcType> converted : childTypes) {
        flattened.addAll(converted);
    }
    return flattened.build();
}
 
Example 17
Source File: TypeConverter.java    From presto with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a row type to an Iceberg struct of required fields.
 *
 * <p>Field ids are assigned by position within this struct, starting at 1.
 *
 * @param type row type to convert
 * @return the equivalent Iceberg struct type
 * @throws PrestoException with {@code NOT_SUPPORTED} if a row field has no name
 */
private static org.apache.iceberg.types.Type fromRow(RowType type)
{
    List<Types.NestedField> converted = new ArrayList<>();
    for (RowType.Field rowField : type.getFields()) {
        String fieldName = rowField.getName().orElseThrow(() ->
                new PrestoException(NOT_SUPPORTED, "Row type field does not have a name: " + type.getDisplayName()));

        int fieldId = converted.size() + 1;
        converted.add(Types.NestedField.required(fieldId, fieldName, toIcebergType(rowField.getType())));
    }
    return Types.StructType.of(converted);
}
 
Example 18
Source File: TypeConverter.java    From presto with Apache License 2.0 4 votes vote down vote up
/**
 * Maps an Iceberg type to the corresponding Presto type.
 *
 * <p>Lists, maps and structs are converted recursively. BINARY and FIXED both map to varbinary;
 * UUID and STRING both map to unbounded varchar.
 *
 * @param type        Iceberg type to convert
 * @param typeManager used to obtain the parameterized map type
 * @return the corresponding Presto type
 * @throws UnsupportedOperationException if the Iceberg type id is not handled
 */
public static Type toPrestoType(org.apache.iceberg.types.Type type, TypeManager typeManager)
{
    switch (type.typeId()) {
        case BOOLEAN: {
            return BooleanType.BOOLEAN;
        }
        case BINARY:
        case FIXED: {
            return VarbinaryType.VARBINARY;
        }
        case DATE: {
            return DateType.DATE;
        }
        case DECIMAL: {
            Types.DecimalType decimal = (Types.DecimalType) type;
            return DecimalType.createDecimalType(decimal.precision(), decimal.scale());
        }
        case DOUBLE: {
            return DoubleType.DOUBLE;
        }
        case LONG: {
            return BigintType.BIGINT;
        }
        case FLOAT: {
            return RealType.REAL;
        }
        case INTEGER: {
            return IntegerType.INTEGER;
        }
        case TIME: {
            return TimeType.TIME;
        }
        case TIMESTAMP: {
            // Timestamps flagged as adjust-to-UTC map to the zoned Presto type.
            if (((Types.TimestampType) type).shouldAdjustToUTC()) {
                return TimestampWithTimeZoneType.TIMESTAMP_WITH_TIME_ZONE;
            }
            return TimestampType.TIMESTAMP;
        }
        case UUID:
        case STRING: {
            return VarcharType.createUnboundedVarcharType();
        }
        case LIST: {
            Types.ListType list = (Types.ListType) type;
            return new ArrayType(toPrestoType(list.elementType(), typeManager));
        }
        case MAP: {
            Types.MapType map = (Types.MapType) type;
            TypeSignature keySignature = toPrestoType(map.keyType(), typeManager).getTypeSignature();
            TypeSignature valueSignature = toPrestoType(map.valueType(), typeManager).getTypeSignature();
            return typeManager.getParameterizedType(
                    StandardTypes.MAP,
                    ImmutableList.of(
                            TypeSignatureParameter.typeParameter(keySignature),
                            TypeSignatureParameter.typeParameter(valueSignature)));
        }
        case STRUCT: {
            ImmutableList.Builder<RowType.Field> rowFields = ImmutableList.builder();
            for (Types.NestedField member : ((Types.StructType) type).fields()) {
                rowFields.add(new RowType.Field(Optional.of(member.name()), toPrestoType(member.type(), typeManager)));
            }
            return RowType.from(rowFields.build());
        }
        default: {
            throw new UnsupportedOperationException(format("Cannot convert from Iceberg type '%s' (%s) to Presto type", type, type.typeId()));
        }
    }
}
 
Example 19
Source File: OrcMetrics.java    From iceberg with Apache License 2.0 4 votes vote down vote up
/**
 * Builds Iceberg {@code Metrics} from ORC column statistics.
 *
 * <p>Column sizes are recorded for every ORC column that resolves to an Iceberg field. Value
 * counts, null counts and bounds are recorded only for columns that are not in the set returned
 * by {@code findColumnsInContainers}.
 *
 * @param numOfRows number of rows in the file
 * @param orcSchema ORC schema of the file
 * @param colStats  ORC column statistics, indexed by ORC column id
 * @return the collected metrics
 */
private static Metrics buildOrcMetrics(final long numOfRows, final TypeDescription orcSchema,
                                       final ColumnStatistics[] colStats) {
  final Schema schema = ORCSchemaUtil.convert(orcSchema);
  final Set<TypeDescription> columnsInContainers = findColumnsInContainers(schema, orcSchema);
  Map<Integer, Long> columnSizes = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, Long> valueCounts = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, Long> nullCounts = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, ByteBuffer> lowerBounds = Maps.newHashMap();
  Map<Integer, ByteBuffer> upperBounds = Maps.newHashMap();

  for (int orcId = 0; orcId < colStats.length; orcId++) {
    final ColumnStatistics stats = colStats[orcId];
    final TypeDescription orcCol = orcSchema.findSubtype(orcId);
    final Optional<Types.NestedField> icebergColOpt = ORCSchemaUtil.icebergID(orcCol)
        .map(schema::findField);

    if (!icebergColOpt.isPresent()) {
      continue;
    }

    final Types.NestedField icebergCol = icebergColOpt.get();
    final int fieldId = icebergCol.fieldId();

    columnSizes.put(fieldId, stats.getBytesOnDisk());

    if (columnsInContainers.contains(orcCol)) {
      continue;
    }

    // Since ORC does not track null values nor repeated ones, the value count for columns in
    // containers (maps, list) may be larger than what it actually is, however these are not
    // used in expressions right now. For such cases, we use the number of values directly
    // stored in ORC.
    final long storedValues = stats.getNumberOfValues();
    final long nulls = stats.hasNull() ? numOfRows - storedValues : 0L;
    nullCounts.put(fieldId, nulls);
    valueCounts.put(fieldId, storedValues + nulls);

    // Bounds are only looked up when the column stores at least one value.
    if (storedValues > 0) {
      fromOrcMin(icebergCol, stats).ifPresent(min -> lowerBounds.put(fieldId, min));
      fromOrcMax(icebergCol, stats).ifPresent(max -> upperBounds.put(fieldId, max));
    }
  }

  return new Metrics(numOfRows,
      columnSizes,
      valueCounts,
      nullCounts,
      lowerBounds,
      upperBounds);
}
 
Example 20
Source File: IcebergOrcFileWriter.java    From presto with Apache License 2.0 4 votes vote down vote up
/**
 * Computes Iceberg {@code Metrics} for a written ORC file.
 *
 * <p>Without column statistics only the row count is reported. Otherwise value counts, null
 * counts and bounds are collected per non-excluded column; each map that ends up empty is
 * passed to {@code Metrics} as {@code null}.
 *
 * @param icebergSchema    schema used to resolve Iceberg field ids
 * @param orcColumns       ORC column metadata, indexed by ORC column id
 * @param fileRowCount     number of rows in the file
 * @param columnStatistics per-column ORC statistics, if available
 * @return the computed metrics
 */
private static Metrics computeMetrics(Schema icebergSchema, ColumnMetadata<OrcType> orcColumns, long fileRowCount, Optional<ColumnMetadata<ColumnStatistics>> columnStatistics)
{
    if (!columnStatistics.isPresent()) {
        return new Metrics(fileRowCount, null, null, null, null, null);
    }
    ColumnMetadata<ColumnStatistics> statistics = columnStatistics.get();

    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<OrcColumnId> excluded = getExcludedColumns(orcColumns);

    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();

    // OrcColumnId(0) is the root column that represents file-level schema
    for (int columnIndex = 1; columnIndex < orcColumns.size(); columnIndex++) {
        OrcColumnId columnId = new OrcColumnId(columnIndex);
        if (excluded.contains(columnId)) {
            continue;
        }

        OrcType orcType = orcColumns.get(columnId);
        ColumnStatistics columnStats = statistics.get(columnId);
        int icebergId = getIcebergId(orcType);
        Types.NestedField icebergField = icebergSchema.findField(icebergId);
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);

        valueCountsBuilder.put(icebergId, fileRowCount);
        if (columnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - columnStats.getNumberOfValues());
        }
        toIcebergMinMax(columnStats, icebergField.type()).ifPresent(bounds -> {
            lowerBoundsBuilder.put(icebergId, bounds.getMin());
            upperBoundsBuilder.put(icebergId, bounds.getMax());
        });
    }

    Map<Integer, Long> valueCounts = valueCountsBuilder.build();
    Map<Integer, Long> nullCounts = nullCountsBuilder.build();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.build();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.build();

    return new Metrics(
            fileRowCount,
            null, // TODO: Add column size accounting to ORC column writers
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}