org.apache.iceberg.PartitionField Java Examples

The following examples show how to use org.apache.iceberg.PartitionField. Each example is taken from an open source project; the original project, source file, and license are noted above each snippet.
Example #1
Source File: Projections.java    From iceberg with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public <T> Expression predicate(BoundPredicate<T> pred) {
  Collection<PartitionField> parts = spec().getFieldsBySourceId(pred.ref().fieldId());
  if (parts == null) {
    // the predicate has no partition column
    return Expressions.alwaysTrue();
  }

  Expression result = Expressions.alwaysTrue();
  for (PartitionField part : parts) {
    // consider (d = 2019-01-01) with bucket(7, d) and bucket(5, d)
    // projections: b1 = bucket(7, '2019-01-01') = 5, b2 = bucket(5, '2019-01-01') = 0
    // any value where b1 != 5 or any value where b2 != 0 cannot be '2019-01-01'
    //
    // similarly, if partitioning by day(ts) and hour(ts), the more restrictive
    // projection should be used. ts = 2019-01-01T01:00:00 produces day=2019-01-01 and
    // hour=2019-01-01-01. the value will be in 2019-01-01-01 and not in 2019-01-01-02.
    UnboundPredicate<?> inclusiveProjection = ((Transform<T, ?>) part.transform()).project(part.name(), pred);
    if (inclusiveProjection != null) {
      result = Expressions.and(result, inclusiveProjection);
    }
  }

  return result;
}
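
A minimal usage sketch, not part of the original source: the visitor above is normally reached through Projections.inclusive(spec), which binds a row filter against the spec's schema and projects it onto the partition columns. The schema, spec, and filter below are illustrative assumptions.

Schema schema = new Schema(
    Types.NestedField.required(1, "ts", Types.TimestampType.withoutZone()));
PartitionSpec spec = PartitionSpec.builderFor(schema).day("ts").build();

// project(...) binds the filter and dispatches to predicate(BoundPredicate) above
Expression rowFilter = Expressions.equal("ts", "2019-01-01T01:00:00");
Expression partitionFilter = Projections.inclusive(spec).project(rowFilter);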
 
Example #2
Source File: HiveTypeConverter.java    From metacat with Apache License 2.0
/**
 * Converts an Iceberg schema to a list of field DTOs.
 *
 * @param schema          the Iceberg schema
 * @param partitionFields the table's partition fields
 * @return list of FieldInfo
 */
public List<FieldInfo> icebergeSchemaTofieldDtos(final Schema schema,
                                                 final List<PartitionField> partitionFields) {
    final List<FieldInfo> fields = Lists.newArrayList();
    final List<String> partitionNames =
        partitionFields.stream()
            .map(f -> schema.findField(f.sourceId()).name()).collect(Collectors.toList());

    for (Types.NestedField field : schema.columns()) {
        final FieldInfo fieldInfo = new FieldInfo();
        fieldInfo.setName(field.name());
        final org.apache.iceberg.types.Type fieldType = field.type();
        fieldInfo.setSourceType(fieldType.toString());
        fieldInfo.setType(toMetacatType(fromIcebergToHiveType(fieldType)));
        fieldInfo.setIsNullable(field.isOptional());
        fieldInfo.setComment(field.doc());
        fieldInfo.setPartitionKey(partitionNames.contains(field.name()));
        fields.add(fieldInfo);
    }

    return fields;
}
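
A hypothetical call site, assuming converter is a HiveTypeConverter instance and table an Iceberg Table handle (both names are illustrative):

final List<FieldInfo> fieldInfos = converter.icebergeSchemaTofieldDtos(
    table.schema(), table.spec().fields());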
 
Example #3
Source File: PartitionKey.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
PartitionKey(PartitionSpec spec, Schema inputSchema) {
  this.spec = spec;

  List<PartitionField> fields = spec.fields();
  this.size = fields.size();
  this.partitionTuple = new Object[size];
  this.transforms = new Transform[size];
  this.accessors = (Accessor<InternalRow>[]) Array.newInstance(Accessor.class, size);

  Schema schema = spec.schema();
  Map<Integer, Accessor<InternalRow>> newAccessors = buildAccessors(inputSchema);
  for (int i = 0; i < size; i += 1) {
    PartitionField field = fields.get(i);
    Accessor<InternalRow> accessor = newAccessors.get(field.sourceId());
    if (accessor == null) {
      throw new RuntimeException(
          "Cannot build accessor for field: " + schema.findField(field.sourceId()));
    }
    this.accessors[i] = accessor;
    this.transforms[i] = field.transform();
  }
}
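
For context, a hedged sketch of how this constructor is exercised inside the Spark writer. partition(InternalRow) is a companion method in the same class (not shown) that fills the tuple by applying each transform to the value its accessor reads from the row; spec, writeSchema, and row are assumed here.

PartitionKey key = new PartitionKey(spec, writeSchema);
// for each field i: partitionTuple[i] = transforms[i].apply(accessors[i].get(row))
key.partition(row);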
 
Example #4
Source File: Projections.java    From iceberg with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public <T> Expression predicate(BoundPredicate<T> pred) {
  Collection<PartitionField> parts = spec().getFieldsBySourceId(pred.ref().fieldId());
  if (parts == null) {
    // the predicate has no partition column
    return Expressions.alwaysFalse();
  }

  Expression result = Expressions.alwaysFalse();
  for (PartitionField part : parts) {
    // consider (ts > 2019-01-01T01:00:00) with day(ts) and hour(ts)
    // projections: d >= 2019-01-02 and h >= 2019-01-01-02 (note the inclusive bounds).
    // any timestamp where either projection predicate is true must match the original
    // predicate. For example, ts = 2019-01-01T03:00:00 matches the hour projection but not
    // the day, but does match the original predicate.
    UnboundPredicate<?> strictProjection = ((Transform<T, ?>) part.transform()).projectStrict(part.name(), pred);
    if (strictProjection != null) {
      result = Expressions.or(result, strictProjection);
    }
  }

  return result;
}
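
The strict counterpart is reached through Projections.strict(spec). A minimal sketch, assuming spec is the day(ts)-partitioned spec from the earlier sketch: the result matches a partition only when every row in it satisfies the original filter.

Expression rowFilter = Expressions.greaterThan("ts", "2019-01-01T01:00:00");
// true for a partition only if all of its rows satisfy rowFilter
Expression guaranteedFilter = Projections.strict(spec).project(rowFilter);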
 
Example #5
Source File: ManifestsTable.java    From presto with Apache License 2.0
private static void writePartitionSummaries(BlockBuilder blockBuilder, List<PartitionFieldSummary> summaries, PartitionSpec partitionSpec)
{
    for (int i = 0; i < summaries.size(); i++) {
        PartitionFieldSummary summary = summaries.get(i);
        PartitionField field = partitionSpec.fields().get(i);
        Type nestedType = partitionSpec.partitionType().fields().get(i).type();

        BlockBuilder rowBuilder = blockBuilder.beginBlockEntry();
        BOOLEAN.writeBoolean(rowBuilder, summary.containsNull());
        VARCHAR.writeString(rowBuilder, field.transform().toHumanString(
                Conversions.fromByteBuffer(nestedType, summary.lowerBound())));
        VARCHAR.writeString(rowBuilder, field.transform().toHumanString(
                Conversions.fromByteBuffer(nestedType, summary.upperBound())));
        blockBuilder.closeEntry();
    }
}
 
Example #6
Source File: IcebergUtil.java    From presto with Apache License 2.0
public static Map<PartitionField, Integer> getIdentityPartitions(PartitionSpec partitionSpec)
{
    // TODO: expose transform information in Iceberg library
    ImmutableMap.Builder<PartitionField, Integer> columns = ImmutableMap.builder();
    for (int i = 0; i < partitionSpec.fields().size(); i++) {
        PartitionField field = partitionSpec.fields().get(i);
        if (field.transform().toString().equals("identity")) {
            columns.put(field, i);
        }
    }
    return columns.build();
}
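
A hypothetical usage: identity partitions map one-to-one to source columns, so the returned index can be used to read the raw value straight out of a file's partition tuple (table is an assumed Iceberg Table handle).

Map<PartitionField, Integer> identityPartitions = getIdentityPartitions(table.spec());
identityPartitions.forEach((field, index) ->
        System.out.printf("%s -> partition tuple position %d%n", field.name(), index));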
 
Example #7
Source File: PartitionFields.java    From presto with Apache License 2.0
private static String toPartitionField(PartitionSpec spec, PartitionField field)
{
    String name = spec.schema().findColumnName(field.sourceId());
    String transform = field.transform().toString();

    switch (transform) {
        case "identity":
            return name;
        case "year":
        case "month":
        case "day":
        case "hour":
            return format("%s(%s)", transform, name);
    }

    Matcher matcher = ICEBERG_BUCKET_PATTERN.matcher(transform);
    if (matcher.matches()) {
        return format("bucket(%s, %s)", name, matcher.group(1));
    }

    matcher = ICEBERG_TRUNCATE_PATTERN.matcher(transform);
    if (matcher.matches()) {
        return format("truncate(%s, %s)", name, matcher.group(1));
    }

    throw new UnsupportedOperationException("Unsupported partition transform: " + field);
}
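
A sketch of the expected rendering over an illustrative schema; Transform#toString() yields forms like "bucket[16]" and "truncate[8]", which the regexes above turn back into SQL-style syntax.

Schema schema = new Schema(
        Types.NestedField.required(1, "category", Types.StringType.get()),
        Types.NestedField.required(2, "id", Types.LongType.get()),
        Types.NestedField.required(3, "name", Types.StringType.get()));

PartitionSpec spec = PartitionSpec.builderFor(schema)
        .identity("category")
        .bucket("id", 16)
        .truncate("name", 8)
        .build();

for (PartitionField field : spec.fields()) {
    // prints: category, then bucket(id, 16), then truncate(name, 8)
    System.out.println(toPartitionField(spec, field));
}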
 
Example #8
Source File: PartitionSpecVisitor.java    From iceberg with Apache License 2.0
static <R> List<R> visit(Schema schema, PartitionSpec spec, PartitionSpecVisitor<R> visitor) {
  List<R> results = Lists.newArrayListWithExpectedSize(spec.fields().size());

  for (PartitionField field : spec.fields()) {
    String sourceName = schema.findColumnName(field.sourceId());
    Transform<?, ?> transform = field.transform();

    if (transform instanceof Identity) {
      results.add(visitor.identity(sourceName, field.sourceId()));
    } else if (transform instanceof Bucket) {
      results.add(visitor.bucket(sourceName, field.sourceId(),
          ((Bucket<?>) transform).numBuckets()));
    } else if (transform instanceof Truncate) {
      results.add(visitor.truncate(sourceName, field.sourceId(),
          ((Truncate<?>) transform).width()));
    } else if (transform == Dates.YEAR || transform == Timestamps.YEAR) {
      results.add(visitor.year(sourceName, field.sourceId()));
    } else if (transform == Dates.MONTH || transform == Timestamps.MONTH) {
      results.add(visitor.month(sourceName, field.sourceId()));
    } else if (transform == Dates.DAY || transform == Timestamps.DAY) {
      results.add(visitor.day(sourceName, field.sourceId()));
    } else if (transform == Timestamps.HOUR) {
      results.add(visitor.hour(sourceName, field.sourceId()));
    }
  }

  return results;
}
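
A minimal sketch of a visitor this dispatch loop could drive, rendering each partition field as SQL-like text; the callback signatures match those invoked in visit(...) above.

PartitionSpecVisitor<String> toSql = new PartitionSpecVisitor<String>() {
  @Override
  public String identity(String sourceName, int sourceId) {
    return sourceName;
  }

  @Override
  public String bucket(String sourceName, int sourceId, int numBuckets) {
    return String.format("bucket(%d, %s)", numBuckets, sourceName);
  }

  @Override
  public String truncate(String sourceName, int sourceId, int width) {
    return String.format("truncate(%s, %d)", sourceName, width);
  }

  @Override
  public String year(String sourceName, int sourceId) {
    return "year(" + sourceName + ")";
  }

  @Override
  public String month(String sourceName, int sourceId) {
    return "month(" + sourceName + ")";
  }

  @Override
  public String day(String sourceName, int sourceId) {
    return "day(" + sourceName + ")";
  }

  @Override
  public String hour(String sourceName, int sourceId) {
    return "hour(" + sourceName + ")";
  }
};

List<String> rendered = PartitionSpecVisitor.visit(schema, spec, toSql);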
 
Example #9
Source File: PartitionTable.java    From presto with Apache License 2.0
private List<ColumnMetadata> getPartitionColumnsMetadata(List<PartitionField> fields, Schema schema)
{
    return fields.stream()
            .map(field -> new ColumnMetadata(
                    field.name(),
                    toPrestoType(field.transform().getResultType(schema.findType(field.sourceId())), typeManager)))
            .collect(toImmutableList());
}
 
Example #10
Source File: IcebergPageSink.java    From presto with Apache License 2.0
public PartitionColumn(PartitionField field, int sourceChannel, Type sourceType, Type resultType, Function<Block, Block> blockTransform)
{
    this.field = requireNonNull(field, "field is null");
    this.sourceChannel = sourceChannel;
    this.sourceType = requireNonNull(sourceType, "sourceType is null");
    this.resultType = requireNonNull(resultType, "resultType is null");
    this.blockTransform = requireNonNull(blockTransform, "blockTransform is null");
}
 
Example #11
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);
    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();

        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
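
A hypothetical input to make the key construction concrete: with partitionPath mapping "ds" to "2019-01-01" and a spec whose single field is named ds, the stream above joins to "ds=2019-01-01", which DataFiles.Builder#withPartitionPath parses back into a partition tuple.

Map<String, String> partitionPath = ImmutableMap.of("ds", "2019-01-01");
// spec.fields() contains one field named "ds", so the joined key is:
// "ds=2019-01-01"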
 
Example #12
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
private static List<DataFile> listOrcPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);

    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(stat.getPath(), conf));
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("orc")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();

        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
 
Example #13
Source File: IcebergSplitSource.java    From presto with Apache License 2.0
private static Map<Integer, String> getPartitionKeys(FileScanTask scanTask)
{
    StructLike partition = scanTask.file().partition();
    PartitionSpec spec = scanTask.spec();
    Map<PartitionField, Integer> fieldToIndex = getIdentityPartitions(spec);
    Map<Integer, String> partitionKeys = new HashMap<>();

    fieldToIndex.forEach((field, index) -> {
        int id = field.sourceId();
        Type type = spec.schema().findType(id);
        Class<?> javaClass = type.typeId().javaClass();
        Object value = partition.get(index, javaClass);

        if (value == null) {
            partitionKeys.put(id, null);
        }
        else {
            String partitionValue;
            if (type.typeId() == FIXED || type.typeId() == BINARY) {
                // this is safe because Iceberg PartitionData directly wraps the byte array
                partitionValue = new String(((ByteBuffer) value).array(), UTF_8);
            }
            else {
                partitionValue = value.toString();
            }
            partitionKeys.put(id, partitionValue);
        }
    });

    return Collections.unmodifiableMap(partitionKeys);
}
 
Example #14
Source File: SparkUtil.java    From iceberg with Apache License 2.0
/**
 * Check whether the partition transforms in a spec can be used to write data.
 *
 * @param spec a PartitionSpec
 * @throws UnsupportedOperationException if the spec contains unknown partition transforms
 */
public static void validatePartitionTransforms(PartitionSpec spec) {
  if (spec.fields().stream().anyMatch(field -> field.transform() instanceof UnknownTransform)) {
    String unsupported = spec.fields().stream()
        .map(PartitionField::transform)
        .filter(transform -> transform instanceof UnknownTransform)
        .map(Transform::toString)
        .collect(Collectors.joining(", "));

    throw new UnsupportedOperationException(
        String.format("Cannot write using unsupported transforms: %s", unsupported));
  }
}
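
A hypothetical call site: validating before the write plan is built means a spec containing an UnknownTransform (for example, one written by a newer Iceberg version) fails fast with a clear message.

PartitionSpec spec = table.spec();
// throws UnsupportedOperationException naming the unsupported transforms
SparkUtil.validatePartitionTransforms(spec);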
 
Example #15
Source File: DataAdditionCmdHandler.java    From dremio-oss with Apache License 2.0
public void validateIcebergSchemaForInsertCommand(List<String> fieldNames) {
  IcebergTableProps icebergTableProps = icebergCreateTableEntry.getIcebergTableProps();
  Preconditions.checkState(icebergTableProps.getIcebergOpType() == IcebergOperation.Type.INSERT,
    "unexpected state found");

  BatchSchema querySchema = icebergTableProps.getFullSchema();
  IcebergTableOperations tableOperations = new IcebergTableOperations(
    new org.apache.hadoop.fs.Path(icebergTableProps.getTableLocation()), icebergCreateTableEntry.getPlugin().getFsConfCopy());

  BatchSchema icebergSchema = new SchemaConverter().fromIceberg(
    tableOperations.current().schema());

  // this check can be removed once we support schema evolution in dremio.
  if (!icebergSchema.equalsIgnoreCase(tableSchemaFromKVStore)) {
    throw UserException.validationError().message("The schema for table %s does not match with the iceberg %s.",
      tableSchemaFromKVStore, icebergSchema).buildSilently();
  }

  List<String> icebergPartitionColumns = tableOperations.current().spec().fields().stream()
    .map(PartitionField::name).collect(Collectors.toList());

  // this check can be removed once we support partition spec evolution in dremio.
  if (!comparePartitionColumnLists(icebergPartitionColumns)) {
    throw UserException.validationError().message("The table partition columns %s do not match with the iceberg partition columns %s.",
      partitionColumns.toString(), icebergPartitionColumns.toString()).buildSilently();
  }

  BatchSchema partSchemaWithSelectedFields = tableSchemaFromKVStore.subset(fieldNames).orElse(tableSchemaFromKVStore);
  if (!querySchema.equalsIgnoreCase(partSchemaWithSelectedFields)) {
    throw UserException.validationError().message("Table %s doesn't match with query %s.",
        partSchemaWithSelectedFields, querySchema).buildSilently();
  }
}
 
Example #16
Source File: IcebergTableWrapper.java    From dremio-oss with Apache License 2.0
private void buildPartitionColumns() {
  partitionColumns = table
    .spec()
    .fields()
    .stream()
    .map(PartitionField::sourceId)
    .map(schema::findColumnName) // column name from schema
    .collect(Collectors.toList());
}
 
Example #17
Source File: PartitionTable.java    From presto with Apache License 2.0
private List<Type> partitionTypes(List<PartitionField> partitionFields)
{
    ImmutableList.Builder<Type> partitionTypeBuilder = ImmutableList.builder();
    for (PartitionField partitionField : partitionFields) {
        Type.PrimitiveType sourceType = idToTypeMapping.get(partitionField.sourceId());
        Type type = partitionField.transform().getResultType(sourceType);
        partitionTypeBuilder.add(type);
    }
    return partitionTypeBuilder.build();
}
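
For illustration, getResultType derives the partition column type from the source column type. A small sketch using the Transforms factory (the inputs are assumptions):

Type.PrimitiveType sourceType = Types.TimestampType.withoutZone();
// day(...) maps each timestamp to a date, so the partition column type is date
Type resultType = Transforms.day(sourceType).getResultType(sourceType);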
 
Example #18
Source File: IcebergPageSink.java    From presto with Apache License 2.0
public PartitionField getField()
{
    return field;
}
 
Example #19
Source File: ResidualEvaluator.java    From iceberg with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public <T> Expression predicate(BoundPredicate<T> pred) {
  // Get the strict and inclusive projections of this predicate in partition data, then use
  // them to decide whether the original predicate can be eliminated. The strict projection
  // evaluates to true only if the original predicate is true for every row in the partition,
  // so the predicate can be replaced by alwaysTrue when the strict projection evaluates to
  // true. Similarly, the inclusive projection evaluates to false only if the original
  // predicate is false for every row, so the predicate can be replaced by alwaysFalse when
  // the inclusive projection evaluates to false.

  // If neither projection is conclusive, return the original predicate as the residual.
  List<PartitionField> parts = spec.getFieldsBySourceId(pred.ref().fieldId());
  if (parts == null) {
    return pred; // not associated with a partition field, can't be evaluated
  }

  for (PartitionField part : parts) {

    // checking the strict projection
    UnboundPredicate<?> strictProjection = ((Transform<T, ?>) part.transform()).projectStrict(part.name(), pred);
    Expression strictResult = null;

    if (strictProjection != null) {
      Expression bound = strictProjection.bind(spec.partitionType(), caseSensitive);
      if (bound instanceof BoundPredicate) {
        strictResult = super.predicate((BoundPredicate<?>) bound);
      } else {
        // if the result is not a predicate, then it must be a constant like alwaysTrue or alwaysFalse
        strictResult = bound;
      }
    }

    if (strictResult != null && strictResult.op() == Expression.Operation.TRUE) {
      // the strict projection is true: every row in the partition matches, so no residual is needed
      return Expressions.alwaysTrue();
    }

    // checking the inclusive projection
    UnboundPredicate<?> inclusiveProjection = ((Transform<T, ?>) part.transform()).project(part.name(), pred);
    Expression inclusiveResult = null;
    if (inclusiveProjection != null) {
      Expression boundInclusive = inclusiveProjection.bind(spec.partitionType(), caseSensitive);
      if (boundInclusive instanceof BoundPredicate) {
        // using predicate method specific to inclusive
        inclusiveResult = super.predicate((BoundPredicate<?>) boundInclusive);
      } else {
        // if the result is not a predicate, then it must be a constant like alwaysTrue or alwaysFalse
        inclusiveResult = boundInclusive;
      }
    }

    if (inclusiveResult != null && inclusiveResult.op() == Expression.Operation.FALSE) {
      // the inclusive projection is false: no row in the partition can match the original predicate
      return Expressions.alwaysFalse();
    }

  }

  // neither the strict nor the inclusive projection was conclusive, so return the original predicate
  return pred;
}
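
A minimal sketch of how this evaluator is driven, assuming spec is a day(ts)-partitioned spec and dataFile comes from a manifest entry; the residual is whatever still has to be checked row by row after partition pruning.

Expression filter = Expressions.greaterThan("ts", "2019-01-01T01:00:00");
ResidualEvaluator residuals = ResidualEvaluator.of(spec, filter, true /* caseSensitive */);

// alwaysTrue(): every row in the file matches; alwaysFalse(): none do;
// any other expression must be evaluated against individual rows
Expression residual = residuals.residualFor(dataFile.partition());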