Java Code Examples for org.apache.iceberg.Schema#findField()

The following examples show how to use org.apache.iceberg.Schema#findField(). Each example notes the source file and project it was taken from.
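Before the project examples, here is a minimal sketch of the two findField overloads (lookup by column name and lookup by field ID). The schema below is hypothetical and exists only to illustrate the call pattern; it is not taken from any of the projects listed here.

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class FindFieldSketch {
  public static void main(String[] args) {
    // Hypothetical schema with a nested struct column.
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get()),
        Types.NestedField.optional(3, "record", Types.StructType.of(
            Types.NestedField.required(4, "ts", Types.TimestampType.withZone()))));

    // Lookup by name; nested fields use dot notation (see Example 7 below).
    Types.NestedField byName = schema.findField("record.ts");

    // Lookup by field ID.
    Types.NestedField byId = schema.findField(1);

    // findField returns null when the column does not exist, so callers
    // typically null-check the result, as several examples below do.
    System.out.println(byName.fieldId() + " " + byId.name());
  }
}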
Example 1
Source File: PartitionKey.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
PartitionKey(PartitionSpec spec, Schema inputSchema) {
  this.spec = spec;

  List<PartitionField> fields = spec.fields();
  this.size = fields.size();
  this.partitionTuple = new Object[size];
  this.transforms = new Transform[size];
  this.accessors = (Accessor<InternalRow>[]) Array.newInstance(Accessor.class, size);

  Schema schema = spec.schema();
  Map<Integer, Accessor<InternalRow>> newAccessors = buildAccessors(inputSchema);
  for (int i = 0; i < size; i += 1) {
    PartitionField field = fields.get(i);
    Accessor<InternalRow> accessor = newAccessors.get(field.sourceId());
    if (accessor == null) {
      throw new RuntimeException(
          "Cannot build accessor for field: " + schema.findField(field.sourceId()));
    }
    this.accessors[i] = accessor;
    this.transforms[i] = field.transform();
  }
}
 
Example 2
Source File: NamedReference.java    From iceberg with Apache License 2.0
@Override
public BoundReference<T> bind(Types.StructType struct, boolean caseSensitive) {
  Schema schema = new Schema(struct.fields());
  Types.NestedField field = caseSensitive ?
      schema.findField(name) :
      schema.caseInsensitiveFindField(name);

  ValidationException.check(field != null,
      "Cannot find field '%s' in struct: %s", name, schema.asStruct());

  return new BoundReference<>(field, schema.accessorForField(field.fieldId()));
}
 
Example 3
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
 
Example 4
Source File: SchemaConverter.java    From dremio-oss with Apache License 2.0
public static Schema getChildSchemaForStruct(Schema schema, String structName) {
  if (schema == null) {
    return null;
  }

  NestedField structField = schema.findField(structName);
  if (!structField.type().isStructType()) {
    return null;
  }

  return new Schema(structField.type().asStructType().fields());
}
 
Example 5
Source File: SchemaConverter.java    From dremio-oss with Apache License 2.0
public static Schema getChildSchemaForList(Schema schema, String listName) {
  if (schema == null) {
    return null;
  }

  NestedField listField = schema.findField(listName);
  if (!listField.type().isListType()) {
    return null;
  }

  return new Schema(listField.type().asListType().fields().get(0));
}
 
Example 6
Source File: IcebergOrcFileWriter.java    From presto with Apache License 2.0
private static Metrics computeMetrics(Schema icebergSchema, ColumnMetadata<OrcType> orcColumns, long fileRowCount, Optional<ColumnMetadata<ColumnStatistics>> columnStatistics)
{
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than the table-level row count, and there is no good way to calculate null counts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<OrcColumnId> excludedColumns = getExcludedColumns(orcColumns);

    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();

    // OrcColumnId(0) is the root column that represents file-level schema
    for (int i = 1; i < orcColumns.size(); i++) {
        OrcColumnId orcColumnId = new OrcColumnId(i);
        if (excludedColumns.contains(orcColumnId)) {
            continue;
        }
        OrcType orcColumn = orcColumns.get(orcColumnId);
        ColumnStatistics orcColumnStats = columnStatistics.get().get(orcColumnId);
        int icebergId = getIcebergId(orcColumn);
        Types.NestedField icebergField = icebergSchema.findField(icebergId);
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        toIcebergMinMax(orcColumnStats, icebergField.type()).ifPresent(minMax -> {
            lowerBoundsBuilder.put(icebergId, minMax.getMin());
            upperBoundsBuilder.put(icebergId, minMax.getMax());
        });
    }
    Map<Integer, Long> valueCounts = valueCountsBuilder.build();
    Map<Integer, Long> nullCounts = nullCountsBuilder.build();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.build();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.build();
    return new Metrics(
            fileRowCount,
            null, // TODO: Add column size accounting to ORC column writers
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}
 
Example 7
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForNestedParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA)
      .identity("strCol")
      .build();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  properties.put("write.metadata.metrics.column.longCol", "counts");
  properties.put("write.metadata.metrics.column.record.id", "full");
  properties.put("write.metadata.metrics.column.record.data", "truncate(2)");
  Table table = tables.create(COMPLEX_SCHEMA, spec, properties, tableLocation);

  Iterable<InternalRow> rows = RandomData.generateSpark(COMPLEX_SCHEMA, 10, 0);
  JavaRDD<InternalRow> rdd = sc.parallelize(Lists.newArrayList(rows));
  Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false);

  df.coalesce(1).write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField longCol = schema.findField("longCol");
  Types.NestedField recordId = schema.findField("record.id");
  Types.NestedField recordData = schema.findField("record.data");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();

    Map<Integer, Long> nullValueCounts = file.nullValueCounts();
    Assert.assertEquals(3, nullValueCounts.size());
    Assert.assertTrue(nullValueCounts.containsKey(longCol.fieldId()));
    Assert.assertTrue(nullValueCounts.containsKey(recordId.fieldId()));
    Assert.assertTrue(nullValueCounts.containsKey(recordData.fieldId()));

    Map<Integer, Long> valueCounts = file.valueCounts();
    Assert.assertEquals(3, valueCounts.size());
    Assert.assertTrue(valueCounts.containsKey(longCol.fieldId()));
    Assert.assertTrue(valueCounts.containsKey(recordId.fieldId()));
    Assert.assertTrue(valueCounts.containsKey(recordData.fieldId()));

    Map<Integer, ByteBuffer> lowerBounds = file.lowerBounds();
    Assert.assertEquals(2, lowerBounds.size());
    Assert.assertTrue(lowerBounds.containsKey(recordId.fieldId()));
    ByteBuffer recordDataLowerBound = lowerBounds.get(recordData.fieldId());
    Assert.assertEquals(2, ByteBuffers.toByteArray(recordDataLowerBound).length);

    Map<Integer, ByteBuffer> upperBounds = file.upperBounds();
    Assert.assertEquals(2, upperBounds.size());
    Assert.assertTrue(upperBounds.containsKey(recordId.fieldId()));
    ByteBuffer recordDataUpperBound = upperBounds.get(recordData.fieldId());
    Assert.assertEquals(2, ByteBuffers.toByteArray(recordDataUpperBound).length);
  }
}
 
Example 8
Source File: IcebergTableHandler.java    From metacat with Apache License 2.0
/**
 * Updates the Iceberg schema if the provided tableInfo has updated field comments.
 *
 * @param tableInfo table information
 * @return true if an update is done
 */
public boolean update(final TableInfo tableInfo) {
    boolean result = false;
    final List<FieldInfo> fields = tableInfo.getFields();
    if (fields != null && !fields.isEmpty()
        // This parameter is only sent during data change and not during schema change.
        && Strings.isNullOrEmpty(tableInfo.getMetadata().get(DirectSqlTable.PARAM_PREVIOUS_METADATA_LOCATION))) {
        final QualifiedName tableName = tableInfo.getName();
        final String tableMetadataLocation = HiveTableUtil.getIcebergTableMetadataLocation(tableInfo);
        if (Strings.isNullOrEmpty(tableMetadataLocation)) {
            final String message = String.format("No metadata location specified for table %s", tableName);
            log.error(message);
            throw new MetacatBadRequestException(message);
        }
        final IcebergMetastoreTables icebergMetastoreTables = new IcebergMetastoreTables(
            new IcebergTableOps(conf, tableMetadataLocation,
                connectorContext.getConfig(),
                icebergTableOpsProxy));
        final Table table = icebergMetastoreTables.loadTable(
            HiveTableUtil.qualifiedNameToTableIdentifier(tableName));
        final UpdateSchema updateSchema = table.updateSchema();
        final Schema schema = table.schema();
        for (FieldInfo field : fields) {
            final Types.NestedField iField = schema.findField(field.getName());
            if (iField != null && !Objects.equals(field.getComment(), iField.doc())) {
                updateSchema.updateColumnDoc(field.getName(), field.getComment());
                result = true;
            }
        }
        if (result) {
            updateSchema.commit();
            final String newTableMetadataLocation = icebergMetastoreTables.getTableOps().currentMetadataLocation();
            if (!tableMetadataLocation.equalsIgnoreCase(newTableMetadataLocation)) {
                tableInfo.getMetadata().put(DirectSqlTable.PARAM_PREVIOUS_METADATA_LOCATION, tableMetadataLocation);
                tableInfo.getMetadata().put(DirectSqlTable.PARAM_METADATA_LOCATION,
                    newTableMetadataLocation);
            }
        }
    }
    return result;
}