Java Code Examples for org.apache.iceberg.Table#schema()

The following examples show how to use org.apache.iceberg.Table#schema(). Each snippet is taken from an open source project; the source file and license are listed above the code.
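Before the project snippets, here is a minimal standalone sketch of the basic pattern: load a table, call schema(), and inspect its columns. The table location and column layout are hypothetical, not taken from the examples below.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class SchemaExample {
  public static void main(String[] args) {
    // Load an existing Iceberg table by location (hypothetical path).
    Table table = new HadoopTables(new Configuration())
        .load("hdfs://namenode:8020/warehouse/db/events");

    // schema() returns the table's current schema.
    Schema schema = table.schema();
    for (Types.NestedField field : schema.columns()) {
      System.out.printf("%d: %s (%s)%n", field.fieldId(), field.name(), field.type());
    }
  }
}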
Example 1
Source File: RowDataRewriter.java    From iceberg with Apache License 2.0
public RowDataRewriter(Table table, PartitionSpec spec, boolean caseSensitive,
                       Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager) {
  this.schema = table.schema();
  this.spec = spec;
  this.locations = table.locationProvider();
  this.properties = table.properties();
  this.io = io;
  this.encryptionManager = encryptionManager;

  this.caseSensitive = caseSensitive;
  this.nameMapping = table.properties().get(DEFAULT_NAME_MAPPING);

  String formatString = table.properties().getOrDefault(
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
  this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
}
 
Example 2
Source File: TestDataFrameWrites.java    From iceberg with Apache License 2.0
private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException {
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  Iterable<Record> expected = RandomData.generate(tableSchema, 100, 0L);
  writeData(expected, tableSchema, location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> actual = result.collectAsList();

  Iterator<Record> expectedIter = expected.iterator();
  Iterator<Row> actualIter = actual.iterator();
  while (expectedIter.hasNext() && actualIter.hasNext()) {
    assertEqualsSafe(tableSchema.asStruct(), expectedIter.next(), actualIter.next());
  }
  Assert.assertEquals("Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext());

  table.currentSnapshot().addedFiles().forEach(dataFile ->
      Assert.assertTrue(
          String.format(
              "File should have the parent directory %s, but has: %s.",
              expectedDataDir.getAbsolutePath(),
              dataFile.path()),
          URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath())));
}
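
Several of these examples repeat the comment "use the table schema because ids are reassigned". Creating a table assigns fresh column ids, so the Schema passed to create() and the one returned by table.schema() can disagree on ids even though names and types match. A minimal sketch (the local table location and input ids are hypothetical) illustrates the difference:

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class ReassignedIdsExample {
  public static void main(String[] args) {
    // Requested schema with arbitrary field ids.
    Schema requested = new Schema(
        Types.NestedField.required(10, "id", Types.LongType.get()),
        Types.NestedField.optional(20, "data", Types.StringType.get()));

    // Hypothetical local table location.
    Table table = new HadoopTables(new Configuration())
        .create(requested, PartitionSpec.unpartitioned(), "file:///tmp/ids-demo");

    // Same names and types, but ids are reassigned starting from 1, so
    // downstream code should use table.schema() rather than the input schema.
    System.out.println(requested.findField("data").fieldId());      // 20
    System.out.println(table.schema().findField("data").fieldId()); // expected: 2
  }
}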
 
Example 3
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
 
Example 4
Source File: Reader.java    From iceberg with Apache License 2.0
Reader(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
    boolean caseSensitive, DataSourceOptions options) {
  this.table = table;
  this.snapshotId = options.get("snapshot-id").map(Long::parseLong).orElse(null);
  this.asOfTimestamp = options.get("as-of-timestamp").map(Long::parseLong).orElse(null);
  if (snapshotId != null && asOfTimestamp != null) {
    throw new IllegalArgumentException(
        "Cannot scan using both snapshot-id and as-of-timestamp to select the table snapshot");
  }

  this.startSnapshotId = options.get("start-snapshot-id").map(Long::parseLong).orElse(null);
  this.endSnapshotId = options.get("end-snapshot-id").map(Long::parseLong).orElse(null);
  if (snapshotId != null || asOfTimestamp != null) {
    if (startSnapshotId != null || endSnapshotId != null) {
      throw new IllegalArgumentException(
          "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either snapshot-id or " +
              "as-of-timestamp is specified");
    }
  } else {
    if (startSnapshotId == null && endSnapshotId != null) {
      throw new IllegalArgumentException("Cannot only specify option end-snapshot-id to do incremental scan");
    }
  }

  // look for split behavior overrides in options
  this.splitSize = options.get("split-size").map(Long::parseLong).orElse(null);
  this.splitLookback = options.get("lookback").map(Integer::parseInt).orElse(null);
  this.splitOpenFileCost = options.get("file-open-cost").map(Long::parseLong).orElse(null);

  if (io.getValue() instanceof HadoopFileIO) {
    String scheme = "no_exist";
    try {
      Configuration conf = new Configuration(SparkSession.active().sparkContext().hadoopConfiguration());
      // merge hadoop config set on table
      mergeIcebergHadoopConfs(conf, table.properties());
      // merge hadoop config passed as options and overwrite the one on table
      mergeIcebergHadoopConfs(conf, options.asMap());
      FileSystem fs = new Path(table.location()).getFileSystem(conf);
      scheme = fs.getScheme().toLowerCase(Locale.ENGLISH);
    } catch (IOException ioe) {
      LOG.warn("Failed to get Hadoop Filesystem", ioe);
    }
    this.localityPreferred = options.get("locality").map(Boolean::parseBoolean)
        .orElse(LOCALITY_WHITELIST_FS.contains(scheme));
  } else {
    this.localityPreferred = false;
  }

  this.schema = table.schema();
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.caseSensitive = caseSensitive;

  this.batchReadsEnabled = options.get("vectorization-enabled").map(Boolean::parseBoolean).orElse(
      PropertyUtil.propertyAsBoolean(table.properties(),
          TableProperties.PARQUET_VECTORIZATION_ENABLED, TableProperties.PARQUET_VECTORIZATION_ENABLED_DEFAULT));
  this.batchSize = options.get("batch-size").map(Integer::parseInt).orElse(
      PropertyUtil.propertyAsInt(table.properties(),
        TableProperties.PARQUET_BATCH_SIZE, TableProperties.PARQUET_BATCH_SIZE_DEFAULT));
}
 
Example 5
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Before
public void writeUnpartitionedTable() throws IOException {
  this.parent = temp.newFolder("TestFilteredScan");
  this.unpartitioned = new File(parent, "unpartitioned");
  File dataFolder = new File(unpartitioned, "data");
  Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs());

  Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString());
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  FileFormat fileFormat = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));

  File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString()));

  // create records using the table's schema
  this.records = testRecords(tableSchema);

  switch (fileFormat) {
    case AVRO:
      try (FileAppender<Record> writer = Avro.write(localOutput(testFile))
          .createWriterFunc(DataWriter::create)
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;

    case PARQUET:
      try (FileAppender<Record> writer = Parquet.write(localOutput(testFile))
          .createWriterFunc(GenericParquetWriter::buildWriter)
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;

    case ORC:
      try (FileAppender<Record> writer = ORC.write(localOutput(testFile))
          .createWriterFunc(GenericOrcWriter::buildWriter)
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withRecordCount(records.size())
      .withFileSizeInBytes(testFile.length())
      .withPath(testFile.toString())
      .build();

  table.newAppend().appendFile(file).commit();
}
 
Example 6
Source File: TestSparkReadProjection.java    From iceberg with Apache License 2.0
@Override
protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema,
                              Record record) throws IOException {
  File parent = temp.newFolder(desc);
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

  File testFile = new File(dataFolder, format.addExtension(UUID.randomUUID().toString()));

  Table table = TestTables.create(location, desc, writeSchema, PartitionSpec.unpartitioned());
  try {
    // Important: use the table's schema for the rest of the test
    // When tables are created, the column ids are reassigned.
    Schema tableSchema = table.schema();

    switch (format) {
      case AVRO:
        try (FileAppender<Record> writer = Avro.write(localOutput(testFile))
            .createWriterFunc(DataWriter::create)
            .schema(tableSchema)
            .build()) {
          writer.add(record);
        }
        break;

      case PARQUET:
        try (FileAppender<Record> writer = Parquet.write(localOutput(testFile))
            .createWriterFunc(GenericParquetWriter::buildWriter)
            .schema(tableSchema)
            .build()) {
          writer.add(record);
        }
        break;

      case ORC:
        try (FileAppender<org.apache.iceberg.data.Record> writer = ORC.write(localOutput(testFile))
            .createWriterFunc(GenericOrcWriter::buildWriter)
            .schema(tableSchema)
            .build()) {
          writer.add(record);
        }
        break;
    }

    DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
        .withRecordCount(100)
        .withFileSizeInBytes(testFile.length())
        .withPath(testFile.toString())
        .build();

    table.newAppend().appendFile(file).commit();

    table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit();

    // rewrite the read schema for the table's reassigned ids
    Map<Integer, Integer> idMapping = Maps.newHashMap();
    for (int id : allIds(writeSchema)) {
      // translate each id to the original schema's column name, then to the new schema's id
      String originalName = writeSchema.findColumnName(id);
      idMapping.put(id, tableSchema.findField(originalName).fieldId());
    }
    Schema expectedSchema = reassignIds(readSchema, idMapping);

    // Set the schema to the expected schema directly to simulate the table schema evolving
    TestTables.replaceMetadata(desc,
        TestTables.readMetadata(desc).updateSchema(expectedSchema, 100));

    Dataset<Row> df = spark.read()
        .format("org.apache.iceberg.spark.source.TestIcebergSource")
        .option("iceberg.table.name", desc)
        .load();

    return SparkValueConverter.convert(readSchema, df.collectAsList().get(0));

  } finally {
    TestTables.clearTables();
  }
}
 
Example 7
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForNestedParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA)
      .identity("strCol")
      .build();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  properties.put("write.metadata.metrics.column.longCol", "counts");
  properties.put("write.metadata.metrics.column.record.id", "full");
  properties.put("write.metadata.metrics.column.record.data", "truncate(2)");
  Table table = tables.create(COMPLEX_SCHEMA, spec, properties, tableLocation);

  Iterable<InternalRow> rows = RandomData.generateSpark(COMPLEX_SCHEMA, 10, 0);
  JavaRDD<InternalRow> rdd = sc.parallelize(Lists.newArrayList(rows));
  Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false);

  df.coalesce(1).write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField longCol = schema.findField("longCol");
  Types.NestedField recordId = schema.findField("record.id");
  Types.NestedField recordData = schema.findField("record.data");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();

    Map<Integer, Long> nullValueCounts = file.nullValueCounts();
    Assert.assertEquals(3, nullValueCounts.size());
    Assert.assertTrue(nullValueCounts.containsKey(longCol.fieldId()));
    Assert.assertTrue(nullValueCounts.containsKey(recordId.fieldId()));
    Assert.assertTrue(nullValueCounts.containsKey(recordData.fieldId()));

    Map<Integer, Long> valueCounts = file.valueCounts();
    Assert.assertEquals(3, valueCounts.size());
    Assert.assertTrue(valueCounts.containsKey(longCol.fieldId()));
    Assert.assertTrue(valueCounts.containsKey(recordId.fieldId()));
    Assert.assertTrue(valueCounts.containsKey(recordData.fieldId()));

    Map<Integer, ByteBuffer> lowerBounds = file.lowerBounds();
    Assert.assertEquals(2, lowerBounds.size());
    Assert.assertTrue(lowerBounds.containsKey(recordId.fieldId()));
    ByteBuffer recordDataLowerBound = lowerBounds.get(recordData.fieldId());
    Assert.assertEquals(2, ByteBuffers.toByteArray(recordDataLowerBound).length);

    Map<Integer, ByteBuffer> upperBounds = file.upperBounds();
    Assert.assertEquals(2, upperBounds.size());
    Assert.assertTrue(upperBounds.containsKey(recordId.fieldId()));
    ByteBuffer recordDataUpperBound = upperBounds.get(recordData.fieldId());
    Assert.assertEquals(2, ByteBuffers.toByteArray(recordDataUpperBound).length);
  }
}
 
Example 8
Source File: TestAvroScan.java    From iceberg with Apache License 2.0
@Override
protected void writeAndValidate(Schema schema) throws IOException {
  File parent = temp.newFolder("avro");
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  dataFolder.mkdirs();

  File avroFile = new File(dataFolder,
      FileFormat.AVRO.addExtension(UUID.randomUUID().toString()));

  HadoopTables tables = new HadoopTables(CONF);
  Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString());

  // Important: use the table's schema for the rest of the test
  // When tables are created, the column ids are reassigned.
  Schema tableSchema = table.schema();

  List<Record> expected = RandomData.generateList(tableSchema, 100, 1L);

  try (FileAppender<Record> writer = Avro.write(localOutput(avroFile))
      .schema(tableSchema)
      .build()) {
    writer.addAll(expected);
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withRecordCount(100)
      .withFileSizeInBytes(avroFile.length())
      .withPath(avroFile.toString())
      .build();

  table.newAppend().appendFile(file).commit();

  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> rows = df.collectAsList();
  Assert.assertEquals("Should contain 100 rows", 100, rows.size());

  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(tableSchema.asStruct(), expected.get(i), rows.get(i));
  }
}
 
Example 9
Source File: TestParquetScan.java    From iceberg with Apache License 2.0
@Override
protected void writeAndValidate(Schema schema) throws IOException {
  Assume.assumeTrue("Cannot handle non-string map keys in parquet-avro",
      null == TypeUtil.find(
          schema,
          type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));

  File parent = temp.newFolder("parquet");
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  dataFolder.mkdirs();

  File parquetFile = new File(dataFolder,
      FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));

  HadoopTables tables = new HadoopTables(CONF);
  Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString());

  // Important: use the table's schema for the rest of the test
  // When tables are created, the column ids are reassigned.
  Schema tableSchema = table.schema();

  List<GenericData.Record> expected = RandomData.generateList(tableSchema, 100, 1L);

  try (FileAppender<GenericData.Record> writer = Parquet.write(localOutput(parquetFile))
      .schema(tableSchema)
      .build()) {
    writer.addAll(expected);
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withFileSizeInBytes(parquetFile.length())
      .withPath(parquetFile.toString())
      .withRecordCount(100)
      .build();

  table.newAppend().appendFile(file).commit();
  table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit();

  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> rows = df.collectAsList();
  Assert.assertEquals("Should contain 100 rows", 100, rows.size());

  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(tableSchema.asStruct(), expected.get(i), rows.get(i));
  }
}
 
Example 10
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Before
public void writeUnpartitionedTable() throws IOException {
  this.parent = temp.newFolder("TestFilteredScan");
  this.unpartitioned = new File(parent, "unpartitioned");
  File dataFolder = new File(unpartitioned, "data");
  Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs());

  Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString());
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  FileFormat fileFormat = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));

  File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString()));

  // create records using the table's schema
  org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(tableSchema, "test");
  this.records = testRecords(avroSchema);

  switch (fileFormat) {
    case AVRO:
      try (FileAppender<Record> writer = Avro.write(localOutput(testFile))
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;

    case PARQUET:
      try (FileAppender<Record> writer = Parquet.write(localOutput(testFile))
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withRecordCount(records.size())
      .withFileSizeInBytes(testFile.length())
      .withPath(testFile.toString())
      .build();

  table.newAppend().appendFile(file).commit();
}
 
Example 11
Source File: IcebergTableHandler.java    From metacat with Apache License 2.0
/**
 * Updates the iceberg schema if the provided tableInfo has updated field comments.
 *
 * @param tableInfo table information
 * @return true if an update is done
 */
public boolean update(final TableInfo tableInfo) {
    boolean result = false;
    final List<FieldInfo> fields = tableInfo.getFields();
    if (fields != null && !fields.isEmpty()
        // This parameter is only sent during data change and not during schema change.
        && Strings.isNullOrEmpty(tableInfo.getMetadata().get(DirectSqlTable.PARAM_PREVIOUS_METADATA_LOCATION))) {
        final QualifiedName tableName = tableInfo.getName();
        final String tableMetadataLocation = HiveTableUtil.getIcebergTableMetadataLocation(tableInfo);
        if (Strings.isNullOrEmpty(tableMetadataLocation)) {
            final String message = String.format("No metadata location specified for table %s", tableName);
            log.error(message);
            throw new MetacatBadRequestException(message);
        }
        final IcebergMetastoreTables icebergMetastoreTables = new IcebergMetastoreTables(
            new IcebergTableOps(conf, tableMetadataLocation,
                connectorContext.getConfig(),
                icebergTableOpsProxy));
        final Table table = icebergMetastoreTables.loadTable(
            HiveTableUtil.qualifiedNameToTableIdentifier(tableName));
        final UpdateSchema updateSchema = table.updateSchema();
        final Schema schema = table.schema();
        for (FieldInfo field : fields) {
            final Types.NestedField iField = schema.findField(field.getName());
            if (iField != null && !Objects.equals(field.getComment(), iField.doc())) {
                updateSchema.updateColumnDoc(field.getName(), field.getComment());
                result = true;
            }
        }
        if (result) {
            updateSchema.commit();
            final String newTableMetadataLocation = icebergMetastoreTables.getTableOps().currentMetadataLocation();
            if (!tableMetadataLocation.equalsIgnoreCase(newTableMetadataLocation)) {
                tableInfo.getMetadata().put(DirectSqlTable.PARAM_PREVIOUS_METADATA_LOCATION, tableMetadataLocation);
                tableInfo.getMetadata().put(DirectSqlTable.PARAM_METADATA_LOCATION,
                    newTableMetadataLocation);
            }
        }
    }
    return result;
}