Java Code Examples for org.apache.iceberg.Table#refresh()

The following examples show how to use org.apache.iceberg.Table#refresh(). The original project and source file are noted above each example.
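
Before the examples, here is a minimal sketch of what refresh() does: it reloads the latest committed metadata into an existing Table handle so that the handle reflects changes made by other writers. The warehouse location and class name below are hypothetical and only for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;

public class RefreshUsageSketch {
  public static void main(String[] args) {
    // load a table handle from a (hypothetical) Hadoop warehouse location
    HadoopTables tables = new HadoopTables(new Configuration());
    Table table = tables.load("/tmp/warehouse/db/sample_table");

    // ... meanwhile, another process or handle commits new snapshots or schema changes ...

    // refresh() reloads the current table metadata into this handle
    table.refresh();
    System.out.println("Current snapshot after refresh: " + table.currentSnapshot());
  }
}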
Example 1
Source File: TestHadoopCommits.java    From iceberg with Apache License 2.0
@Test
public void testStaleMetadata() throws Exception {
  Table tableCopy = TABLES.load(tableLocation);

  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  // prepare changes on the copy without committing
  UpdateSchema updateCopy = tableCopy.updateSchema()
      .addColumn("m", Types.IntegerType.get());
  updateCopy.apply();

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertNotEquals("Unmodified copy should be out of date after update",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  // refresh the copy to pick up the committed change
  tableCopy.refresh();

  Assert.assertEquals("Copy should be back in sync",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  AssertHelpers.assertThrows("Should fail with stale base metadata",
      CommitFailedException.class, "based on stale table metadata", updateCopy::commit);

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
 
Example 2
Source File: TestDataFrameWrites.java    From iceberg with Apache License 2.0
private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException {
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  Iterable<Record> expected = RandomData.generate(tableSchema, 100, 0L);
  writeData(expected, tableSchema, location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> actual = result.collectAsList();

  Iterator<Record> expectedIter = expected.iterator();
  Iterator<Row> actualIter = actual.iterator();
  while (expectedIter.hasNext() && actualIter.hasNext()) {
    assertEqualsSafe(tableSchema.asStruct(), expectedIter.next(), actualIter.next());
  }
  Assert.assertEquals("Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext());

  table.currentSnapshot().addedFiles().forEach(dataFile ->
      Assert.assertTrue(
          String.format(
              "File should have the parent directory %s, but has: %s.",
              expectedDataDir.getAbsolutePath(),
              dataFile.path()),
          URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath())));
}
 
Example 3
Source File: TestHadoopCommits.java    From iceberg with Apache License 2.0
@Test
public void testStaleVersionHint() throws Exception {
  Table stale = TABLES.load(tableLocation);

  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      2, readVersionHint());

  Assert.assertNotEquals("Stable table schema should not match",
      UPDATED_SCHEMA.asStruct(), stale.schema().asStruct());

  // roll the version hint back to 1
  replaceVersionHint(1);

  Table reloaded = TABLES.load(tableLocation);
  Assert.assertEquals("Updated schema for newly loaded table should match",
      UPDATED_SCHEMA.asStruct(), reloaded.schema().asStruct());

  stale.refresh();
  Assert.assertEquals("Refreshed schema for stale table should match",
      UPDATED_SCHEMA.asStruct(), reloaded.schema().asStruct());
}
 
Example 4
Source File: TestSparkDataWrite.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedOverwrite() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  // overwrite with the same data; should not produce two copies
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("overwrite")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example 5
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table table = TABLES.create(SCHEMA, spec, location.toString());

  // Do not combine or split files because the tests expect a split per partition.
  // A target split size of 2048 helps us achieve that.
  table.updateProperties().set("read.split.target-size", "2048").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(table.location());

  table.refresh();

  return table;
}
 
Example 6
Source File: TestHiveTableConcurrency.java    From iceberg with Apache License 2.0
@Test
public synchronized void testConcurrentFastAppends() {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(2));

  AtomicInteger barrier = new AtomicInteger(0);
  Tasks.range(2)
      .stopOnFailure().throwFailureWhenFinished()
      .executeWith(executorService)
      .run(index -> {
        for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) {
          while (barrier.get() < numCommittedFiles * 2) {
            try {
              Thread.sleep(10);
            } catch (InterruptedException e) {
              throw new RuntimeException(e);
            }
          }

          icebergTable.newFastAppend().appendFile(file).commit();
          barrier.incrementAndGet();
        }
      });

  icebergTable.refresh();
  Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests().size());
}
 
Example 7
Source File: TestSparkDataWrite.java    From iceberg with Apache License 2.0
@Test
public void testWriteProjection() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, null),
      new SimpleRecord(2, null),
      new SimpleRecord(3, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id").write() // select only id column
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example 8
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public void testAllEntriesTable() throws Exception {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table entriesTable = loadTable(tableIdentifier, "all_entries");

  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
  Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  // delete the first file to verify that entries from all snapshots, not only live files, are listed
  table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit();

  // add a second file
  df2.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  // ensure table data isn't stale
  table.refresh();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "all_entries"))
      .orderBy("snapshot_id")
      .collectAsList();

  List<GenericData.Record> expected = Lists.newArrayList();
  for (ManifestFile manifest : Iterables.concat(Iterables.transform(table.snapshots(), Snapshot::allManifests))) {
    InputFile in = table.io().newInputFile(manifest.path());
    try (CloseableIterable<GenericData.Record> rows = Avro.read(in).project(entriesTable.schema()).build()) {
      // each row must inherit snapshot_id and sequence_number
      rows.forEach(row -> {
        row.put(2, 0L);
        GenericData.Record file = (GenericData.Record) row.get("data_file");
        file.put(0, FileContent.DATA.id());
        expected.add(row);
      });
    }
  }

  expected.sort(Comparator.comparing(o -> (Long) o.get("snapshot_id")));

  Assert.assertEquals("Entries table should have 3 rows", 3, expected.size());
  Assert.assertEquals("Actual results should have 3 rows", 3, actual.size());
  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(i), actual.get(i));
  }
}
 
Example 9
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public void testPartitionsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "partitions_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build());
  Table partitionsTable = loadTable(tableIdentifier, "partitions");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
  Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();
  long firstCommitId = table.currentSnapshot().snapshotId();

  // add a second file
  df2.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "partitions"))
      .orderBy("partition.id")
      .collectAsList();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      partitionsTable.schema(), "partitions"));
  GenericRecordBuilder partitionBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      partitionsTable.schema().findType("partition").asStructType(), "partition"));
  List<GenericData.Record> expected = Lists.newArrayList();
  expected.add(builder
      .set("partition", partitionBuilder.set("id", 1).build())
      .set("record_count", 1L)
      .set("file_count", 1)
      .build());
  expected.add(builder
      .set("partition", partitionBuilder.set("id", 2).build())
      .set("record_count", 1L)
      .set("file_count", 1)
      .build());

  Assert.assertEquals("Partitions table should have two rows", 2, expected.size());
  Assert.assertEquals("Actual results should have two rows", 2, actual.size());
  for (int i = 0; i < 2; i += 1) {
    TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i));
  }

  // check time travel
  List<Row> actualAfterFirstCommit = spark.read()
      .format("iceberg")
      .option("snapshot-id", String.valueOf(firstCommitId))
      .load(loadLocation(tableIdentifier, "partitions"))
      .orderBy("partition.id")
      .collectAsList();

  Assert.assertEquals("Actual results should have one row", 1, actualAfterFirstCommit.size());
  TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0));
}
 
Example 10
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public void testAllManifestsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build());
  Table manifestTable = loadTable(tableIdentifier, "all_manifests");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);

  List<ManifestFile> manifests = Lists.newArrayList();

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  manifests.addAll(table.currentSnapshot().allManifests());

  table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();

  manifests.addAll(table.currentSnapshot().allManifests());

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "all_manifests"))
      .orderBy("path")
      .collectAsList();

  table.refresh();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema(), "manifests"));
  GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary"));
  List<GenericData.Record> expected = Lists.newArrayList(Iterables.transform(manifests, manifest ->
      builder.set("path", manifest.path())
          .set("length", manifest.length())
          .set("partition_spec_id", manifest.partitionSpecId())
          .set("added_snapshot_id", manifest.snapshotId())
          .set("added_data_files_count", manifest.addedFilesCount())
          .set("existing_data_files_count", manifest.existingFilesCount())
          .set("deleted_data_files_count", manifest.deletedFilesCount())
          .set("partition_summaries", Lists.transform(manifest.partitions(), partition ->
              summaryBuilder
                  .set("contains_null", false)
                  .set("lower_bound", "1")
                  .set("upper_bound", "1")
                  .build()
          ))
          .build()
  ));

  expected.sort(Comparator.comparing(o -> o.get("path").toString()));

  Assert.assertEquals("Manifests table should have two manifest rows", 2, actual.size());
  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i));
  }
}
 
Example 11
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public void testManifestsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build());
  Table manifestTable = loadTable(tableIdentifier, "manifests");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "manifests"))
      .collectAsList();

  table.refresh();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema(), "manifests"));
  GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary"));
  List<GenericData.Record> expected = Lists.transform(table.currentSnapshot().allManifests(), manifest ->
      builder.set("path", manifest.path())
          .set("length", manifest.length())
          .set("partition_spec_id", manifest.partitionSpecId())
          .set("added_snapshot_id", manifest.snapshotId())
          .set("added_data_files_count", manifest.addedFilesCount())
          .set("existing_data_files_count", manifest.existingFilesCount())
          .set("deleted_data_files_count", manifest.deletedFilesCount())
          .set("partition_summaries", Lists.transform(manifest.partitions(), partition ->
              summaryBuilder
                  .set("contains_null", false)
                  .set("lower_bound", "1")
                  .set("upper_bound", "1")
                  .build()
              ))
          .build()
  );

  Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size());
  TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(0), actual.get(0));
}
 
Example 12
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public void testSnapshotsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "snapshots_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table snapTable = loadTable(tableIdentifier, "snapshots");

  List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "1"));
  Dataset<Row> inputDf = spark.createDataFrame(records, SimpleRecord.class);

  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();
  long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis();
  long firstSnapshotId = table.currentSnapshot().snapshotId();
  String firstManifestList = table.currentSnapshot().manifestListLocation();

  table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();

  long secondSnapshotTimestamp = table.currentSnapshot().timestampMillis();
  long secondSnapshotId = table.currentSnapshot().snapshotId();
  String secondManifestList = table.currentSnapshot().manifestListLocation();

  // rollback the table state to the first snapshot
  table.rollback().toSnapshotId(firstSnapshotId).commit();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "snapshots"))
      .collectAsList();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots"));
  List<GenericData.Record> expected = Lists.newArrayList(
      builder.set("committed_at", firstSnapshotTimestamp * 1000)
          .set("snapshot_id", firstSnapshotId)
          .set("parent_id", null)
          .set("operation", "append")
          .set("manifest_list", firstManifestList)
          .set("summary", ImmutableMap.of(
              "added-records", "1",
              "added-data-files", "1",
              "changed-partition-count", "1",
              "total-data-files", "1",
              "total-records", "1"
          ))
          .build(),
      builder.set("committed_at", secondSnapshotTimestamp * 1000)
          .set("snapshot_id", secondSnapshotId)
          .set("parent_id", firstSnapshotId)
          .set("operation", "delete")
          .set("manifest_list", secondManifestList)
          .set("summary", ImmutableMap.of(
              "deleted-records", "1",
              "deleted-data-files", "1",
              "changed-partition-count", "1",
              "total-records", "0",
              "total-data-files", "0"
          ))
          .build()
  );

  Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size());
  TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(0), actual.get(0));
  TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(1), actual.get(1));
}
 
Example 13
Source File: TestSparkDataWrite.java    From iceberg with Apache License 2.0
@Test
public void testWriteProjectionWithMiddle() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Schema schema = new Schema(
      optional(1, "c1", Types.IntegerType.get()),
      optional(2, "c2", Types.StringType.get()),
      optional(3, "c3", Types.StringType.get())
  );
  Table table = tables.create(schema, spec, location.toString());

  List<ThreeColumnRecord> expected = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "hello"),
      new ThreeColumnRecord(2, null, "world"),
      new ThreeColumnRecord(3, null, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, ThreeColumnRecord.class);

  df.select("c1", "c3").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<ThreeColumnRecord> actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example 14
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public void testFilesUnpartitionedTable() throws Exception {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "unpartitioned_files_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table entriesTable = loadTable(tableIdentifier, "entries");
  Table filesTable = loadTable(tableIdentifier, "files");

  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
  Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();
  DataFile toDelete = Iterables.getOnlyElement(table.currentSnapshot().addedFiles());

  // add a second file
  df2.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  // delete the first file to test that only live files are listed
  table.newDelete().deleteFile(toDelete).commit();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "files"))
      .collectAsList();

  List<GenericData.Record> expected = Lists.newArrayList();
  for (ManifestFile manifest : table.currentSnapshot().dataManifests()) {
    InputFile in = table.io().newInputFile(manifest.path());
    try (CloseableIterable<GenericData.Record> rows = Avro.read(in).project(entriesTable.schema()).build()) {
      for (GenericData.Record record : rows) {
        if ((Integer) record.get("status") < 2 /* added or existing */) {
          GenericData.Record file = (GenericData.Record) record.get("data_file");
          file.put(0, FileContent.DATA.id());
          expected.add(file);
        }
      }
    }
  }

  Assert.assertEquals("Files table should have one row", 1, expected.size());
  Assert.assertEquals("Actual results should have one row", 1, actual.size());
  TestHelpers.assertEqualsSafe(filesTable.schema().asStruct(), expected.get(0), actual.get(0));
}
 
Example 15
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public void testEntriesTableWithSnapshotIdInheritance() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_inheritance_test");
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build();
  Table table = createTable(tableIdentifier, SCHEMA, spec);

  table.updateProperties()
      .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true")
      .commit();

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b")
  );

  Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
  inputDF.select("id", "data").write()
      .format("parquet")
      .mode("overwrite")
      .partitionBy("id")
      .saveAsTable("parquet_table");

  try {
    String stagingLocation = table.location() + "/metadata";
    SparkTableUtil.importSparkTable(
        spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), table, stagingLocation);

    List<Row> actual = spark.read()
        .format("iceberg")
        .load(loadLocation(tableIdentifier, "entries"))
        .select("sequence_number", "snapshot_id", "data_file")
        .collectAsList();

    table.refresh();

    long snapshotId = table.currentSnapshot().snapshotId();

    Assert.assertEquals("Entries table should have 2 rows", 2, actual.size());
    Assert.assertEquals("Sequence number must match", 0, actual.get(0).getLong(0));
    Assert.assertEquals("Snapshot id must match", snapshotId, actual.get(0).getLong(1));
    Assert.assertEquals("Sequence number must match", 0, actual.get(1).getLong(0));
    Assert.assertEquals("Snapshot id must match", snapshotId, actual.get(1).getLong(1));
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
 
Example 16
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public void testEntriesTable() throws Exception {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table entriesTable = loadTable(tableIdentifier, "entries");

  List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "1"));

  Dataset<Row> inputDf = spark.createDataFrame(records, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "entries"))
      .collectAsList();

  Snapshot snapshot = table.currentSnapshot();

  Assert.assertEquals("Should only contain one manifest", 1, snapshot.allManifests().size());

  InputFile manifest = table.io().newInputFile(snapshot.allManifests().get(0).path());
  List<GenericData.Record> expected = Lists.newArrayList();
  try (CloseableIterable<GenericData.Record> rows = Avro.read(manifest).project(entriesTable.schema()).build()) {
    // each row must inherit snapshot_id and sequence_number
    rows.forEach(row -> {
      row.put(2, 0L);
      GenericData.Record file = (GenericData.Record) row.get("data_file");
      file.put(0, FileContent.DATA.id());
      expected.add(row);
    });
  }

  Assert.assertEquals("Entries table should have one row", 1, expected.size());
  Assert.assertEquals("Actual results should have one row", 1, actual.size());
  TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(0), actual.get(0));
}
 
Example 17
Source File: TestSparkDataWrite.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  table.updateProperties()
      .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger
      .commit();

  List<SimpleRecord> expected = Lists.newArrayListWithCapacity(4000);
  for (int i = 0; i < 4000; i++) {
    expected.add(new SimpleRecord(i, "a"));
  }

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);

  List<DataFile> files = Lists.newArrayList();
  for (ManifestFile manifest : table.currentSnapshot().allManifests()) {
    for (DataFile file : ManifestFiles.read(manifest, table.io())) {
      files.add(file);
    }
  }
  // TODO: ORC does not support the target file size property yet
  if (!format.equals(FileFormat.ORC)) {
    Assert.assertEquals("Should have 4 DataFiles", 4, files.size());
    Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000));
  }
}
 
Example 18
Source File: TestRewriteDataFilesAction.java    From iceberg with Apache License 2.0
@Test
public void testRewriteDataFilesPartitionedTable() {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA)
      .identity("c1")
      .truncate("c2", 2)
      .build();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"),
      new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD")
  );
  writeRecords(records2);

  List<ThreeColumnRecord> records3 = Lists.newArrayList(
      new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"),
      new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG")
  );
  writeRecords(records3);

  List<ThreeColumnRecord> records4 = Lists.newArrayList(
      new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"),
      new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH")
  );
  writeRecords(records4);

  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());

  Actions actions = Actions.forTable(table);

  RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
  Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size());
  Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFiles().size());

  table.refresh();

  CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
  List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
  Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles1.size());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);
  expectedRecords.addAll(records3);
  expectedRecords.addAll(records4);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
 
Example 19
Source File: TestRewriteManifestsAction.java    From iceberg with Apache License 2.0
@Test
public void testRewriteSmallManifestsPartitionedTable() throws IOException {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA)
      .identity("c1")
      .truncate("c2", 2)
      .build();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")
  );
  writeRecords(records2);

  List<ThreeColumnRecord> records3 = Lists.newArrayList(
      new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"),
      new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF")
  );
  writeRecords(records3);

  List<ThreeColumnRecord> records4 = Lists.newArrayList(
      new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"),
      new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH")
  );
  writeRecords(records4);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 4 manifests before rewrite", 4, manifests.size());

  Actions actions = Actions.forTable(table);

  // we will expect to have 2 manifests with 4 entries in each after rewrite
  long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests);
  long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes);

  table.updateProperties()
      .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes))
      .commit();

  RewriteManifestsActionResult result = actions.rewriteManifests()
      .rewriteIf(manifest -> true)
      .execute();

  Assert.assertEquals("Action should rewrite 4 manifests", 4, result.deletedManifests().size());
  Assert.assertEquals("Action should add 2 manifests", 2, result.addedManifests().size());

  table.refresh();

  List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());

  Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount());
  Assert.assertFalse(newManifests.get(0).hasAddedFiles());
  Assert.assertFalse(newManifests.get(0).hasDeletedFiles());

  Assert.assertEquals(4, (long) newManifests.get(1).existingFilesCount());
  Assert.assertFalse(newManifests.get(1).hasAddedFiles());
  Assert.assertFalse(newManifests.get(1).hasDeletedFiles());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);
  expectedRecords.addAll(records3);
  expectedRecords.addAll(records4);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
 
Example 20
Source File: TestRewriteManifestsAction.java    From iceberg with Apache License 2.0
@Test
public void testRewriteSmallManifestsNonPartitionedTable() throws IOException {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")
  );
  writeRecords(records2);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests before rewrite", 2, manifests.size());

  Actions actions = Actions.forTable(table);

  RewriteManifestsActionResult result = actions.rewriteManifests()
      .rewriteIf(manifest -> true)
      .execute();

  Assert.assertEquals("Action should rewrite 2 manifests", 2, result.deletedManifests().size());
  Assert.assertEquals("Action should add 1 manifests", 1, result.addedManifests().size());

  table.refresh();

  List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 1 manifests after rewrite", 1, newManifests.size());

  Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount());
  Assert.assertFalse(newManifests.get(0).hasAddedFiles());
  Assert.assertFalse(newManifests.get(0).hasDeletedFiles());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}