Java Code Examples for org.apache.iceberg.Table#currentSnapshot()

The following examples show how to use org.apache.iceberg.Table#currentSnapshot(). Each example is taken from an open source project; the source file and license are noted above each snippet.
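Before the full test examples, here is a minimal sketch of the call itself. It assumes table is an already-loaded org.apache.iceberg.Table and uses the org.apache.iceberg.Snapshot accessors; currentSnapshot() returns null for a table with no commits.

// Minimal sketch (assumes an already-loaded Table named "table")
Snapshot current = table.currentSnapshot();
if (current != null) {
  long snapshotId = current.snapshotId();        // unique id of this snapshot
  Long parentId = current.parentId();            // null for the first snapshot
  long committedAt = current.timestampMillis();  // commit time in epoch millis
  String operation = current.operation();        // e.g. "append", "overwrite", "replace"
  int manifestCount = current.allManifests().size();
}
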
Example 1
Source File: HiveCreateReplaceTableTest.java    From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnAndAppend() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  AppendFiles append = txn.newAppend();
  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();
  append.appendFile(dataFile);
  append.commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Snapshot snapshot = table.currentSnapshot();
  Assert.assertEquals("Table should have one manifest file", 1, snapshot.allManifests().size());
}
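The snapshot committed by the append above also records the operation type and per-commit summary counters. A short continuation sketch of the same test; "added-data-files" and "added-records" are standard Iceberg snapshot summary keys, and summary values are strings:

// Sketch: continuing the test above with the same "snapshot" variable
Assert.assertEquals("Commit should be an append", "append", snapshot.operation());
Assert.assertEquals("Should add one data file", "1", snapshot.summary().get("added-data-files"));
Assert.assertEquals("Should add one record", "1", snapshot.summary().get("added-records"));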
 
Example 2
Source File: TestRewriteManifestsAction.java    From iceberg with Apache License 2.0
@Test
public void testRewriteImportedManifests() throws IOException {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA)
      .identity("c3")
      .build();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")
  );
  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();

  try {
    Dataset<Row> inputDF = spark.createDataFrame(records, ThreeColumnRecord.class);
    inputDF.select("c1", "c2", "c3")
        .write()
        .format("parquet")
        .mode("overwrite")
        .option("path", parquetTableLocation)
        .partitionBy("c3")
        .saveAsTable("parquet_table");

    File stagingDir = temp.newFolder("staging-dir");
    SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());

    Snapshot snapshot = table.currentSnapshot();

    Actions actions = Actions.forTable(table);

    RewriteManifestsActionResult result = actions.rewriteManifests()
        .rewriteIf(manifest -> true)
        .stagingLocation(temp.newFolder().toString())
        .execute();

    Assert.assertEquals("Action should rewrite all manifests", snapshot.allManifests(), result.deletedManifests());
    Assert.assertEquals("Action should add 1 manifest", 1, result.addedManifests().size());

  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
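The rewrite commits a new snapshot of its own, so the table's refreshed current snapshot should reference only the consolidated manifest. A hedged continuation that would sit inside the try block above, after the two assertions:

// Sketch: verify the table's new current snapshot after the rewrite
table.refresh();
Snapshot rewritten = table.currentSnapshot();
Assert.assertEquals("Rewritten snapshot should have one manifest", 1, rewritten.allManifests().size());
Assert.assertNotEquals("Rewrite should commit a new snapshot", snapshot.snapshotId(), rewritten.snapshotId());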
 
Example 3
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public void testEntriesTable() throws Exception {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table entriesTable = loadTable(tableIdentifier, "entries");

  List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "1"));

  Dataset<Row> inputDf = spark.createDataFrame(records, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "entries"))
      .collectAsList();

  Snapshot snapshot = table.currentSnapshot();

  Assert.assertEquals("Should only contain one manifest", 1, snapshot.allManifests().size());

  InputFile manifest = table.io().newInputFile(snapshot.allManifests().get(0).path());
  List<GenericData.Record> expected = Lists.newArrayList();
  try (CloseableIterable<GenericData.Record> rows = Avro.read(manifest).project(entriesTable.schema()).build()) {
    // each row must inherit snapshot_id and sequence_number
    rows.forEach(row -> {
      row.put(2, 0L);
      GenericData.Record file = (GenericData.Record) row.get("data_file");
      file.put(0, FileContent.DATA.id());
      expected.add(row);
    });
  }

  Assert.assertEquals("Entries table should have one row", 1, expected.size());
  Assert.assertEquals("Actual results should have one row", 1, actual.size());
  TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(0), actual.get(0));
}
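The same read pattern applies to Iceberg's other metadata tables, such as "snapshots" and "manifests". A sketch in the style of the test above (loadLocation is the test class's own helper):

// Sketch: the snapshots metadata table has one row per snapshot, including the current one
List<Row> snapshotRows = spark.read()
    .format("iceberg")
    .load(loadLocation(tableIdentifier, "snapshots"))
    .collectAsList();
Assert.assertEquals("Should have one snapshot row", 1, snapshotRows.size());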
 
Example 4
Source File: TestSnapshotSelection.java    From iceberg with Apache License 2.0
@Test
public void testSnapshotSelectionById() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, tableLocation);

  // produce the first snapshot
  List<SimpleRecord> firstBatchRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
  firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  // produce the second snapshot
  List<SimpleRecord> secondBatchRecords = Lists.newArrayList(
      new SimpleRecord(4, "d"),
      new SimpleRecord(5, "e"),
      new SimpleRecord(6, "f")
  );
  Dataset<Row> secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class);
  secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots()));

  // verify records in the current snapshot
  Dataset<Row> currentSnapshotResult = spark.read()
      .format("iceberg")
      .load(tableLocation);
  List<SimpleRecord> currentSnapshotRecords = currentSnapshotResult.orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();
  List<SimpleRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(firstBatchRecords);
  expectedRecords.addAll(secondBatchRecords);
  Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords);

  // verify records in the previous snapshot
  Snapshot currentSnapshot = table.currentSnapshot();
  Long parentSnapshotId = currentSnapshot.parentId();
  Dataset<Row> previousSnapshotResult = spark.read()
      .format("iceberg")
      .option("snapshot-id", parentSnapshotId)
      .load(tableLocation);
  List<SimpleRecord> previousSnapshotRecords = previousSnapshotResult.orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();
  Assert.assertEquals("Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords);
}
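 
Snapshot selection also works by commit time through the as-of-timestamp read option. A sketch continuing the test above, resolving the parent snapshot's timestamp via Table#snapshot(long) (assumes the two commits do not share the same millisecond timestamp):

// Sketch: time travel by timestamp (epoch millis) rather than snapshot id
long parentCommitTime = table.snapshot(parentSnapshotId).timestampMillis();
Dataset<Row> asOfTimestampResult = spark.read()
    .format("iceberg")
    .option("as-of-timestamp", String.valueOf(parentCommitTime))
    .load(tableLocation);
List<SimpleRecord> asOfTimestampRecords = asOfTimestampResult.orderBy("id")
    .as(Encoders.bean(SimpleRecord.class))
    .collectAsList();
Assert.assertEquals("As-of-timestamp rows should match", firstBatchRecords, asOfTimestampRecords);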