org.apache.iceberg.AppendFiles Java Examples

The following examples show how to use org.apache.iceberg.AppendFiles. Each example is taken from an open source project; the source file, project, and license are noted above it.
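
Before the project-specific examples, here is a minimal sketch of the basic AppendFiles workflow: build a DataFile describing an already-written file, open an append operation on the table, and commit it to produce a new snapshot. The class name, file path, and statistics below are placeholders and are not taken from any of the examples; for a partitioned table the builder would also need partition values (see withPartition in Example #2).

import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;

public class AppendFilesSketch {
  // Appends one already-written Parquet file to the given (unpartitioned) table.
  public static void appendExistingFile(Table table) {
    // Describe the data file; path, size, and record count here are placeholders.
    DataFile dataFile = DataFiles.builder(table.spec())
        .withPath("/tmp/warehouse/db/tbl/data/file-0.parquet")
        .withFormat(FileFormat.PARQUET)
        .withFileSizeInBytes(1024)
        .withRecordCount(10)
        .build();

    // newAppend() starts the operation; commit() creates a new table snapshot.
    AppendFiles append = table.newAppend();
    append.appendFile(dataFile);
    append.commit();
  }
}
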
Example #1
Source File: HiveCreateReplaceTableTest.java    From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnAndAppend() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  AppendFiles append = txn.newAppend();
  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();
  append.appendFile(dataFile);
  append.commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Snapshot snapshot = table.currentSnapshot();
  Assert.assertTrue("Table should have one manifest file", snapshot.allManifests().size() == 1);
}
 
Example #2
Source File: IcebergMetadata.java    From presto with Apache License 2.0
@Override
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, Collection<ComputedStatistics> computedStatistics)
{
    IcebergWritableTableHandle table = (IcebergWritableTableHandle) insertHandle;
    org.apache.iceberg.Table icebergTable = transaction.table();

    List<CommitTaskData> commitTasks = fragments.stream()
            .map(slice -> commitTaskCodec.fromJson(slice.getBytes()))
            .collect(toImmutableList());

    Type[] partitionColumnTypes = icebergTable.spec().fields().stream()
            .map(field -> field.transform().getResultType(
                    icebergTable.schema().findType(field.sourceId())))
            .toArray(Type[]::new);

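    // A fast append adds the new files in a fresh manifest rather than rewriting existing manifests.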
    AppendFiles appendFiles = transaction.newFastAppend();
    for (CommitTaskData task : commitTasks) {
        HdfsContext context = new HdfsContext(session, table.getSchemaName(), table.getTableName());
        Configuration configuration = hdfsEnvironment.getConfiguration(context, new Path(task.getPath()));

        DataFiles.Builder builder = DataFiles.builder(icebergTable.spec())
                .withInputFile(HadoopInputFile.fromLocation(task.getPath(), configuration))
                .withFormat(table.getFileFormat())
                .withMetrics(task.getMetrics().metrics());

        if (!icebergTable.spec().fields().isEmpty()) {
            String partitionDataJson = task.getPartitionDataJson()
                    .orElseThrow(() -> new VerifyException("No partition data for partitioned table"));
            builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes));
        }

        appendFiles.appendFile(builder.build());
    }

    appendFiles.commit();
    transaction.commitTransaction();

    return Optional.of(new HiveWrittenPartitions(commitTasks.stream()
            .map(CommitTaskData::getPath)
            .collect(toImmutableList())));
}
 
Example #3
Source File: TestIcebergPartitions.java    From dremio-oss with Apache License 2.0
@Test
public void testNonIdentityPartitions() throws Exception {
  File root = tempDir.newFolder();
  HadoopTables tables = new HadoopTables(conf);
  PartitionSpec partitionSpec = PartitionSpec
      .builderFor(schema)
      .bucket(NAME, 2)
      .build();
  Table table = tables.create(schema, partitionSpec, root.getAbsolutePath());

  // Append some data files.
  Transaction transaction = table.newTransaction();
  AppendFiles appendFiles = transaction.newAppend();
  appendFiles.appendFile(createDataFile(root, "d1", 1, "jack", 100));
  appendFiles.appendFile(createDataFile(root, "d2", 1, "jack", 200));
  appendFiles.appendFile(createDataFile(root, "d3", 2, "jill", 300));
  appendFiles.appendFile(createDataFile(root, "d4", 2, "jill", 400));
  appendFiles.appendFile(createDataFile(root, "d5", 2, "jill", 500));
  appendFiles.commit();
  transaction.commitTransaction();

  try {
    IcebergTableInfo tableInfo = new IcebergTableWrapper(getSabotContext(),
        HadoopFileSystem.get(fs), conf, root.getAbsolutePath()).getTableInfo();
    fail("Expected error while reading metadata of iceberg table with non-identity partition field");
  } catch (Exception ex) {
    Assert.assertTrue("UserException expected", ex instanceof UserException);
    UserException uex = ((UserException) ex);
    Assert.assertEquals("Invalid ErrorType. Expected " + UserBitShared.DremioPBError.ErrorType.UNSUPPORTED_OPERATION
            + " but got " + uex.getErrorType(), UserBitShared.DremioPBError.ErrorType.UNSUPPORTED_OPERATION, uex.getErrorType());
    String expectedErrorMsg = "Column values and partition values are not same for [name] column";
    Assert.assertTrue("Expected message to contain " + expectedErrorMsg + " but was "
        + uex.getOriginalMessage() + " instead", uex.getOriginalMessage().contains(expectedErrorMsg));
  }
}
 
Example #4
Source File: TestIcebergInputFormat.java    From iceberg with Apache License 2.0
@Test
public void testIdentityPartitionProjections() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(LOG_SCHEMA, IDENTITY_PARTITION_SPEC,
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());

  List<Record> inputRecords = RandomGenericData.generate(LOG_SCHEMA, 10, 0);
  Integer idx = 0;
  AppendFiles append = table.newAppend();
  for (Record record : inputRecords) {
    record.set(1, "2020-03-2" + idx);
    record.set(2, idx.toString());
    append.appendFile(writeFile(table, Row.of("2020-03-2" + idx, idx.toString()), format, ImmutableList.of(record)));
    idx += 1;
  }
  append.commit();

  // individual fields
  validateIdentityPartitionProjections(location.toString(), withColumns("date"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("id"), inputRecords);
  // field pairs
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "level"), inputRecords);
  // out-of-order pairs
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "date"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "date"), inputRecords);
  // full projection
  validateIdentityPartitionProjections(location.toString(), LOG_SCHEMA, inputRecords);
  // out-of-order triplets
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "level", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "date", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "message", "level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "message", "date"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "date", "level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "level", "date"), inputRecords);
}
 
Example #5
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
/**
 * Import files from given partitions to an Iceberg table.
 *
 * @param spark a Spark session
 * @param partitions partitions to import
 * @param targetTable an Iceberg table where to import the data
 * @param spec a partition spec
 * @param stagingDir a staging directory to store temporary manifest files
 */
public static void importSparkPartitions(
    SparkSession spark, List<SparkPartition> partitions, Table targetTable, PartitionSpec spec, String stagingDir) {
  Configuration conf = spark.sessionState().newHadoopConf();
  SerializableConfiguration serializableConf = new SerializableConfiguration(conf);
  int parallelism = Math.min(partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism());
  int numShufflePartitions = spark.sessionState().conf().numShufflePartitions();
  MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties());

  JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
  JavaRDD<SparkPartition> partitionRDD = sparkContext.parallelize(partitions, parallelism);

  Dataset<SparkPartition> partitionDS = spark.createDataset(
      partitionRDD.rdd(),
      Encoders.javaSerialization(SparkPartition.class));

  List<ManifestFile> manifests = partitionDS
      .flatMap((FlatMapFunction<SparkPartition, DataFile>) sparkPartition ->
              listPartition(sparkPartition, spec, serializableConf, metricsConfig).iterator(),
          Encoders.javaSerialization(DataFile.class))
      .repartition(numShufflePartitions)
      .map((MapFunction<DataFile, Tuple2<String, DataFile>>) file ->
              Tuple2.apply(file.path().toString(), file),
          Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class)))
      .orderBy(col("_1"))
      .mapPartitions(
          (MapPartitionsFunction<Tuple2<String, DataFile>, ManifestFile>) fileTuple ->
              buildManifest(serializableConf, spec, stagingDir, fileTuple),
          Encoders.javaSerialization(ManifestFile.class))
      .collectAsList();

  try {
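    // When snapshot ID inheritance is enabled, the appended manifests are committed as-is;
    // otherwise Iceberg rewrites them during the commit and the staged originals can be removed.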
    boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean(
        targetTable.properties(),
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED,
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT);

    AppendFiles append = targetTable.newAppend();
    manifests.forEach(append::appendManifest);
    append.commit();

    if (!snapshotIdInheritanceEnabled) {
      // delete original manifests as they were rewritten before the commit
      deleteManifests(targetTable.io(), manifests);
    }
  } catch (Throwable e) {
    deleteManifests(targetTable.io(), manifests);
    throw e;
  }
}
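
A hypothetical call site for importSparkPartitions might look like the following. This is a sketch, not code from the Iceberg project: the target table location, source table name, and staging directory are placeholders, and it assumes SparkTableUtil also exposes a getPartitions helper in the version in use (verify against the actual class, or assemble the SparkPartition list another way).

import java.util.List;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.spark.sql.SparkSession;

public class ImportPartitionsSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("import-partitions").getOrCreate();

    // Placeholder target table location; replace with a real path.
    Table targetTable = new HadoopTables(spark.sessionState().newHadoopConf())
        .load("/warehouse/db/target_table");

    // Assumes a getPartitions helper exists in this SparkTableUtil version; otherwise
    // build the List<SparkPartition> for the source table yourself.
    List<SparkPartition> partitions = SparkTableUtil.getPartitions(spark, "db.source_table");

    // Placeholder staging directory for the temporary manifests.
    SparkTableUtil.importSparkPartitions(
        spark, partitions, targetTable, targetTable.spec(), "/tmp/iceberg-staging");
  }
}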
 
Example #6
Source File: TestCreateTable.java    From dremio-oss with Apache License 2.0
@Test
public void testDroppingOfMapTypeColumn() throws Exception {
  String table1 = "iceberg_map_test";
  try {
    File table1Folder = new File(getDfsTestTmpSchemaLocation(), table1);
    HadoopTables hadoopTables = new HadoopTables(new Configuration());

    Schema schema = new Schema(
      Types.NestedField.optional(1, "col1", Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get())),
      Types.NestedField.optional(2, "col2", Types.IntegerType.get())
    );
    PartitionSpec spec = PartitionSpec
      .builderFor(schema)
      .build();
    Table table = hadoopTables.create(schema, spec, table1Folder.getPath());
    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    final String testWorkingPath = TestTools.getWorkingPath() + "/src/test/resources/iceberg/mapTest";
    final String parquetFile = "iceberg_map_test.parquet";
    File dataFile = new File(testWorkingPath, parquetFile);
    appendFiles.appendFile(
      DataFiles.builder(spec)
        .withInputFile(Files.localInput(dataFile))
        .withRecordCount(1)
        .withFormat(FileFormat.PARQUET)
        .build()
    );
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
      .sqlQuery("select * from dfs_test.iceberg_map_test")
      .unOrdered()
      .baselineColumns("col2")
      .baselineValues(1)
      .build()
      .run();

    Thread.sleep(1001);
    String insertCommandSql = "insert into  dfs_test.iceberg_map_test select * from (values(2))";
    test(insertCommandSql);
    Thread.sleep(1001);

    testBuilder()
      .sqlQuery("select * from dfs_test.iceberg_map_test")
      .unOrdered()
      .baselineColumns("col2")
      .baselineValues(1)
      .baselineValues(2)
      .build()
      .run();
  } finally {
    FileUtils.deleteQuietly(new File(getDfsTestTmpSchemaLocation(), table1));
  }
}
 
Example #7
Source File: TestIcebergTableDrop.java    From dremio-oss with Apache License 2.0
@Test
public void testDropTable() throws Exception {
  try (AutoCloseable c = enableIcebergTables()) {
    Path rootPath = Paths.get(getDfsTestTmpSchemaLocation(), "iceberg", "nation");
    Files.createDirectories(rootPath);
    String root = rootPath.toString();

    String tableName = "dfs_test.iceberg.nation";

    HadoopTables tables = new HadoopTables(conf);
    Table table = tables.create(schema, null, root);
    IcebergTableInfo tableInfo =
        new IcebergTableWrapper(getSabotContext(), HadoopFileSystem.get(fs), conf, root)
            .getTableInfo();
    assertEquals(tableInfo.getRecordCount(), 0);

    // Append some data files.
    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    appendFiles.appendFile(createDataFile(rootPath.toFile(), "d1"));
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
        .sqlQuery("select count(*) c from " + tableName)
        .unOrdered()
        .baselineColumns("c")
        .baselineValues(25L)
        .build()
        .run();

    testBuilder()
        .sqlQuery("DROP TABLE " + tableName)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, String.format("Table [%s] dropped", tableName))
        .build()
        .run();

    errorMsgTestHelper(
        "select count(*) c from " + tableName, "Table '" + tableName + "' not found");
  }
}
 
Example #8
Source File: TestRefresh.java    From dremio-oss with Apache License 2.0
@Test
public void testRefresh() throws Exception {
  try (AutoCloseable c = enableIcebergTables()) {
    Path rootPath = Paths.get(getDfsTestTmpSchemaLocation(), "iceberg", "metadata_refresh");
    Files.createDirectories(rootPath);
    String root = rootPath.toString();
    String tableName = "dfs_test.iceberg.metadata_refresh";

    HadoopTables tables = new HadoopTables(conf);
    Table table = tables.create(schema, null, root);

    IcebergTableInfo tableInfo =
        new IcebergTableWrapper(getSabotContext(), HadoopFileSystem.get(fs), conf, root)
            .getTableInfo();
    assertEquals(tableInfo.getRecordCount(), 0);

    // Append some data files.
    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    appendFiles.appendFile(createDataFile(rootPath.toFile(), "d1"));
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
        .sqlQuery("select count(*) c from " + tableName)
        .unOrdered()
        .baselineColumns("c")
        .baselineValues(25L)
        .build()
        .run();

    // Sleep so that a later metadata change is detectable via mtime.
    Thread.sleep(1000);

    // refresh without an update
    testBuilder()
        .sqlQuery("ALTER TABLE " + tableName + " REFRESH METADATA")
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(
            true,
            String.format(
                "Table '%s' read signature reviewed but source stated metadata is unchanged, no refresh occurred.",
                tableName))
        .build()
        .run();

    // Do another append
    transaction = table.newTransaction();
    appendFiles = transaction.newAppend();
    appendFiles.appendFile(createDataFile(rootPath.toFile(), "d2"));
    appendFiles.commit();
    transaction.commitTransaction();

    // refresh
    testBuilder()
        .sqlQuery("ALTER TABLE " + tableName + " REFRESH METADATA")
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, String.format("Metadata for table '%s' refreshed.", tableName))
        .build()
        .run();

    // validate increased row count
    testBuilder()
        .sqlQuery("select count(*) c from " + tableName)
        .unOrdered()
        .baselineColumns("c")
        .baselineValues(50L)
        .build()
        .run();
  }
}
 
Example #9
Source File: TestIcebergPartitions.java    From dremio-oss with Apache License 2.0
@Test
public void testPartitions() throws Exception {
  File root = tempDir.newFolder();
  HadoopTables tables = new HadoopTables(conf);
  Table table = tables.create(schema, spec, root.getAbsolutePath());

  // test empty table.
  IcebergTableInfo tableInfo = new IcebergTableWrapper(getSabotContext(),
    HadoopFileSystem.get(fs), conf, root.getAbsolutePath()).getTableInfo();
  assertEquals(tableInfo.getRecordCount(), 0);

  List<String> expectedColumns = Arrays.asList(ID, NAME);
  assertEquals(expectedColumns, tableInfo.getPartitionColumns());

  assertEquals(0, ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()).size());

  // Append some data files.
  Transaction transaction = table.newTransaction();
  AppendFiles appendFiles = transaction.newAppend();
  appendFiles.appendFile(createDataFile(root, "d1", 1, "jack", 100));
  appendFiles.appendFile(createDataFile(root, "d2", 1, "jack", 200));
  appendFiles.appendFile(createDataFile(root, "d3", 2, "jill", 300));
  appendFiles.appendFile(createDataFile(root, "d4", 2, "jill", 400));
  appendFiles.appendFile(createDataFile(root, "d5", 2, "jill", 500));
  appendFiles.commit();
  transaction.commitTransaction();

  tableInfo = new IcebergTableWrapper(getSabotContext(),
    HadoopFileSystem.get(fs), conf, root.getAbsolutePath()).getTableInfo();
  assertEquals(1500, tableInfo.getRecordCount());
  assertEquals(2, ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()).size());

  // validate first partition
  final AtomicLong recordCount = new AtomicLong(0);
  PartitionChunk p1 = findPartition(ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()), 1, "jack");
  assertNotNull(p1);
  assertEquals(2, p1.getSplitCount());
  p1.getSplits().iterator().forEachRemaining(x -> recordCount.addAndGet(x.getRecordCount()));
  assertEquals(300, recordCount.intValue());

  // validate second partition
  PartitionChunk p2 = findPartition(ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()), 2, "jill");
  assertNotNull(p2);

  assertEquals(3, p2.getSplitCount());
  recordCount.set(0);
  p2.getSplits().iterator().forEachRemaining(x -> recordCount.addAndGet(x.getRecordCount()));
  assertEquals(1200, recordCount.intValue());
}