Java Code Examples for org.apache.iceberg.Table

The following examples show how to use org.apache.iceberg.Table. They are extracted from open source projects; the project, source file, and license are noted above each example.
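Before looking at the individual examples, it may help to see the basic workflow most of them share: obtain a Table (here through HadoopTables, though catalogs such as HadoopCatalog or HiveCatalog return the same interface), inspect its metadata, and commit changes through the builder APIs. This is only a minimal sketch; the schema, property key, and table location below are illustrative placeholders rather than code from any of the projects listed.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class TableQuickStart {
  public static void main(String[] args) {
    // Illustrative location; point this at your own file system or warehouse path.
    String location = "file:///tmp/iceberg-demo-table";

    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.IntegerType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get()));

    HadoopTables tables = new HadoopTables(new Configuration());

    // Create an unpartitioned table at the location ...
    Table table = tables.create(schema, PartitionSpec.unpartitioned(), location);

    // ... or load one that already exists there.
    Table loaded = tables.load(location);

    // Read-only metadata accessors.
    System.out.println(loaded.schema());
    System.out.println(loaded.spec());
    System.out.println(loaded.properties());
    System.out.println(loaded.currentSnapshot()); // null until the first commit

    // Metadata changes go through builder APIs and take effect on commit().
    loaded.updateProperties().set("prop", "value").commit();
  }
}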
Example 1
Source Project: iceberg   Source File: TestSnapshotSelection.java    License: Apache License 2.0
@Test(expected = IllegalArgumentException.class)
public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, tableLocation);

  List<SimpleRecord> firstBatchRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
  firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  long timestamp = System.currentTimeMillis();
  long snapshotId = table.currentSnapshot().snapshotId();
  Dataset<Row> df = spark.read()
      .format("iceberg")
      .option("snapshot-id", snapshotId)
      .option("as-of-timestamp", timestamp)
      .load(tableLocation);

  df.collectAsList();
}
 
Example 2
Source Project: presto   Source File: SnapshotsTable.java    License: Apache License 2.0
private static List<Page> buildPages(ConnectorTableMetadata tableMetadata, ConnectorSession session, Table icebergTable)
{
    PageListBuilder pagesBuilder = PageListBuilder.forTable(tableMetadata);

    TimeZoneKey timeZoneKey = session.getTimeZoneKey();
    icebergTable.snapshots().forEach(snapshot -> {
        pagesBuilder.beginRow();
        pagesBuilder.appendTimestamp(packDateTimeWithZone(snapshot.timestampMillis(), timeZoneKey));
        pagesBuilder.appendBigint(snapshot.snapshotId());
        if (checkNonNull(snapshot.parentId(), pagesBuilder)) {
            pagesBuilder.appendBigint(snapshot.parentId());
        }
        if (checkNonNull(snapshot.operation(), pagesBuilder)) {
            pagesBuilder.appendVarchar(snapshot.operation());
        }
        if (checkNonNull(snapshot.manifestListLocation(), pagesBuilder)) {
            pagesBuilder.appendVarchar(snapshot.manifestListLocation());
        }
        if (checkNonNull(snapshot.summary(), pagesBuilder)) {
            pagesBuilder.appendVarcharVarcharMap(snapshot.summary());
        }
        pagesBuilder.endRow();
    });

    return pagesBuilder.build();
}
 
Example 3
Source Project: presto   Source File: IcebergSplitManager.java    License: Apache License 2.0
@Override
public ConnectorSplitSource getSplits(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorTableHandle handle, SplitSchedulingStrategy splitSchedulingStrategy)
{
    IcebergTableHandle table = (IcebergTableHandle) handle;

    HiveMetastore metastore = transactionManager.get(transaction).getMetastore();
    Table icebergTable = getIcebergTable(metastore, hdfsEnvironment, session, table.getSchemaTableName());

    TableScan tableScan = getTableScan(session, table.getPredicate(), table.getSnapshotId(), icebergTable);

    // TODO Use residual. Right now there is no way to propagate residual to presto but at least we can
    //      propagate it at split level so the parquet pushdown can leverage it.
    IcebergSplitSource splitSource = new IcebergSplitSource(tableScan.planTasks());

    return new ClassLoaderSafeConnectorSplitSource(splitSource, Thread.currentThread().getContextClassLoader());
}
 
Example 4
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table byId = TABLES.create(SCHEMA, spec, location.toString());

  // Do not combine or split files because the tests expect a split per partition.
  // A target split size of 2048 helps us achieve that.
  byId.updateProperties().set("read.split.target-size", "2048").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(byId.location());

  return location;
}
 
Example 5
Source Project: iceberg   Source File: HiveCreateReplaceTableTest.java    License: Apache License 2.0
@Test
public void testReplaceTableTxnTableModifiedConcurrently() {
  Table table = catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());
  Assert.assertTrue("Table should exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, SPEC, false);

  // update the table concurrently
  table.updateProperties()
      .set("another-prop", "another-value")
      .commit();

  txn.updateProperties()
      .set("prop", "value")
      .commit();
  txn.commitTransaction();

  // the replace should still succeed
  table = catalog.loadTable(TABLE_IDENTIFIER);
  Assert.assertNull("Table props should be updated", table.properties().get("another-prop"));
  Assert.assertEquals("Table props should match", "value", table.properties().get("prop"));
}
 
Example 6
Source Project: iceberg   Source File: TestHadoopCatalog.java    License: Apache License 2.0
@Test
public void testCreateAndDropTableWithoutNamespace() throws Exception {
  Configuration conf = new Configuration();
  String warehousePath = temp.newFolder().getAbsolutePath();
  HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath);

  TableIdentifier testTable = TableIdentifier.of("tbl");
  Table table = catalog.createTable(testTable, SCHEMA, PartitionSpec.unpartitioned());

  Assert.assertEquals(table.schema().toString(), TABLE_SCHEMA.toString());
  Assert.assertEquals("hadoop.tbl", table.toString());
  String metaLocation = catalog.defaultWarehouseLocation(testTable);

  FileSystem fs = Util.getFs(new Path(metaLocation), conf);
  Assert.assertTrue(fs.isDirectory(new Path(metaLocation)));

  catalog.dropTable(testTable);
  Assert.assertFalse(fs.isDirectory(new Path(metaLocation)));
}
 
Example 7
Source Project: iceberg   Source File: IcebergStorage.java    License: Apache License 2.0
private Table load(String location, Job job) throws IOException {
  if (iceberg == null) {
    Class<?> tablesImpl = job.getConfiguration().getClass(PIG_ICEBERG_TABLES_IMPL, HadoopTables.class);
    LOG.info("Initializing iceberg tables implementation: {}", tablesImpl);
    iceberg = (Tables) ReflectionUtils.newInstance(tablesImpl, job.getConfiguration());
  }

  Table result = tables.get(location);

  if (result == null) {
    try {
      LOG.info("[{}]: Loading table for location: {}", signature, location);
      result = iceberg.load(location);
      tables.put(location, result);
    } catch (Exception e) {
      throw new FrontendException("Failed to instantiate tables implementation", e);
    }
  }

  return result;
}
 
Example 8
Source Project: iceberg   Source File: HiveTableTest.java    License: Apache License 2.0
@Test
public void testExistingTableUpdate() throws TException {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
  // add a column
  icebergTable.updateSchema().addColumn("data", Types.LongType.get()).commit();

  icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  // Only 2 metadata version files should exist and no manifests should exist
  Assert.assertEquals(2, metadataVersionFiles(TABLE_NAME).size());
  Assert.assertEquals(0, manifestFiles(TABLE_NAME).size());
  Assert.assertEquals(altered.asStruct(), icebergTable.schema().asStruct());

  final org.apache.hadoop.hive.metastore.api.Table table = metastoreClient.getTable(DB_NAME, TABLE_NAME);
  final List<String> hiveColumns = table.getSd().getCols().stream()
      .map(FieldSchema::getName)
      .collect(Collectors.toList());
  final List<String> icebergColumns = altered.columns().stream()
      .map(Types.NestedField::name)
      .collect(Collectors.toList());
  Assert.assertEquals(icebergColumns, hiveColumns);
}
 
Example 9
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                               DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
 
Example 10
Source Project: iceberg   Source File: TestIcebergInputFormat.java    License: Apache License 2.0
@Test
public void testSnapshotReads() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(),
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  table.newAppend()
       .appendFile(writeFile(table, null, format, expectedRecords))
       .commit();
  long snapshotId = table.currentSnapshot().snapshotId();
  table.newAppend()
       .appendFile(writeFile(table, null, format, RandomGenericData.generate(table.schema(), 1, 0L)))
       .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .snapshotId(snapshotId);

  validate(job, expectedRecords);
}
 
Example 11
Source Project: iceberg   Source File: IcebergSourceFlatDataBenchmark.java    License: Apache License 2.0
@Override
protected final Table initTable() {
  Schema schema = new Schema(
      required(1, "longCol", Types.LongType.get()),
      required(2, "intCol", Types.IntegerType.get()),
      required(3, "floatCol", Types.FloatType.get()),
      optional(4, "doubleCol", Types.DoubleType.get()),
      optional(5, "decimalCol", Types.DecimalType.of(20, 5)),
      optional(6, "dateCol", Types.DateType.get()),
      optional(7, "timestampCol", Types.TimestampType.withZone()),
      optional(8, "stringCol", Types.StringType.get()));
  PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
  HadoopTables tables = new HadoopTables(hadoopConf());
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
  return tables.create(schema, partitionSpec, properties, newTableLocation());
}
 
Example 12
Source Project: iceberg   Source File: TestIcebergInputFormat.java    License: Apache License 2.0
@Test
public void testProjection() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Schema projectedSchema = TypeUtil.select(SCHEMA, ImmutableSet.of(1));
  Table table = tables.create(SCHEMA, SPEC,
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> inputRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, inputRecords);
  table.newAppend()
       .appendFile(dataFile)
       .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .project(projectedSchema);
  List<Record> outputRecords = readRecords(job.getConfiguration());
  Assert.assertEquals(inputRecords.size(), outputRecords.size());
  Assert.assertEquals(projectedSchema.asStruct(), outputRecords.get(0).struct());
}
 
Example 13
Source Project: iceberg   Source File: HadoopTables.java    License: Apache License 2.0
/**
 * Loads a table from a FileSystem path location.
 *
 * @param location a path URI (e.g. hdfs:///warehouse/my_table/)
 * @return table implementation
 */
@Override
public Table load(String location) {
  TableOperations ops = newTableOps(location);
  if (ops.current() == null) {
    // try to resolve a metadata table, which we encode as URI fragments
    // e.g. hdfs:///warehouse/my_table#snapshots
    int hashIndex = location.lastIndexOf('#');
    if (hashIndex != -1 && location.length() - 1 != hashIndex) {
      // we found char '#', and it is not the last char of location
      String baseTable = location.substring(0, hashIndex);
      String metaTable = location.substring(hashIndex + 1);
      MetadataTableType type = MetadataTableType.from(metaTable);
      if (type != null) {
        return loadMetadataTable(baseTable, type);
      } else {
        throw new NoSuchTableException("Table does not exist at location: " + location);
      }
    } else {
      throw new NoSuchTableException("Table does not exist at location: " + location);
    }
  }

  return new BaseTable(ops, location);
}
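The URI-fragment handling above means a metadata table can be loaded directly through HadoopTables by appending the metadata table name after '#'. A minimal usage sketch, assuming a table already exists at the (illustrative) location from the Javadoc:

HadoopTables tables = new HadoopTables(new Configuration());
// "#snapshots" resolves the snapshots metadata table for the base table at this path.
Table snapshotsTable = tables.load("hdfs:///warehouse/my_table#snapshots");
System.out.println(snapshotsTable.schema());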
 
Example 14
Source Project: iceberg   Source File: RewriteManifestsAction.java    License: Apache License 2.0
RewriteManifestsAction(SparkSession spark, Table table) {
  this.spark = spark;
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class);
  this.table = table;
  this.spec = table.spec();
  this.targetManifestSizeBytes = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.MANIFEST_TARGET_SIZE_BYTES,
      TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT);
  this.fileIO = SparkUtil.serializableFileIO(table);

  // default the staging location to the metadata location
  TableOperations ops = ((HasTableOperations) table).operations();
  Path metadataFilePath = new Path(ops.metadataFileLocation("file"));
  this.stagingLocation = metadataFilePath.getParent().toString();

  // use the current table format version for new manifests
  this.formatVersion = ops.current().formatVersion();
}
 
Example 15
Source Project: iceberg   Source File: RowDataRewriter.java    License: Apache License 2.0
public RowDataRewriter(Table table, PartitionSpec spec, boolean caseSensitive,
                       Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager) {
  this.schema = table.schema();
  this.spec = spec;
  this.locations = table.locationProvider();
  this.properties = table.properties();
  this.io = io;
  this.encryptionManager = encryptionManager;

  this.caseSensitive = caseSensitive;
  this.nameMapping = table.properties().get(DEFAULT_NAME_MAPPING);

  String formatString = table.properties().getOrDefault(
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
  this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
}
 
Example 16
Source Project: dremio-oss   Source File: TestIcebergPartitions.java    License: Apache License 2.0
@Test
public void testNonIdentityPartitions() throws Exception {
  File root = tempDir.newFolder();
  HadoopTables tables = new HadoopTables(conf);
  PartitionSpec partitionSpec = PartitionSpec
      .builderFor(schema)
      .bucket(NAME, 2)
      .build();
  Table table = tables.create(schema, partitionSpec, root.getAbsolutePath());

  // Append some data files.
  Transaction transaction = table.newTransaction();
  AppendFiles appendFiles = transaction.newAppend();
  appendFiles.appendFile(createDataFile(root, "d1", 1, "jack", 100));
  appendFiles.appendFile(createDataFile(root, "d2", 1, "jack", 200));
  appendFiles.appendFile(createDataFile(root, "d3", 2, "jill", 300));
  appendFiles.appendFile(createDataFile(root, "d4", 2, "jill", 400));
  appendFiles.appendFile(createDataFile(root, "d5", 2, "jill", 500));
  appendFiles.commit();
  transaction.commitTransaction();

  try {
    IcebergTableInfo tableInfo = new IcebergTableWrapper(getSabotContext(),
        HadoopFileSystem.get(fs), conf, root.getAbsolutePath()).getTableInfo();
    fail("Expected error while reading metadata of iceberg table with non-identity partition field");
  } catch (Exception ex) {
    Assert.assertTrue("UserException expected", ex instanceof UserException);
    UserException uex = ((UserException) ex);
    Assert.assertEquals("Invalid ErrorType. Expected " + UserBitShared.DremioPBError.ErrorType.UNSUPPORTED_OPERATION
            + " but got " + uex.getErrorType(), UserBitShared.DremioPBError.ErrorType.UNSUPPORTED_OPERATION, uex.getErrorType());
    String expectedErrorMsg = "Column values and partition values are not same for [name] column";
    Assert.assertTrue("Expected message to contain " + expectedErrorMsg + " but was "
        + uex.getOriginalMessage() + " instead", uex.getOriginalMessage().contains(expectedErrorMsg));
  }
}
 
Example 17
Source Project: iceberg   Source File: SparkWriteBuilder.java    License: Apache License 2.0
SparkWriteBuilder(SparkSession spark, Table table, LogicalWriteInfo info) {
  this.spark = spark;
  this.table = table;
  this.writeQueryId = info.queryId();
  this.dsSchema = info.schema();
  this.options = info.options();
  this.overwriteMode = options.containsKey("overwrite-mode") ?
      options.get("overwrite-mode").toLowerCase(Locale.ROOT) : null;
}
 
Example 18
Source Project: presto   Source File: ManifestsTable.java    License: Apache License 2.0
private static List<Page> buildPages(ConnectorTableMetadata tableMetadata, Table icebergTable, Optional<Long> snapshotId)
{
    PageListBuilder pagesBuilder = PageListBuilder.forTable(tableMetadata);

    Snapshot snapshot = snapshotId.map(icebergTable::snapshot)
            .orElseGet(icebergTable::currentSnapshot);
    if (snapshot == null) {
        if (snapshotId.isPresent()) {
            throw new PrestoException(ICEBERG_INVALID_SNAPSHOT_ID, "Invalid snapshot ID: " + snapshotId.get());
        }
        throw new PrestoException(ICEBERG_INVALID_METADATA, "There's no snapshot associated with table " + tableMetadata.getTable().toString());
    }
    Map<Integer, PartitionSpec> partitionSpecsById = icebergTable.specs();

    snapshot.manifests().forEach(file -> {
        pagesBuilder.beginRow();
        pagesBuilder.appendVarchar(file.path());
        pagesBuilder.appendBigint(file.length());
        pagesBuilder.appendInteger(file.partitionSpecId());
        pagesBuilder.appendBigint(file.snapshotId());
        pagesBuilder.appendInteger(file.addedFilesCount());
        pagesBuilder.appendInteger(file.existingFilesCount());
        pagesBuilder.appendInteger(file.deletedFilesCount());
        writePartitionSummaries(pagesBuilder.nextColumn(), file.partitions(), partitionSpecsById.get(file.partitionSpecId()));
        pagesBuilder.endRow();
    });

    return pagesBuilder.build();
}
 
Example 19
Source Project: presto   Source File: IcebergUtil.java    License: Apache License 2.0
public static TableScan getTableScan(ConnectorSession session, TupleDomain<IcebergColumnHandle> predicates, Optional<Long> snapshotId, Table icebergTable)
{
    Expression expression = ExpressionConverter.toIcebergExpression(predicates);
    TableScan tableScan = icebergTable.newScan().filter(expression);
    return snapshotId
            .map(id -> isSnapshot(icebergTable, id) ? tableScan.useSnapshot(id) : tableScan.asOfTime(id))
            .orElse(tableScan);
}
 
Example 20
Source Project: iceberg   Source File: SnapshotUtil.java    License: Apache License 2.0
/**
 * @return whether ancestorSnapshotId is an ancestor of snapshotId
 */
public static boolean ancestorOf(Table table, long snapshotId, long ancestorSnapshotId) {
  Snapshot current = table.snapshot(snapshotId);
  while (current != null) {
    long id = current.snapshotId();
    if (ancestorSnapshotId == id) {
      return true;
    } else if (current.parentId() != null) {
      current = table.snapshot(current.parentId());
    } else {
      return false;
    }
  }
  return false;
}
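As a usage note, the helper is typically called with two snapshot ids from the same table's history; the sketch below uses placeholder values rather than ids from any real table:

// Sketch: true if ancestorId appears in the chain of parents starting from snapshotId.
long snapshotId = table.currentSnapshot().snapshotId();   // some snapshot of interest
long ancestorId = 1234567890L;                            // placeholder id
boolean isAncestor = SnapshotUtil.ancestorOf(table, snapshotId, ancestorId);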
 
Example 21
Source Project: iceberg   Source File: TestIcebergSourceHadoopTables.java    License: Apache License 2.0
@Override
public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec) {
  if (spec.equals(PartitionSpec.unpartitioned())) {
    return TABLES.create(schema, tableLocation);
  }
  return TABLES.create(schema, spec, tableLocation);
}
 
Example 22
Source Project: iceberg   Source File: HiveCreateReplaceTableTest.java    License: Apache License 2.0
@Test
public void testCreateOrReplaceTableTxnTableNotExists() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, SPEC, true);
  txn.updateProperties()
      .set("prop", "value")
      .commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Assert.assertEquals("Table props should match", "value", table.properties().get("prop"));
}
 
Example 23
Source Project: iceberg   Source File: TestDataFrameWrites.java    License: Apache License 2.0
@Test
public void testWriteWithCustomDataLocation() throws IOException {
  File location = createTableFolder();
  File tablePropertyDataLocation = temp.newFolder("test-table-property-data-dir");
  Table table = createTable(new Schema(SUPPORTED_PRIMITIVES.fields()), location);
  table.updateProperties().set(
      TableProperties.WRITE_NEW_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()).commit();
  writeAndValidateWithLocations(table, location, tablePropertyDataLocation);
}
 
Example 24
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public SparkTable getTable(StructType schema, Transform[] partitioning, Map<String, String> options) {
  // TODO: if partitioning is non-null, the table is being created?
  // Get Iceberg table from options
  Configuration conf = new Configuration(SparkSession.active().sparkContext().hadoopConfiguration());
  Table icebergTable = getTableAndResolveHadoopConfiguration(options, conf);

  // Build Spark table based on Iceberg table, and return it
  return new SparkTable(icebergTable, schema);
}
 
Example 25
Source Project: iceberg   Source File: TestSparkDataWrite.java    License: Apache License 2.0
@Test
public void testWriteProjection() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, null),
      new SimpleRecord(2, null),
      new SimpleRecord(3, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id").write() // select only id column
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example 26
Source Project: iceberg   Source File: SparkStreamingWrite.java    License: Apache License 2.0
SparkStreamingWrite(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                    CaseInsensitiveStringMap options, boolean truncateBatches, String queryId,
                    String applicationId, String wapId, Schema writeSchema, StructType dsSchema) {
  super(
      table, io, encryptionManager, options, false, truncateBatches, Expressions.alwaysTrue(), applicationId, wapId,
      writeSchema, dsSchema);
  this.truncateBatches = truncateBatches;
  this.queryId = queryId;
}
 
Example 27
Source Project: iceberg   Source File: TestHadoopCommits.java    License: Apache License 2.0
@Test
public void testStaleVersionHint() throws Exception {
  Table stale = TABLES.load(tableLocation);

  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      2, readVersionHint());

  Assert.assertNotEquals("Stable table schema should not match",
      UPDATED_SCHEMA.asStruct(), stale.schema().asStruct());

  // roll the version hint back to 1
  replaceVersionHint(1);

  Table reloaded = TABLES.load(tableLocation);
  Assert.assertEquals("Updated schema for newly loaded table should match",
      UPDATED_SCHEMA.asStruct(), reloaded.schema().asStruct());

  stale.refresh();
  Assert.assertEquals("Refreshed schema for stale table should match",
      UPDATED_SCHEMA.asStruct(), reloaded.schema().asStruct());
}
 
Example 28
Source Project: iceberg   Source File: TestHiveTableConcurrency.java    License: Apache License 2.0
@Test
public synchronized void testConcurrentFastAppends() {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(2));

  AtomicInteger barrier = new AtomicInteger(0);
  Tasks.range(2)
      .stopOnFailure().throwFailureWhenFinished()
      .executeWith(executorService)
      .run(index -> {
        for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) {
          while (barrier.get() < numCommittedFiles * 2) {
            try {
              Thread.sleep(10);
            } catch (InterruptedException e) {
              throw new RuntimeException(e);
            }
          }

          icebergTable.newFastAppend().appendFile(file).commit();
          barrier.incrementAndGet();
        }
      });

  icebergTable.refresh();
  Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests().size());
}
 
Example 29
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
protected Table findTable(Map<String, String> options, Configuration conf) {
  Preconditions.checkArgument(options.containsKey("path"), "Cannot open table: path is not set");
  String path = options.get("path");

  if (path.contains("/")) {
    HadoopTables tables = new HadoopTables(conf);
    return tables.load(path);
  } else {
    HiveCatalog hiveCatalog = HiveCatalogs.loadCatalog(conf);
    TableIdentifier tableIdentifier = TableIdentifier.parse(path);
    return hiveCatalog.loadTable(tableIdentifier);
  }
}
 
Example 30
Source Project: iceberg   Source File: RewriteDataFilesAction.java    License: Apache License 2.0
RewriteDataFilesAction(SparkSession spark, Table table) {
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.table = table;
  this.spec = table.spec();
  this.filter = Expressions.alwaysTrue();
  this.caseSensitive = Boolean.parseBoolean(spark.conf().get("spark.sql.caseSensitive", "false"));

  long splitSize = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.SPLIT_SIZE,
      TableProperties.SPLIT_SIZE_DEFAULT);
  long targetFileSize = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetSizeInBytes = Math.min(splitSize, targetFileSize);

  this.splitLookback = PropertyUtil.propertyAsInt(
      table.properties(),
      TableProperties.SPLIT_LOOKBACK,
      TableProperties.SPLIT_LOOKBACK_DEFAULT);
  this.splitOpenFileCost = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.SPLIT_OPEN_FILE_COST,
      TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT);

  this.fileIO = SparkUtil.serializableFileIO(table);
  this.encryptionManager = table.encryption();
}