org.apache.iceberg.Table Java Examples

The following examples show how to use org.apache.iceberg.Table. You can go to the original project or source file by following the links above each example.
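Before the examples, here is a minimal sketch of the two steps most of them share: creating and loading a Table through HadoopTables. The schema and the warehouse location below are illustrative assumptions, not taken from any of the projects referenced on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class TableQuickstart {
  public static void main(String[] args) {
    // a hypothetical two-column schema
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get()));

    HadoopTables tables = new HadoopTables(new Configuration());

    // create an unpartitioned table at a hypothetical warehouse location
    Table created = tables.create(schema, PartitionSpec.unpartitioned(), "/tmp/warehouse/iceberg_example_table");

    // load the same table back from its location
    Table loaded = tables.load("/tmp/warehouse/iceberg_example_table");
    System.out.println(loaded.schema());
  }
}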
Example #1
Source File: TestSnapshotSelection.java    From iceberg with Apache License 2.0
@Test(expected = IllegalArgumentException.class)
public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, tableLocation);

  List<SimpleRecord> firstBatchRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
  firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  long timestamp = System.currentTimeMillis();
  long snapshotId = table.currentSnapshot().snapshotId();
  Dataset<Row> df = spark.read()
      .format("iceberg")
      .option("snapshot-id", snapshotId)
      .option("as-of-timestamp", timestamp)
      .load(tableLocation);

  df.collectAsList();
}
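The test above verifies that passing both snapshot-id and as-of-timestamp together is rejected. As a sketch of the two valid forms, written as hypothetical helper methods that take the values the test already computes (one option at a time):

private Dataset<Row> readBySnapshotId(SparkSession spark, String tableLocation, long snapshotId) {
  // valid: pin the read to a specific snapshot id
  return spark.read()
      .format("iceberg")
      .option("snapshot-id", snapshotId)
      .load(tableLocation);
}

private Dataset<Row> readAsOfTimestamp(SparkSession spark, String tableLocation, long timestampMillis) {
  // valid: read the snapshot that was current at the given epoch millis
  return spark.read()
      .format("iceberg")
      .option("as-of-timestamp", timestampMillis)
      .load(tableLocation);
}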
 
Example #2
Source File: TestHadoopCommits.java    From iceberg with Apache License 2.0
@Test
public void testStaleMetadata() throws Exception {
  Table tableCopy = TABLES.load(tableLocation);

  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  // prepare changes on the copy without committing
  UpdateSchema updateCopy = tableCopy.updateSchema()
      .addColumn("m", Types.IntegerType.get());
  updateCopy.apply();

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertNotEquals("Unmodified copy should be out of date after update",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  // update the table
  tableCopy.refresh();

  Assert.assertEquals("Copy should be back in sync",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  AssertHelpers.assertThrows("Should fail with stale base metadata",
      CommitFailedException.class, "based on stale table metadata", updateCopy::commit);

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
 
Example #3
Source File: RewriteManifestsAction.java    From iceberg with Apache License 2.0
RewriteManifestsAction(SparkSession spark, Table table) {
  this.spark = spark;
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class);
  this.table = table;
  this.spec = table.spec();
  this.targetManifestSizeBytes = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.MANIFEST_TARGET_SIZE_BYTES,
      TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT);
  this.fileIO = SparkUtil.serializableFileIO(table);

  // default the staging location to the metadata location
  TableOperations ops = ((HasTableOperations) table).operations();
  Path metadataFilePath = new Path(ops.metadataFileLocation("file"));
  this.stagingLocation = metadataFilePath.getParent().toString();

  // use the current table format version for new manifests
  this.formatVersion = ops.current().formatVersion();
}
 
Example #4
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table byId = TABLES.create(SCHEMA, spec, location.toString());

  // Do not combine or split files because the tests expect a split per partition.
  // A target split size of 2048 helps us achieve that.
  byId.updateProperties().set("read.split.target-size", "2048").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(byId.location());

  return location;
}
 
Example #5
Source File: IcebergSplitManager.java    From presto with Apache License 2.0
@Override
public ConnectorSplitSource getSplits(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorTableHandle handle, SplitSchedulingStrategy splitSchedulingStrategy)
{
    IcebergTableHandle table = (IcebergTableHandle) handle;

    HiveMetastore metastore = transactionManager.get(transaction).getMetastore();
    Table icebergTable = getIcebergTable(metastore, hdfsEnvironment, session, table.getSchemaTableName());

    TableScan tableScan = getTableScan(session, table.getPredicate(), table.getSnapshotId(), icebergTable);

    // TODO Use residual. Right now there is no way to propagate residual to presto but at least we can
    //      propagate it at split level so the parquet pushdown can leverage it.
    IcebergSplitSource splitSource = new IcebergSplitSource(tableScan.planTasks());

    return new ClassLoaderSafeConnectorSplitSource(splitSource, Thread.currentThread().getContextClassLoader());
}
 
Example #6
Source File: RowDataRewriter.java    From iceberg with Apache License 2.0
public RowDataRewriter(Table table, PartitionSpec spec, boolean caseSensitive,
                       Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager) {
  this.schema = table.schema();
  this.spec = spec;
  this.locations = table.locationProvider();
  this.properties = table.properties();
  this.io = io;
  this.encryptionManager = encryptionManager;

  this.caseSensitive = caseSensitive;
  this.nameMapping = table.properties().get(DEFAULT_NAME_MAPPING);

  String formatString = table.properties().getOrDefault(
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
  this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
}
 
Example #7
Source File: SnapshotsTable.java    From presto with Apache License 2.0
private static List<Page> buildPages(ConnectorTableMetadata tableMetadata, ConnectorSession session, Table icebergTable)
{
    PageListBuilder pagesBuilder = PageListBuilder.forTable(tableMetadata);

    TimeZoneKey timeZoneKey = session.getTimeZoneKey();
    icebergTable.snapshots().forEach(snapshot -> {
        pagesBuilder.beginRow();
        pagesBuilder.appendTimestamp(packDateTimeWithZone(snapshot.timestampMillis(), timeZoneKey));
        pagesBuilder.appendBigint(snapshot.snapshotId());
        if (checkNonNull(snapshot.parentId(), pagesBuilder)) {
            pagesBuilder.appendBigint(snapshot.parentId());
        }
        if (checkNonNull(snapshot.operation(), pagesBuilder)) {
            pagesBuilder.appendVarchar(snapshot.operation());
        }
        if (checkNonNull(snapshot.manifestListLocation(), pagesBuilder)) {
            pagesBuilder.appendVarchar(snapshot.manifestListLocation());
        }
        if (checkNonNull(snapshot.summary(), pagesBuilder)) {
            pagesBuilder.appendVarcharVarcharMap(snapshot.summary());
        }
        pagesBuilder.endRow();
    });

    return pagesBuilder.build();
}
 
Example #8
Source File: HadoopTables.java    From iceberg with Apache License 2.0
/**
 * Loads a table from a FileSystem path location.
 *
 * @param location a path URI (e.g. hdfs:///warehouse/my_table/)
 * @return table implementation
 */
@Override
public Table load(String location) {
  TableOperations ops = newTableOps(location);
  if (ops.current() == null) {
    // try to resolve a metadata table, which we encode as URI fragments
    // e.g. hdfs:///warehouse/my_table#snapshots
    int hashIndex = location.lastIndexOf('#');
    if (hashIndex != -1 && location.length() - 1 != hashIndex) {
      // we found char '#', and it is not the last char of location
      String baseTable = location.substring(0, hashIndex);
      String metaTable = location.substring(hashIndex + 1);
      MetadataTableType type = MetadataTableType.from(metaTable);
      if (type != null) {
        return loadMetadataTable(baseTable, type);
      } else {
        throw new NoSuchTableException("Table does not exist at location: " + location);
      }
    } else {
      throw new NoSuchTableException("Table does not exist at location: " + location);
    }
  }

  return new BaseTable(ops, location);
}
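A caller-side sketch of the fragment convention handled above; the warehouse path is hypothetical:

HadoopTables tables = new HadoopTables(new Configuration());

// plain location: loads the base table
Table table = tables.load("hdfs:///warehouse/my_table");

// location with a '#' fragment naming a metadata table type: loads that metadata table
Table snapshots = tables.load("hdfs:///warehouse/my_table#snapshots");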
 
Example #9
Source File: TestIcebergInputFormat.java    From iceberg with Apache License 2.0
@Test
public void testProjection() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Schema projectedSchema = TypeUtil.select(SCHEMA, ImmutableSet.of(1));
  Table table = tables.create(SCHEMA, SPEC,
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> inputRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, inputRecords);
  table.newAppend()
       .appendFile(dataFile)
       .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .project(projectedSchema);
  List<Record> outputRecords = readRecords(job.getConfiguration());
  Assert.assertEquals(inputRecords.size(), outputRecords.size());
  Assert.assertEquals(projectedSchema.asStruct(), outputRecords.get(0).struct());
}
 
Example #10
Source File: HiveCreateReplaceTableTest.java    From iceberg with Apache License 2.0
@Test
public void testReplaceTableTxnTableModifiedConcurrently() {
  Table table = catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());
  Assert.assertTrue("Table should exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, SPEC, false);

  // update the table concurrently
  table.updateProperties()
      .set("another-prop", "another-value")
      .commit();

  txn.updateProperties()
      .set("prop", "value")
      .commit();
  txn.commitTransaction();

  // the replace should still succeed
  table = catalog.loadTable(TABLE_IDENTIFIER);
  Assert.assertNull("Concurrently set prop should be discarded by the replace", table.properties().get("another-prop"));
  Assert.assertEquals("Table props should match", "value", table.properties().get("prop"));
}
 
Example #11
Source File: IcebergSourceFlatDataBenchmark.java    From iceberg with Apache License 2.0
@Override
protected final Table initTable() {
  Schema schema = new Schema(
      required(1, "longCol", Types.LongType.get()),
      required(2, "intCol", Types.IntegerType.get()),
      required(3, "floatCol", Types.FloatType.get()),
      optional(4, "doubleCol", Types.DoubleType.get()),
      optional(5, "decimalCol", Types.DecimalType.of(20, 5)),
      optional(6, "dateCol", Types.DateType.get()),
      optional(7, "timestampCol", Types.TimestampType.withZone()),
      optional(8, "stringCol", Types.StringType.get()));
  PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
  HadoopTables tables = new HadoopTables(hadoopConf());
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
  return tables.create(schema, partitionSpec, properties, newTableLocation());
}
 
Example #12
Source File: TestIcebergInputFormat.java    From iceberg with Apache License 2.0
@Test
public void testSnapshotReads() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(),
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  table.newAppend()
       .appendFile(writeFile(table, null, format, expectedRecords))
       .commit();
  long snapshotId = table.currentSnapshot().snapshotId();
  table.newAppend()
       .appendFile(writeFile(table, null, format, RandomGenericData.generate(table.schema(), 1, 0L)))
       .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .snapshotId(snapshotId);

  validate(job, expectedRecords);
}
 
Example #13
Source File: TestHadoopCatalog.java    From iceberg with Apache License 2.0
@Test
public void testCreateAndDropTableWithoutNamespace() throws Exception {
  Configuration conf = new Configuration();
  String warehousePath = temp.newFolder().getAbsolutePath();
  HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath);

  TableIdentifier testTable = TableIdentifier.of("tbl");
  Table table = catalog.createTable(testTable, SCHEMA, PartitionSpec.unpartitioned());

  Assert.assertEquals(table.schema().toString(), TABLE_SCHEMA.toString());
  Assert.assertEquals("hadoop.tbl", table.toString());
  String metaLocation = catalog.defaultWarehouseLocation(testTable);

  FileSystem fs = Util.getFs(new Path(metaLocation), conf);
  Assert.assertTrue(fs.isDirectory(new Path(metaLocation)));

  catalog.dropTable(testTable);
  Assert.assertFalse(fs.isDirectory(new Path(metaLocation)));
}
 
Example #14
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                               DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
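Because createWriter maps SaveMode.Overwrite to a replace-partitions write, a caller-side sketch (with the df dataset and tableLocation assumed from context) would look like:

df.write()
    .format("iceberg")
    .mode(SaveMode.Overwrite)  // becomes replacePartitions = true in createWriter above
    .save(tableLocation);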
 
Example #15
Source File: IcebergStorage.java    From iceberg with Apache License 2.0
private Table load(String location, Job job) throws IOException {
  if (iceberg == null) {
    Class<?> tablesImpl = job.getConfiguration().getClass(PIG_ICEBERG_TABLES_IMPL, HadoopTables.class);
    LOG.info("Initializing iceberg tables implementation: {}", tablesImpl);
    iceberg = (Tables) ReflectionUtils.newInstance(tablesImpl, job.getConfiguration());
  }

  Table result = tables.get(location);

  if (result == null) {
    try {
      LOG.info("[{}]: Loading table for location: {}", signature, location);
      result = iceberg.load(location);
      tables.put(location, result);
    } catch (Exception e) {
      throw new FrontendException("Failed to load table for location: " + location, e);
    }
  }

  return result;
}
 
Example #16
Source File: HiveTableTest.java    From iceberg with Apache License 2.0
@Test
public void testExistingTableUpdate() throws TException {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
  // add a column
  icebergTable.updateSchema().addColumn("data", Types.LongType.get()).commit();

  icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  // only 2 metadata version files should exist and no manifests should exist
  Assert.assertEquals(2, metadataVersionFiles(TABLE_NAME).size());
  Assert.assertEquals(0, manifestFiles(TABLE_NAME).size());
  Assert.assertEquals(altered.asStruct(), icebergTable.schema().asStruct());

  final org.apache.hadoop.hive.metastore.api.Table table = metastoreClient.getTable(DB_NAME, TABLE_NAME);
  final List<String> hiveColumns = table.getSd().getCols().stream()
      .map(FieldSchema::getName)
      .collect(Collectors.toList());
  final List<String> icebergColumns = altered.columns().stream()
      .map(Types.NestedField::name)
      .collect(Collectors.toList());
  Assert.assertEquals(icebergColumns, hiveColumns);
}
 
Example #17
Source File: TestIcebergCTASWithPartition.java    From dremio-oss with Apache License 2.0
private void verifyPartitionValue(String tableFolder, Class expectedClass, Object expectedValue) {
  Table table = new HadoopTables(new Configuration()).load(tableFolder);
  for (FileScanTask fileScanTask : table.newScan().planFiles()) {
    StructLike structLike = fileScanTask.file().partition();
    Assert.assertEquals(structLike.get(0, expectedClass), expectedValue);
  }
}
 
Example #18
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testPartitionedByIdStartsWith() {
  Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id");

  CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of(
      "path", table.location())
  );

  SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

  pushFilters(builder, new StringStartsWith("data", "junc"));
  Batch scan = builder.build().toBatch();

  Assert.assertEquals(1, scan.planInputPartitions().length);
}
 
Example #19
Source File: SparkTable.java    From iceberg with Apache License 2.0
public SparkTable(Table icebergTable, StructType requestedSchema) {
  this.icebergTable = icebergTable;
  this.requestedSchema = requestedSchema;

  if (requestedSchema != null) {
    // convert the requested schema to throw an exception if any requested fields are unknown
    SparkSchemaUtil.convert(icebergTable.schema(), requestedSchema);
  }
}
 
Example #20
Source File: SchemaEvolutionTest.java    From iceberg with Apache License 2.0
@Test
public void widenDecimalPrecision() throws IOException {
  // Set up a new table to test this conversion
  Schema schema = new Schema(optional(1, "decimal", Types.DecimalType.of(2, 2)));
  File location = Files.createTempDirectory("temp").toFile();
  HadoopTables tables = new HadoopTables(spark.sparkContext().hadoopConfiguration());
  Table decimalTable = tables.create(schema, location.toString());

  decimalTable.updateSchema().updateColumn("decimal", Types.DecimalType.of(4, 2)).commit();

  log.info("Widen decimal type:\n" + decimalTable.schema().toString());
}
 
Example #21
Source File: TestRemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
@Test
public void testOlderThanTimestamp() throws InterruptedException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(
      new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")
  );
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);

  df.select("c1", "c2", "c3")
      .write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");

  // sleep so the files written above are strictly older than the timestamp,
  // and the file written below is strictly newer
  Thread.sleep(1000);

  long timestamp = System.currentTimeMillis();

  Thread.sleep(1000);

  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");

  Actions actions = Actions.forTable(table);

  List<String> result = actions.removeOrphanFiles()
      .olderThan(timestamp)
      .execute();

  Assert.assertEquals("Should delete only 2 files", 2, result.size());
}
 
Example #22
Source File: TestRewriteDataFilesAction.java    From iceberg with Apache License 2.0
@Test
public void testRewriteDataFilesEmptyTable() {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  Assert.assertNull("Table must be empty", table.currentSnapshot());

  Actions actions = Actions.forTable(table);

  actions.rewriteDataFiles().execute();

  Assert.assertNull("Table must stay empty", table.currentSnapshot());
}
 
Example #23
Source File: TestSparkTableUtilWithInMemoryCatalog.java    From iceberg with Apache License 2.0
@Test
public void testImportPartitions() throws IOException {
  Table table = TABLES.create(SCHEMA, SPEC, tableLocation);

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();

  try {
    Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
    inputDF.select("id", "data").write()
        .format("parquet")
        .mode("append")
        .option("path", parquetTableLocation)
        .partitionBy("data")
        .saveAsTable("parquet_table");

    File stagingDir = temp.newFolder("staging-dir");
    List<SparkPartition> partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
    SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());

    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));

    List<SimpleRecord> actualRecords = spark.read()
        .format("iceberg")
        .load(tableLocation)
        .orderBy("id")
        .as(Encoders.bean(SimpleRecord.class))
        .collectAsList();

    Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
 
Example #24
Source File: IcebergMetastoreTables.java    From metacat with Apache License 2.0
@Override
public Table createTable(final TableIdentifier identifier,
                         final Schema schema,
                         final PartitionSpec spec,
                         final String location,
                         final Map<String, String> properties) {
    throw new MetacatNotSupportedException("not supported");
}
 
Example #25
Source File: IcebergSource.java    From iceberg with Apache License 2.0
private Table getTableAndResolveHadoopConfiguration(
    DataSourceOptions options, Configuration conf) {
  // Overwrite configurations from the Spark Context with configurations from the options.
  mergeIcebergHadoopConfs(conf, options.asMap());
  Table table = findTable(options, conf);
  // Set confs from table properties
  mergeIcebergHadoopConfs(conf, table.properties());
  // Re-apply the options so they override any values that came from table properties.
  mergeIcebergHadoopConfs(conf, options.asMap());
  return table;
}
 
Example #26
Source File: TestIcebergPartitionData.java    From dremio-oss with Apache License 2.0
private void verifyPartitionValue(PartitionSpec partitionSpec, IcebergPartitionData partitionData,
                                  String columnName, Class expectedClass, Object expectedValue) throws Exception {
  File tableFolder = new File(folder.getRoot(), "icebergPartitionTest");
  try {
    tableFolder.mkdir();
    File dataFile = new File(folder.getRoot(), "a.parquet");

    dataFile.createNewFile();

    DataFile d1 = DataFiles.builder(partitionSpec)
      .withInputFile(Files.localInput(dataFile))
      .withRecordCount(50)
      .withFormat(FileFormat.PARQUET)
      .withPartition(partitionData)
      .build();

    IcebergOpCommitter committer = IcebergOperation.getCreateTableCommitter(Path.of(tableFolder.toPath().toString()),
      (new SchemaConverter()).fromIceberg(schema), Lists.newArrayList(columnName), new Configuration());
    committer.consumeData(Lists.newArrayList(d1));
    committer.commit();


    Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
    for (FileScanTask fileScanTask : table.newScan().planFiles()) {
      StructLike structLike = fileScanTask.file().partition();
      if (expectedClass == ByteBuffer.class) {
        Assert.assertEquals(structLike.get(0, expectedClass).hashCode(), ByteBuffer.wrap((byte[])expectedValue).hashCode());
      } else {
        Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
      }
    }
  } finally {
    tableFolder.delete();
  }
}
 
Example #27
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testPartitionedByDataStartsWithFilter() {
  Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");
  CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));

  SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

  pushFilters(builder, new StringStartsWith("data", "junc"));
  Batch scan = builder.build().toBatch();

  Assert.assertEquals(1, scan.planInputPartitions().length);
}
 
Example #28
Source File: RewriteDataFilesAction.java    From iceberg with Apache License 2.0
RewriteDataFilesAction(SparkSession spark, Table table) {
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.table = table;
  this.spec = table.spec();
  this.filter = Expressions.alwaysTrue();
  this.caseSensitive = Boolean.parseBoolean(spark.conf().get("spark.sql.caseSensitive", "false"));

  long splitSize = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.SPLIT_SIZE,
      TableProperties.SPLIT_SIZE_DEFAULT);
  long targetFileSize = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetSizeInBytes = Math.min(splitSize, targetFileSize);

  this.splitLookback = PropertyUtil.propertyAsInt(
      table.properties(),
      TableProperties.SPLIT_LOOKBACK,
      TableProperties.SPLIT_LOOKBACK_DEFAULT);
  this.splitOpenFileCost = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.SPLIT_OPEN_FILE_COST,
      TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT);

  this.fileIO = SparkUtil.serializableFileIO(table);
  this.encryptionManager = table.encryption();
}
 
Example #29
Source File: IcebergSource.java    From iceberg with Apache License 2.0
protected Table findTable(Map<String, String> options, Configuration conf) {
  Preconditions.checkArgument(options.containsKey("path"), "Cannot open table: path is not set");
  String path = options.get("path");

  if (path.contains("/")) {
    HadoopTables tables = new HadoopTables(conf);
    return tables.load(path);
  } else {
    HiveCatalog hiveCatalog = HiveCatalogs.loadCatalog(conf);
    TableIdentifier tableIdentifier = TableIdentifier.parse(path);
    return hiveCatalog.loadTable(tableIdentifier);
  }
}
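A sketch of how the two branches above are reached; the paths and option maps are illustrative, and findTable refers to the method just shown:

Configuration conf = new Configuration();

// contains a '/', so it is resolved as a filesystem location via HadoopTables
Table fsTable = findTable(ImmutableMap.of("path", "hdfs:///warehouse/logs_table"), conf);

// no '/', so it is parsed as a TableIdentifier and resolved through the Hive catalog
Table hiveTable = findTable(ImmutableMap.of("path", "db.logs_table"), conf);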
 
Example #30
Source File: TestIcebergInputFormat.java    From iceberg with Apache License 2.0
@Test
public void testFailedResidualFiltering() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 2, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  expectedRecords.get(1).set(2, "2020-03-20");

  DataFile dataFile1 = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
      .appendFile(dataFile1)
      .commit();

  Job jobShouldFail1 = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(jobShouldFail1);
  configBuilder.useHiveRows().readFrom(location.toString())
      .filter(Expressions.and(
          Expressions.equal("date", "2020-03-20"),
          Expressions.equal("id", 0)));
  AssertHelpers.assertThrows(
      "Residuals are not evaluated today for Iceberg Generics In memory model of HIVE",
      UnsupportedOperationException.class, "Filter expression ref(name=\"id\") == 0 is not completely satisfied.",
      () -> validate(jobShouldFail1, expectedRecords));

  Job jobShouldFail2 = Job.getInstance(conf);
  configBuilder = IcebergInputFormat.configure(jobShouldFail2);
  configBuilder.usePigTuples().readFrom(location.toString())
      .filter(Expressions.and(
          Expressions.equal("date", "2020-03-20"),
          Expressions.equal("id", 0)));
  AssertHelpers.assertThrows(
      "Residuals are not evaluated today for Iceberg Generics In memory model of PIG",
      UnsupportedOperationException.class, "Filter expression ref(name=\"id\") == 0 is not completely satisfied.",
      () -> validate(jobShouldFail2, expectedRecords));
}