Java Code Examples for org.apache.iceberg.PartitionSpec#unpartitioned()

The following examples show how to use org.apache.iceberg.PartitionSpec#unpartitioned(). All of them are taken from the Apache Iceberg project; the source file and license are noted above each example.
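PartitionSpec.unpartitioned() returns the spec that has no partition fields; passing it when creating a table produces an unpartitioned table. Before the project examples, here is a minimal standalone sketch contrasting it with a spec built from a schema. The schema, field ids, and printed values are illustrative and not taken from any of the files below.

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class UnpartitionedSpecSketch {
  public static void main(String[] args) {
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.IntegerType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get()));

    // the spec with no partition fields
    PartitionSpec unpartitioned = PartitionSpec.unpartitioned();
    System.out.println(unpartitioned.isUnpartitioned()); // true
    System.out.println(unpartitioned.fields().size());   // 0

    // by contrast, an identity-partitioned spec built from the schema
    PartitionSpec byData = PartitionSpec.builderFor(schema).identity("data").build();
    System.out.println(byData.isUnpartitioned());        // false
  }
}
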
Example 1
Source File: TestSnapshotSelection.java    From iceberg with Apache License 2.0
@Test(expected = IllegalArgumentException.class)
public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, tableLocation);

  List<SimpleRecord> firstBatchRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
  firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  long timestamp = System.currentTimeMillis();
  long snapshotId = table.currentSnapshot().snapshotId();
  Dataset<Row> df = spark.read()
      .format("iceberg")
      .option("snapshot-id", snapshotId)
      .option("as-of-timestamp", timestamp)
      .load(tableLocation);

  df.collectAsList();
}
 
Example 2
Source File: IcebergSourceNestedDataBenchmark.java    From iceberg with Apache License 2.0
@Override
protected final Table initTable() {
  Schema schema = new Schema(
      required(0, "id", Types.LongType.get()),
      optional(4, "nested", Types.StructType.of(
          required(1, "col1", Types.StringType.get()),
          required(2, "col2", Types.DoubleType.get()),
          required(3, "col3", Types.LongType.get())
      ))
  );
  PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
  HadoopTables tables = new HadoopTables(hadoopConf());
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
  return tables.create(schema, partitionSpec, properties, newTableLocation());
}
 
Example 3
Source File: VectorizedReadFlatParquetDataBenchmark.java    From iceberg with Apache License 2.0
@Override
protected Table initTable() {
  Schema schema = new Schema(
      optional(1, "longCol", Types.LongType.get()),
      optional(2, "intCol", Types.IntegerType.get()),
      optional(3, "floatCol", Types.FloatType.get()),
      optional(4, "doubleCol", Types.DoubleType.get()),
      optional(5, "decimalCol", Types.DecimalType.of(20, 5)),
      optional(6, "dateCol", Types.DateType.get()),
      optional(7, "timestampCol", Types.TimestampType.withZone()),
      optional(8, "stringCol", Types.StringType.get()));
  PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
  HadoopTables tables = new HadoopTables(hadoopConf());
  Map<String, String> properties = parquetWriteProps();
  return tables.create(schema, partitionSpec, properties, newTableLocation());
}
 
Example 4
Source File: TestRewriteManifestsAction.java    From iceberg with Apache License 2.0
@Test
public void testRewriteManifestsEmptyTable() throws IOException {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  Assert.assertNull("Table must be empty", table.currentSnapshot());

  Actions actions = Actions.forTable(table);

  actions.rewriteManifests()
      .rewriteIf(manifest -> true)
      .stagingLocation(temp.newFolder().toString())
      .execute();

  Assert.assertNull("Table must stay empty", table.currentSnapshot());
}
 
Example 5
Source File: HadoopTables.java    From iceberg with Apache License 2.0
/**
 * Create a table using the FileSystem implementation resolved from
 * location.
 *
 * @param schema iceberg schema used to create the table
 * @param spec partitioning spec, if null the table will be unpartitioned
 * @param properties a string map of table properties, initialized to empty if null
 * @param location a path URI (e.g. hdfs:///warehouse/my_table)
 * @return newly created table implementation
 */
@Override
public Table create(Schema schema, PartitionSpec spec, Map<String, String> properties,
                    String location) {
  Preconditions.checkNotNull(schema, "A table schema is required");

  TableOperations ops = newTableOps(location);
  if (ops.current() != null) {
    throw new AlreadyExistsException("Table already exists at location: " + location);
  }

  Map<String, String> tableProps = properties == null ? ImmutableMap.of() : properties;
  PartitionSpec partitionSpec = spec == null ? PartitionSpec.unpartitioned() : spec;
  TableMetadata metadata = TableMetadata.newTableMetadata(schema, partitionSpec, location, tableProps);
  ops.commit(null, metadata);

  return new BaseTable(ops, location);
}
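As the Javadoc above states, both spec and properties may be null; a null spec falls back to PartitionSpec.unpartitioned(). Below is a minimal usage sketch, assuming iceberg-core and Hadoop are on the classpath; the schema and warehouse path are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class CreateUnpartitionedTableSketch {
  public static void main(String[] args) {
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get()));

    HadoopTables tables = new HadoopTables(new Configuration());
    // null spec and null properties rely on the defaults documented above
    Table table = tables.create(schema, null, null, "file:///tmp/warehouse/my_table");
    System.out.println(table.spec().isUnpartitioned()); // true
  }
}
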
 
Example 6
Source File: TestSparkSchema.java    From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
              new StructField("data", DataTypes.StringType, true, Metadata.empty())
          }
      );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation)
      .select("id");

  Row[] results = (Row[]) resultDf.collect();

  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
 
Example 7
Source File: TestSparkSchema.java    From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaIsHonored() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
          }
      );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation);

  Row[] results = (Row[]) resultDf.collect();

  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
 
Example 8
Source File: TestDataSourceOptions.java    From iceberg with Apache License 2.0
@Test
public void testDefaultMetadataSplitSize() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  int splitSize = (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT; // 32MB split size

  int expectedSplits = ((int) tables.load(tableLocation + "#entries")
      .currentSnapshot().allManifests().get(0).length() + splitSize - 1) / splitSize;

  Dataset<Row> metadataDf = spark.read()
      .format("iceberg")
      .load(tableLocation + "#entries");

  int partitionNum = metadataDf.javaRDD().getNumPartitions();
  Assert.assertEquals("Spark partitions should match", expectedSplits, partitionNum);
}
 
Example 9
Source File: TestSparkDataWrite.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedOverwrite() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  // overwrite with the same data; should not produce two copies
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("overwrite")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example 10
Source File: TestSparkSchema.java    From iceberg with Apache License 2.0
@Test
public void testFailIfSparkReadSchemaIsOff() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("idd", DataTypes.IntegerType, true, Metadata.empty()) // wrong field name
          }
      );

  AssertHelpers.assertThrows("Iceberg should not allow a projection that contains unknown fields",
      java.lang.IllegalArgumentException.class, "Field idd not found in source schema",
      () ->
          spark.read()
              .schema(sparkReadSchema)
              .format("iceberg")
              .load(tableLocation)
  );
}
 
Example 11
Source File: TestDataSourceOptions.java    From iceberg with Apache License 2.0
@Test
public void testWriteFormatOptionOverridesTableProperties() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro");
  Table table = tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
    tasks.forEach(task -> {
      FileFormat fileFormat = FileFormat.fromFileName(task.file().path());
      Assert.assertEquals(FileFormat.PARQUET, fileFormat);
    });
  }
}
 
Example 12
Source File: TestSparkDataWrite.java    From iceberg with Apache License 2.0
@Test
public void testWriteProjection() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, null),
      new SimpleRecord(2, null),
      new SimpleRecord(3, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id").write() // select only id column
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example 13
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertTrue(file.nullValueCounts().isEmpty());
    Assert.assertTrue(file.valueCounts().isEmpty());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
 
Example 14
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
 
Example 15
Source File: TestSnapshotSelection.java    From iceberg with Apache License 2.0
@Test(expected = IllegalArgumentException.class)
public void testSnapshotSelectionByInvalidSnapshotId() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, tableLocation);

  Dataset<Row> df = spark.read()
      .format("iceberg")
      .option("snapshot-id", -10)
      .load(tableLocation);

  df.collectAsList();
}
 
Example 16
Source File: ResidualEvaluator.java    From iceberg with Apache License 2.0
UnpartitionedResidualEvaluator(Expression expr) {
  super(PartitionSpec.unpartitioned(), expr, false);
  this.expr = expr;
}
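For context, callers normally reach this constructor through the static factory ResidualEvaluator.unpartitioned(expr); because there is no partition data to bind, the residual is simply the original filter expression. A hedged sketch, with an illustrative filter:

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.ResidualEvaluator;

public class UnpartitionedResidualSketch {
  public static void main(String[] args) {
    Expression filter = Expressions.greaterThan("id", 5);
    ResidualEvaluator evaluator = ResidualEvaluator.unpartitioned(filter);
    // with no partition values to substitute, the residual is the filter itself
    Expression residual = evaluator.residualFor(null);
    System.out.println(residual);
  }
}
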
 
Example 17
Source File: TestRewriteDataFilesAction.java    From iceberg with Apache License 2.0
@Test
public void testRewriteDataFilesUnpartitionedTable() {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")
  );
  writeRecords(records2);

  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size());

  Actions actions = Actions.forTable(table);

  RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
  Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size());
  Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());

  table.refresh();

  CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
  List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
  Assert.assertEquals("Should have 1 data files before rewrite", 1, dataFiles1.size());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
 
Example 18
Source File: TestDataSourceOptions.java    From iceberg with Apache License 2.0
@Test
public void testMetadataSplitSizeOptionOverrideTableProperties() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  Table table = tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  // produce 1st manifest
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);
  // produce 2nd manifest
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();

  Assert.assertEquals("Must be 2 manifests", 2, manifests.size());

  // set the target metadata split size so each manifest ends up in a separate split
  table.updateProperties()
      .set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(manifests.get(0).length()))
      .commit();

  Dataset<Row> entriesDf = spark.read()
      .format("iceberg")
      .load(tableLocation + "#entries");
  Assert.assertEquals("Num partitions must match", 2, entriesDf.javaRDD().getNumPartitions());

  // override the table property using options
  entriesDf = spark.read()
      .format("iceberg")
      .option("split-size", String.valueOf(128 * 1024 * 1024))
      .load(tableLocation + "#entries");
  Assert.assertEquals("Num partitions must match", 1, entriesDf.javaRDD().getNumPartitions());
}
 
Example 19
Source File: TestSparkDataWrite.java    From iceberg with Apache License 2.0
@Test
public void testWriteProjectionWithMiddle() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Schema schema = new Schema(
      optional(1, "c1", Types.IntegerType.get()),
      optional(2, "c2", Types.StringType.get()),
      optional(3, "c3", Types.StringType.get())
  );
  Table table = tables.create(schema, spec, location.toString());

  List<ThreeColumnRecord> expected = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "hello"),
      new ThreeColumnRecord(2, null, "world"),
      new ThreeColumnRecord(3, null, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, ThreeColumnRecord.class);

  df.select("c1", "c3").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<ThreeColumnRecord> actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example 20
Source File: Spark3Util.java    From iceberg with Apache License 2.0
/**
 * Converts Spark transforms into a {@link PartitionSpec}.
 *
 * @param schema the table schema
 * @param partitioning Spark Transforms
 * @return a PartitionSpec
 */
public static PartitionSpec toPartitionSpec(Schema schema, Transform[] partitioning) {
  if (partitioning == null || partitioning.length == 0) {
    return PartitionSpec.unpartitioned();
  }

  PartitionSpec.Builder builder = PartitionSpec.builderFor(schema);
  for (Transform transform : partitioning) {
    Preconditions.checkArgument(transform.references().length == 1,
        "Cannot convert transform with more than one column reference: %s", transform);
    String colName = DOT.join(transform.references()[0].fieldNames());
    switch (transform.name()) {
      case "identity":
        builder.identity(colName);
        break;
      case "bucket":
        builder.bucket(colName, findWidth(transform));
        break;
      case "years":
        builder.year(colName);
        break;
      case "months":
        builder.month(colName);
        break;
      case "date":
      case "days":
        builder.day(colName);
        break;
      case "date_hour":
      case "hours":
        builder.hour(colName);
        break;
      case "truncate":
        builder.truncate(colName, findWidth(transform));
        break;
      default:
        throw new UnsupportedOperationException("Transform is not supported: " + transform);
    }
  }

  return builder.build();
}
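A hedged usage sketch for the conversion above, assuming Spark 3's connector expressions API and the public Spark3Util class from the Iceberg Spark module; the schema and transform choices are illustrative:

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.expressions.Transform;

public class ToPartitionSpecSketch {
  public static void main(String[] args) {
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()),
        Types.NestedField.optional(3, "category", Types.StringType.get()));

    // identity partitioning on "category" plus daily partitioning on "ts"
    Transform[] partitioning = new Transform[] {
        Expressions.identity("category"),
        Expressions.days("ts")
    };
    PartitionSpec spec = Spark3Util.toPartitionSpec(schema, partitioning);
    System.out.println(spec.fields().size()); // 2

    // an empty (or null) transform array falls back to PartitionSpec.unpartitioned()
    PartitionSpec empty = Spark3Util.toPartitionSpec(schema, new Transform[0]);
    System.out.println(empty.isUnpartitioned()); // true
  }
}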