org.apache.spark.sql.sources.v2.DataSourceOptions Java Examples

The following examples show how to use org.apache.spark.sql.sources.v2.DataSourceOptions, the immutable, case-insensitive string-to-string options map that Spark 2.x passes to Data Source V2 read and write implementations. Each example lists its original project and source file.
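Before the project examples, here is a minimal, self-contained sketch of the DataSourceOptions API itself: it wraps a case-insensitive string map and exposes Optional-based lookups plus typed getters with defaults. The option keys and values below are illustrative.

import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

import org.apache.spark.sql.sources.v2.DataSourceOptions;

public class DataSourceOptionsDemo {
  public static void main(String[] args) {
    Map<String, String> raw = new HashMap<>();
    raw.put("path", "/tmp/example");
    raw.put("port", "9090");

    // DataSourceOptions copies the map and matches keys case-insensitively.
    DataSourceOptions options = new DataSourceOptions(raw);

    Optional<String> path = options.get("PATH");            // present: keys are case-insensitive
    String table = options.get("table").orElse("unknown");  // absent key -> fallback value
    int port = options.getInt("port", -1);                  // typed getter, parses "9090"
    long batchSize = options.getLong("batch-size", 1024L);  // absent key -> default 1024
    boolean verbose = options.getBoolean("verbose", false);

    System.out.println(path.orElse("<none>") + " " + table + " " + port
        + " " + batchSize + " " + verbose);
  }
}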
Example #1
Source File: HiveStreamingDataSource.java    From spark-llap with Apache License 2.0
private HiveStreamingDataSourceWriter createDataSourceWriter(final String id, final StructType schema,
  final DataSourceOptions options) {
  String dbName;
  if (options.get("default.db").isPresent()) {
    dbName = options.get("default.db").get();
  } else {
    dbName = options.get("database").orElse("default");
  }
  String tableName = options.get("table").orElse(null);
  String partition = options.get("partition").orElse(null);
  List<String> partitionValues = partition == null ? null : Arrays.asList(partition.split(","));
  String metastoreUri = options.get("metastoreUri").orElse("thrift://localhost:9083");
  String metastoreKerberosPrincipal = options.get("metastoreKrbPrincipal").orElse(null);
  LOG.info("OPTIONS - database: {} table: {} partition: {} metastoreUri: {} metastoreKerberosPrincipal: {}",
    dbName, tableName, partition, metastoreUri, metastoreKerberosPrincipal);
  return new HiveStreamingDataSourceWriter(id, schema, dbName, tableName,
    partitionValues, metastoreUri, metastoreKerberosPrincipal);
}
 
Example #2
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);
  if (readSchema != null) {
    // convert() will fail if readSchema contains fields not in table.schema()
    SparkSchemaUtil.convert(table.schema(), readSchema);
    reader.pruneColumns(readSchema);
  }

  return reader;
}
 
Example #3
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                               DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
 
Example #4
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public StreamWriter createStreamWriter(String runId, StructType dsStruct,
                                       OutputMode mode, DataSourceOptions options) {
  Preconditions.checkArgument(
      mode == OutputMode.Append() || mode == OutputMode.Complete(),
      "Output mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  // Spark 2.4.x passes runId to createStreamWriter instead of real queryId,
  // so we fetch it directly from sparkContext to make writes idempotent
  String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY());
  String appId = lazySparkSession().sparkContext().applicationId();

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new StreamingWriter(table, io, encryptionManager, options, queryId, mode, appId, writeSchema, dsStruct);
}
 
Example #5
Source File: Writer.java    From iceberg with Apache License 2.0
Writer(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
       DataSourceOptions options, boolean replacePartitions, String applicationId, String wapId,
       Schema writeSchema, StructType dsSchema) {
  this.table = table;
  this.format = getFileFormat(table.properties(), options);
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.replacePartitions = replacePartitions;
  this.applicationId = applicationId;
  this.wapId = wapId;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;

  long tableTargetFileSize = PropertyUtil.propertyAsLong(
      table.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetFileSize = options.getLong("target-file-size-bytes", tableTargetFileSize);
}
 
Example #6
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example #7
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));

  List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
 
Example #8
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testInFilterForTimestamp() {
  File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new In("ts", new Timestamp[]{
      new Timestamp(instant("2017-12-22T00:00:00.123+00:00") / 1000),
      new Timestamp(instant("2017-12-22T09:20:44.294+00:00") / 1000),
      new Timestamp(instant("2017-12-22T00:34:00.184+00:00") / 1000),
      new Timestamp(instant("2017-12-21T15:15:16.230+00:00") / 1000),
      null
  }));

  Assert.assertEquals("Should create 1 task for 2017-12-21: 15", 1, reader.planInputPartitions().size());
}
 
Example #9
Source File: SparkBigQueryConfigTest.java    From spark-bigquery-connector with Apache License 2.0
@Test
public void testDefaults() {
    Configuration hadoopConfiguration = new Configuration();
    DataSourceOptions options = new DataSourceOptions(defaultOptions);
    SparkBigQueryConfig config = SparkBigQueryConfig.from(options, ImmutableMap.of(), hadoopConfiguration, 10);
    assertThat(config.getTableId()).isEqualTo(TableId.of("dataset", "table"));
    assertThat(config.getFilter()).isEqualTo(Optional.empty());
    assertThat(config.getSchema()).isEqualTo(Optional.empty());
    assertThat(config.getMaxParallelism()).isEqualTo(OptionalInt.empty());
    assertThat(config.getTemporaryGcsBucket()).isEqualTo(Optional.empty());
    assertThat(config.getIntermediateFormat()).isEqualTo(SparkBigQueryConfig.DEFAULT_INTERMEDIATE_FORMAT);
    assertThat(config.getReadDataFormat()).isEqualTo(SparkBigQueryConfig.DEFAULT_READ_DATA_FORMAT);
    assertThat(config.getMaterializationProject()).isEqualTo(Optional.empty());
    assertThat(config.getMaterializationDataset()).isEqualTo(Optional.empty());
    assertThat(config.getPartitionField()).isEqualTo(Optional.empty());
    assertThat(config.getPartitionExpirationMs()).isEqualTo(OptionalLong.empty());
    assertThat(config.getPartitionRequireFilter()).isEqualTo(Optional.empty());
    assertThat(config.getPartitionType()).isEqualTo(Optional.empty());
    assertThat(config.getClusteredFields()).isEqualTo(Optional.empty());
    assertThat(config.getCreateDisposition()).isEqualTo(Optional.empty());
    assertThat(config.getLoadSchemaUpdateOptions()).isEqualTo(ImmutableList.of());
    assertThat(config.getViewExpirationTimeInHours()).isEqualTo(24);
    assertThat(config.getMaxReadRowsRetries()).isEqualTo(3);
}
 
Example #10
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

  List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
 
Example #11
Source File: HiveStreamingDataSource.java    From spark-llap with Apache License 2.0
private HiveStreamingDataSourceWriter createDataSourceWriter(final String id, final StructType schema,
  final DataSourceOptions options) {
  String dbName;
  if (options.get("default.db").isPresent()) {
    dbName = options.get("default.db").get();
  } else {
    dbName = options.get("database").orElse("default");
  }
  String tableName = options.get("table").orElse(null);
  String partition = options.get("partition").orElse(null);
  List<String> partitionValues = partition == null ? null : Arrays.asList(partition.split(","));
  String metastoreUri = options.get("metastoreUri").orElse("thrift://localhost:9083");
  String commitIntervalRows = options.get("commitIntervalRows").orElse("" + DEFAULT_COMMIT_INTERVAL_ROWS);
  long commitInterval = Long.parseLong(commitIntervalRows);
  String metastoreKrbPrincipal = options.get("metastoreKrbPrincipal").orElse(null);
  LOG.info("OPTIONS - database: {} table: {} partition: {} commitIntervalRows: {} metastoreUri: {} " +
      "metastoreKrbPrincipal: {}", dbName, tableName, partition, commitInterval,
    metastoreUri, metastoreKrbPrincipal);
  return new HiveStreamingDataSourceWriter(id, schema, commitInterval, dbName, tableName,
    partitionValues, metastoreUri, metastoreKrbPrincipal);
}
 
Example #12
Source File: DatasetSourceBatch.java    From beam with Apache License 2.0
@SuppressWarnings("unchecked")
private DatasetReader(DataSourceOptions options) {
  if (!options.get(BEAM_SOURCE_OPTION).isPresent()) {
    throw new RuntimeException("Beam source was not set in DataSource options");
  }
  this.source =
      Base64Serializer.deserializeUnchecked(
          options.get(BEAM_SOURCE_OPTION).get(), BoundedSource.class);

  if (!options.get(DEFAULT_PARALLELISM).isPresent()) {
    throw new RuntimeException("Spark default parallelism was not set in DataSource options");
  }
  this.numPartitions = Integer.parseInt(options.get(DEFAULT_PARALLELISM).get());
  checkArgument(numPartitions > 0, "Number of partitions must be greater than zero.");

  if (!options.get(PIPELINE_OPTIONS).isPresent()) {
    throw new RuntimeException("Beam pipelineOptions were not set in DataSource options");
  }
  this.serializablePipelineOptions =
      new SerializablePipelineOptions(options.get(PIPELINE_OPTIONS).get());
}
 
Example #13
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example #14
Source File: DatasetSourceStreaming.java    From beam with Apache License 2.0
@SuppressWarnings("unchecked")
private DatasetMicroBatchReader(String checkpointLocation, DataSourceOptions options) {
  if (!options.get(BEAM_SOURCE_OPTION).isPresent()) {
    throw new RuntimeException("Beam source was not set in DataSource options");
  }
  this.source =
      Base64Serializer.deserializeUnchecked(
          options.get(BEAM_SOURCE_OPTION).get(), UnboundedSource.class);

  if (!options.get(DEFAULT_PARALLELISM).isPresent()) {
    throw new RuntimeException("Spark default parallelism was not set in DataSource options");
  }
  this.numPartitions = Integer.parseInt(options.get(DEFAULT_PARALLELISM).get());
  checkArgument(numPartitions > 0, "Number of partitions must be greater than zero.");

  if (!options.get(PIPELINE_OPTIONS).isPresent()) {
    throw new RuntimeException("Beam pipelineOptions were not set in DataSource options");
  }
  this.serializablePipelineOptions =
      new SerializablePipelineOptions(options.get(PIPELINE_OPTIONS).get());
}
 
Example #15
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i),
          read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}
 
Example #16
Source File: SparkBigQueryConfig.java    From spark-bigquery-connector with Apache License 2.0
private static Optional<String> getAnyOption(
        ImmutableMap<String, String> globalOptions,
        DataSourceOptions options,
        Collection<String> names) {
    return names.stream()
            .map(name -> getAnyOption(globalOptions, options, name))
            .filter(optional -> optional.isPresent())
            .findFirst()
            .orElse(Optional.empty());
}
 
Example #17
Source File: FlexibleRowDataSource.java    From spark-data-sources with MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options options supplied by Spark, including the ExampleDB host, port, and table name
 * @return a DataSourceReader configured from those options
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    return new Reader(host, port, table);
}
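For context, the options this method receives are populated from the Spark side when a user reads through the source. A minimal usage sketch follows; the fully-qualified class name, host, and port are illustrative assumptions:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ExampleDbReadDemo {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("ExampleDbReadDemo")
        .getOrCreate();

    // Each .option(...) call becomes an entry in the DataSourceOptions map that
    // Spark passes to createReader(DataSourceOptions) above.
    Dataset<Row> df = spark.read()
        .format("com.example.sources.FlexibleRowDataSource") // hypothetical fully-qualified source class
        .option("host", "localhost")
        .option("port", "9099")      // illustrative ExampleDB port
        .option("table", "myTable")
        .load();

    df.show();
    spark.stop();
  }
}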
 
Example #18
Source File: ParallelRowDataSource.java    From spark-data-sources with MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options options supplied by Spark, including the ExampleDB host, port, table name, and partition count
 * @return a DataSourceReader configured from those options
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
    return new Reader(host, port, table, partitions);
}
 
Example #19
Source File: BigQueryDataSourceV2.java    From spark-bigquery-connector with Apache License 2.0
@Override
public DataSourceReader createReader(StructType schema, DataSourceOptions options) {
    SparkSession spark = getDefaultSparkSessionOrCreate();

    Injector injector = Guice.createInjector(
            new BigQueryClientModule(),
            new SparkBigQueryConnectorModule(spark, options, Optional.ofNullable(schema)));

    BigQueryDataSourceReader reader = injector.getInstance(BigQueryDataSourceReader.class);
    return reader;
}
 
Example #20
Source File: HiveWarehouseConnector.java    From spark-llap with Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType schema,
    SaveMode mode, DataSourceOptions options) {
  Map<String, String> params = getOptions(options);
  String stagingDirPrefix = HWConf.LOAD_STAGING_DIR.getFromOptionsMap(params);
  Path path = new Path(stagingDirPrefix);
  Configuration conf = SparkSession.getActiveSession().get().sparkContext().hadoopConfiguration();
  return Optional.of(getDataSourceWriter(jobId, schema, path, params, conf));
}
 
Example #21
Source File: HiveWarehouseConnector.java    From spark-llap with Apache License 2.0
@Override public DataSourceReader createReader(DataSourceOptions options) {
  try {
    return getDataSourceReader(getOptions(options));
  } catch (IOException e) {
    LOG.error("Error creating {}", getClass().getName());
    LOG.error(ExceptionUtils.getStackTrace(e));
    throw new RuntimeException(e);
  }
}
 
Example #22
Source File: IcebergSource.java    From iceberg with Apache License 2.0
protected Table findTable(DataSourceOptions options, Configuration conf) {
  Optional<String> path = options.get("path");
  Preconditions.checkArgument(path.isPresent(), "Cannot open table: path is not set");

  if (path.get().contains("/")) {
    HadoopTables tables = new HadoopTables(conf);
    return tables.load(path.get());
  } else {
    HiveCatalog hiveCatalog = HiveCatalogs.loadCatalog(conf);
    TableIdentifier tableIdentifier = TableIdentifier.parse(path.get());
    return hiveCatalog.loadTable(tableIdentifier);
  }
}
 
Example #23
Source File: IcebergSource.java    From iceberg with Apache License 2.0
private Table getTableAndResolveHadoopConfiguration(
    DataSourceOptions options, Configuration conf) {
  // Overwrite configurations from the Spark Context with configurations from the options.
  mergeIcebergHadoopConfs(conf, options.asMap());
  Table table = findTable(options, conf);
  // Set confs from table properties
  mergeIcebergHadoopConfs(conf, table.properties());
  // Re-overwrite values set in options and table properties but were not in the environment.
  mergeIcebergHadoopConfs(conf, options.asMap());
  return table;
}
 
Example #24
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, unfiltered.planInputPartitions().size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
  }
}
 
Example #25
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testInFilter() {
  File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new In("data", new String[]{"foo", "junction", "brush", null}));

  Assert.assertEquals(2, reader.planInputPartitions().size());
}
 
Example #26
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, planTasks(unfiltered).size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
  }
}
 
Example #27
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testPartitionedByDataStartsWithFilter() {
  File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new StringStartsWith("data", "junc"));

  Assert.assertEquals(1, reader.planInputPartitions().size());
}
 
Example #28
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testPartitionedByIdStartsWith() {
  File location = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new StringStartsWith("data", "junc"));

  Assert.assertEquals(1, reader.planInputPartitions().size());
}
 
Example #29
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dfStruct, SaveMode mode,
                                                 DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append, "Save mode %s is not supported", mode);

  Table table = findTable(options);

  Schema dfSchema = SparkSchemaUtil.convert(table.schema(), dfStruct);
  List<String> errors = CheckCompatibility.writeCompatibilityErrors(table.schema(), dfSchema);
  if (!errors.isEmpty()) {
    StringBuilder sb = new StringBuilder();
    sb.append("Cannot write incompatible dataframe to table with schema:\n")
        .append(table.schema()).append("\nProblems:");
    for (String error : errors) {
      sb.append("\n* ").append(error);
    }
    throw new IllegalArgumentException(sb.toString());
  }

  Optional<String> formatOption = options.get("iceberg.write.format");
  FileFormat format;
  if (formatOption.isPresent()) {
    format = FileFormat.valueOf(formatOption.get().toUpperCase(Locale.ENGLISH));
  } else {
    format = FileFormat.valueOf(table.properties()
        .getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT)
        .toUpperCase(Locale.ENGLISH));
  }

  return Optional.of(new Writer(table, lazyConf(), format));
}
 
Example #30
Source File: IcebergSource.java    From iceberg with Apache License 2.0
protected Table findTable(DataSourceOptions options) {
  Optional<String> location = options.get("path");
  Preconditions.checkArgument(location.isPresent(),
      "Cannot open table without a location: path is not set");

  HadoopTables tables = new HadoopTables(lazyConf());

  return tables.load(location.get());
}