Java Code Examples for org.apache.spark.sql.sources.v2.DataSourceOptions

The following examples show how to use org.apache.spark.sql.sources.v2.DataSourceOptions. These examples are extracted from open source projects; the source project, source file, and license are noted above each example where available.
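
Before the project-specific examples, here is a minimal, self-contained sketch of how a DataSourceOptions instance is typically built from a plain string map and queried through its typed accessors. It is not taken from any of the projects below, and the option keys and values are purely illustrative; DataSourceOptions treats keys case-insensitively.

import com.google.common.collect.ImmutableMap;
import org.apache.spark.sql.sources.v2.DataSourceOptions;

public class DataSourceOptionsBasics {
    public static void main(String[] args) {
        // Option keys and values are plain strings; keys are matched case-insensitively.
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
            "path", "/tmp/example",   // illustrative values only
            "port", "47470",
            "parallel", "true"));

        String path = options.get("path").orElse("");              // Optional<String> lookup
        int port = options.getInt("port", -1);                     // typed lookup with default
        boolean parallel = options.getBoolean("parallel", false);  // typed lookup with default
        System.out.println(path + ":" + port + " parallel=" + parallel);
    }
}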
Example 1
@Test
public void testDefaults() {
    Configuration hadoopConfiguration = new Configuration();
    DataSourceOptions options = new DataSourceOptions(defaultOptions);
    SparkBigQueryConfig config = SparkBigQueryConfig.from(options, ImmutableMap.of(), hadoopConfiguration, 10);
    assertThat(config.getTableId()).isEqualTo(TableId.of("dataset", "table"));
    assertThat(config.getFilter()).isEqualTo(Optional.empty());
    assertThat(config.getSchema()).isEqualTo(Optional.empty());
    assertThat(config.getMaxParallelism()).isEqualTo(OptionalInt.empty());
    assertThat(config.getTemporaryGcsBucket()).isEqualTo(Optional.empty());
    assertThat(config.getIntermediateFormat()).isEqualTo(SparkBigQueryConfig.DEFAULT_INTERMEDIATE_FORMAT);
    assertThat(config.getReadDataFormat()).isEqualTo(SparkBigQueryConfig.DEFAULT_READ_DATA_FORMAT);
    assertThat(config.getMaterializationProject()).isEqualTo(Optional.empty());
    assertThat(config.getMaterializationDataset()).isEqualTo(Optional.empty());
    assertThat(config.getPartitionField()).isEqualTo(Optional.empty());
    assertThat(config.getPartitionExpirationMs()).isEqualTo(OptionalLong.empty());
    assertThat(config.getPartitionRequireFilter()).isEqualTo(Optional.empty());
    assertThat(config.getPartitionType()).isEqualTo(Optional.empty());
    assertThat(config.getClusteredFields()).isEqualTo(Optional.empty());
    assertThat(config.getCreateDisposition()).isEqualTo(Optional.empty());
    assertThat(config.getLoadSchemaUpdateOptions()).isEqualTo(ImmutableList.of());
    assertThat(config.getViewExpirationTimeInHours()).isEqualTo(24);
    assertThat(config.getMaxReadRowsRetries()).isEqualTo(3);
}
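
The defaultOptions map referenced above is defined elsewhere in the test class and is not shown in this snippet. A purely hypothetical definition that would be consistent with the getTableId() assertion (the key name and value format are assumptions) is:

// Hypothetical fixture, not shown in the original test: a minimal options map
// that would make SparkBigQueryConfig resolve TableId.of("dataset", "table").
private static final ImmutableMap<String, String> defaultOptions =
    ImmutableMap.of("table", "dataset.table");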
 
Example 2
Source Project: iceberg   Source File: Writer.java    License: Apache License 2.0
Writer(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
       DataSourceOptions options, boolean replacePartitions, String applicationId, String wapId,
       Schema writeSchema, StructType dsSchema) {
  this.table = table;
  this.format = getFileFormat(table.properties(), options);
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.replacePartitions = replacePartitions;
  this.applicationId = applicationId;
  this.wapId = wapId;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;

  long tableTargetFileSize = PropertyUtil.propertyAsLong(
      table.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetFileSize = options.getLong("target-file-size-bytes", tableTargetFileSize);
}
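
The last line lets a per-write option override the table-level WRITE_TARGET_FILE_SIZE_BYTES property. A hedged usage sketch of how a caller could supply that option through the DataFrame writer (the df and tableLocation variables are assumptions):

// Sketch: overriding the table's target file size for a single write.
df.write()
    .format("iceberg")
    .option("target-file-size-bytes", "536870912")  // read via options.getLong(...) above
    .mode("append")
    .save(tableLocation);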
 
Example 3
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);
  if (readSchema != null) {
    // convert() will fail if readSchema contains fields not in table.schema()
    SparkSchemaUtil.convert(table.schema(), readSchema);
    reader.pruneColumns(readSchema);
  }

  return reader;
}
 
Example 4
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                               DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
 
Example 5
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public StreamWriter createStreamWriter(String runId, StructType dsStruct,
                                       OutputMode mode, DataSourceOptions options) {
  Preconditions.checkArgument(
      mode == OutputMode.Append() || mode == OutputMode.Complete(),
      "Output mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  // Spark 2.4.x passes runId to createStreamWriter instead of real queryId,
  // so we fetch it directly from sparkContext to make writes idempotent
  String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY());
  String appId = lazySparkSession().sparkContext().applicationId();

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new StreamingWriter(table, io, encryptionManager, options, queryId, mode, appId, writeSchema, dsStruct);
}
 
Example 6
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example 7
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));

  List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
 
Example 8
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testInFilterForTimestamp() {
  File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new In("ts", new Timestamp[]{
      new Timestamp(instant("2017-12-22T00:00:00.123+00:00") / 1000),
      new Timestamp(instant("2017-12-22T09:20:44.294+00:00") / 1000),
      new Timestamp(instant("2017-12-22T00:34:00.184+00:00") / 1000),
      new Timestamp(instant("2017-12-21T15:15:16.230+00:00") / 1000),
      null
  }));

  Assert.assertEquals("Should create 1 task for 2017-12-21: 15", 1, reader.planInputPartitions().size());
}
 
Example 9
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example 10
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

  List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
 
Example 11
Source Project: beam   Source File: DatasetSourceBatch.java    License: Apache License 2.0
@SuppressWarnings("unchecked")
private DatasetReader(DataSourceOptions options) {
  if (!options.get(BEAM_SOURCE_OPTION).isPresent()) {
    throw new RuntimeException("Beam source was not set in DataSource options");
  }
  this.source =
      Base64Serializer.deserializeUnchecked(
          options.get(BEAM_SOURCE_OPTION).get(), BoundedSource.class);

  if (!options.get(DEFAULT_PARALLELISM).isPresent()) {
    throw new RuntimeException("Spark default parallelism was not set in DataSource options");
  }
  this.numPartitions = Integer.parseInt(options.get(DEFAULT_PARALLELISM).get());
  checkArgument(numPartitions > 0, "Number of partitions must be greater than zero.");

  if (!options.get(PIPELINE_OPTIONS).isPresent()) {
    throw new RuntimeException("Beam pipelineOptions were not set in DataSource options");
  }
  this.serializablePipelineOptions =
      new SerializablePipelineOptions(options.get(PIPELINE_OPTIONS).get());
}
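
Each required option is checked with the same isPresent()/get() pattern. A compact equivalent (a sketch only, not Beam's code) expresses the same requirement with Optional.orElseThrow:

// Sketch: a reusable check for options that must be present.
private static String requireOption(DataSourceOptions options, String key) {
  return options.get(key)
      .orElseThrow(() -> new RuntimeException(key + " was not set in DataSource options"));
}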
 
Example 12
Source Project: beam   Source File: DatasetSourceStreaming.java    License: Apache License 2.0
@SuppressWarnings("unchecked")
private DatasetMicroBatchReader(String checkpointLocation, DataSourceOptions options) {
  if (!options.get(BEAM_SOURCE_OPTION).isPresent()) {
    throw new RuntimeException("Beam source was not set in DataSource options");
  }
  this.source =
      Base64Serializer.deserializeUnchecked(
          options.get(BEAM_SOURCE_OPTION).get(), UnboundedSource.class);

  if (!options.get(DEFAULT_PARALLELISM).isPresent()) {
    throw new RuntimeException("Spark default parallelism was not set in DataSource options");
  }
  this.numPartitions = Integer.parseInt(options.get(DEFAULT_PARALLELISM).get());
  checkArgument(numPartitions > 0, "Number of partitions must be greater than zero.");

  if (!options.get(PIPELINE_OPTIONS).isPresent()) {
    throw new RuntimeException("Beam pipelineOptions were not set in DataSource options");
  }
  this.serializablePipelineOptions =
      new SerializablePipelineOptions(options.get(PIPELINE_OPTIONS).get());
}
 
Example 13
Source Project: spark-llap   Source File: HiveStreamingDataSource.java    License: Apache License 2.0
private HiveStreamingDataSourceWriter createDataSourceWriter(final String id, final StructType schema,
  final DataSourceOptions options) {
  String dbName;
  if(options.get("default.db").isPresent()) {
    dbName = options.get("default.db").get();
  } else {
    dbName = options.get("database").orElse("default");
  }
  String tableName = options.get("table").orElse(null);
  String partition = options.get("partition").orElse(null);
  List<String> partitionValues = partition == null ? null : Arrays.asList(partition.split(","));
  String metastoreUri = options.get("metastoreUri").orElse("thrift://localhost:9083");
  String commitIntervalRows = options.get("commitIntervalRows").orElse("" + DEFAULT_COMMIT_INTERVAL_ROWS);
  long commitInterval = Long.parseLong(commitIntervalRows);
  String metastoreKrbPrincipal = options.get("metastoreKrbPrincipal").orElse(null);
  LOG.info("OPTIONS - database: {} table: {} partition: {} commitIntervalRows: {} metastoreUri: {} " +
      "metastoreKrbPrincipal: {}", dbName, tableName, partition, commitInterval,
    metastoreUri, metastoreKrbPrincipal);
  return new HiveStreamingDataSourceWriter(id, schema, commitInterval, dbName, tableName,
    partitionValues, metastoreUri, metastoreKrbPrincipal);
}
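
A hedged sketch of how a structured streaming query could supply the options consumed above (the Dataset df, checkpoint location, table name, and metastore URI are placeholders; the option keys match the ones read in the method):

// Sketch: passing the options read by createDataSourceWriter from a streaming query.
df.writeStream()
    .format(HiveStreamingDataSource.class.getName())
    .option("database", "default")
    .option("table", "events")
    .option("metastoreUri", "thrift://metastore-host:9083")
    .option("commitIntervalRows", "10000")
    .option("checkpointLocation", "/tmp/checkpoints/events")
    .start();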
 
Example 14
Source Project: spark-llap   Source File: HiveStreamingDataSource.java    License: Apache License 2.0
private HiveStreamingDataSourceWriter createDataSourceWriter(final String id, final StructType schema,
  final DataSourceOptions options) {
  String dbName;
  if (options.get("default.db").isPresent()) {
    dbName = options.get("default.db").get();
  } else {
    dbName = options.get("database").orElse("default");
  }
  String tableName = options.get("table").orElse(null);
  String partition = options.get("partition").orElse(null);
  List<String> partitionValues = partition == null ? null : Arrays.asList(partition.split(","));
  String metastoreUri = options.get("metastoreUri").orElse("thrift://localhost:9083");
  String metastoreKerberosPrincipal = options.get("metastoreKrbPrincipal").orElse(null);
  LOG.info("OPTIONS - database: {} table: {} partition: {} metastoreUri: {} metastoreKerberosPrincipal: {}",
    dbName, tableName, partition, metastoreUri, metastoreKerberosPrincipal);
  return new HiveStreamingDataSourceWriter(id, schema, dbName, tableName,
    partitionValues, metastoreUri, metastoreKerberosPrincipal);
}
 
Example 15
public SparkBigQueryConnectorModule(
        SparkSession spark,
        DataSourceOptions options,
        Optional<StructType> schema) {
    this.spark = spark;
    this.options = options;
    this.schema = schema;
}
 
Example 16
@Override
public DataSourceReader createReader(StructType schema, DataSourceOptions options) {
    SparkSession spark = getDefaultSparkSessionOrCreate();

    Injector injector = Guice.createInjector(
            new BigQueryClientModule(),
            new SparkBigQueryConnectorModule(spark, options, Optional.ofNullable(schema)));

    BigQueryDataSourceReader reader = injector.getInstance(BigQueryDataSourceReader.class);
    return reader;
}
 
Example 17
private static Optional<String> getOptionFromMultipleParams(
        DataSourceOptions options,
        Collection<String> names,
        Supplier<Optional<String>> fallback) {
    return names.stream().map(name -> getOption(options, name))
            .filter(Optional::isPresent)
            .findFirst()
            .orElseGet(fallback);
}
 
Example 18
private static Optional<String> getAnyOption(
        ImmutableMap<String, String> globalOptions,
        DataSourceOptions options,
        Collection<String> names) {
    return names.stream()
            .map(name -> getAnyOption(globalOptions, options, name))
            .filter(optional -> optional.isPresent())
            .findFirst()
            .orElse(Optional.empty());
}
 
Example 19
private static boolean getAnyBooleanOption(ImmutableMap<String, String> globalOptions,
                                           DataSourceOptions options,
                                           String name,
                                           boolean defaultValue) {
    return getAnyOption(globalOptions, options, name)
            .map(Boolean::valueOf)
            .orElse(defaultValue);
}
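
Together, these helpers resolve a value that may appear under several alias keys, fall back to globally configured options, and finally fall back to a default. A hedged usage sketch (the option names here are illustrative, not necessarily the connector's actual keys):

// Illustrative only: look up a value under either of two hypothetical aliases,
// then a boolean flag with a default.
Optional<String> bucket = getAnyOption(globalOptions, options,
    ImmutableList.of("temporaryGcsBucket", "gcsBucket"));
boolean pushAllFilters = getAnyBooleanOption(globalOptions, options, "pushAllFilters", true);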
 
Example 20
Source Project: flight-spark-source   Source File: DefaultSource.java    License: Apache License 2.0
public DataSourceReader createReader(DataSourceOptions dataSourceOptions) {
  Location defaultLocation = Location.forGrpcInsecure(
    dataSourceOptions.get("host").orElse("localhost"),
    dataSourceOptions.getInt("port", 47470)
  );
  String sql = dataSourceOptions.get("path").orElse("");
  FlightDataSourceReader.FactoryOptions options = new FlightDataSourceReader.FactoryOptions(
    defaultLocation,
    sql,
    dataSourceOptions.get("username").orElse("anonymous"),
    dataSourceOptions.get("password").orElse(null),
    dataSourceOptions.getBoolean("parallel", false), null);
  Broadcast<FlightDataSourceReader.FactoryOptions> bOptions = lazySparkContext().broadcast(options);
  return new FlightDataSourceReader(bOptions);
}
 
Example 21
Source Project: spark-data-sources   Source File: SimpleRowDataSource.java    License: MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening from the supplied options.
 * @param options data source options supplied by Spark
 * @return a DataSourceReader configured from the supplied options
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    return new Reader(host, port);
}
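
A hedged sketch of the corresponding read call (the SparkSession spark is assumed; the host and port values are placeholders, and the format is referenced through the data source class itself):

// Sketch: supplying the host/port options consumed by createReader above.
Dataset<Row> rows = spark.read()
    .format(SimpleRowDataSource.class.getName())
    .option("host", "localhost")
    .option("port", "9000")   // parsed by options.getInt("port", -1)
    .load();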
 
Example 22
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name and partition count, from the supplied options.
 * @param options data source options supplied by Spark
 * @return a DataSourceReader configured from the supplied options
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
    return new Reader(host, port, table, partitions);
}
 
Example 23
Source Project: spark-data-sources   Source File: FlexibleRowDataSource.java    License: MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options data source options supplied by Spark
 * @return a DataSourceReader configured from the supplied options
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    return new Reader(host, port, table);
}
 
Example 24
Source Project: spark-data-sources   Source File: ParallelRowDataSource.java    License: MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name and partition count, from the supplied options.
 * @param options data source options supplied by Spark
 * @return a DataSourceReader configured from the supplied options
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
    return new Reader(host, port, table, partitions);
}
 
Example 25
Source Project: spark-data-sources   Source File: PartitioningRowDataSource.java    License: MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name and partition count, from the supplied options.
 * @param options data source options supplied by Spark
 * @return a DataSourceReader configured from the supplied options
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
    return new Reader(host, port, table, partitions);
}
 
Example 26
Source Project: iceberg   Source File: StreamingWriter.java    License: Apache License 2.0
StreamingWriter(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                DataSourceOptions options, String queryId, OutputMode mode, String applicationId,
                Schema writeSchema, StructType dsSchema) {
  super(table, io, encryptionManager, options, false, applicationId, writeSchema, dsSchema);
  this.queryId = queryId;
  this.mode = mode;
}
 
Example 27
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
protected Table findTable(DataSourceOptions options, Configuration conf) {
  Optional<String> path = options.get("path");
  Preconditions.checkArgument(path.isPresent(), "Cannot open table: path is not set");

  if (path.get().contains("/")) {
    HadoopTables tables = new HadoopTables(conf);
    return tables.load(path.get());
  } else {
    HiveCatalog hiveCatalog = HiveCatalogs.loadCatalog(conf);
    TableIdentifier tableIdentifier = TableIdentifier.parse(path.get());
    return hiveCatalog.loadTable(tableIdentifier);
  }
}
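
A path that contains a '/' is loaded as a Hadoop table location, while anything else is parsed as a Hive table identifier. A hedged sketch of the two call styles (the SparkSession, locations, and table names are placeholders):

// Sketch: a slash in "path" selects HadoopTables ...
Dataset<Row> byLocation = spark.read().format("iceberg")
    .option("path", "hdfs://nn:8020/warehouse/db/events")
    .load();

// ... while a dotted name is parsed as a TableIdentifier and resolved via the Hive catalog.
Dataset<Row> byName = spark.read().format("iceberg")
    .option("path", "db.events")
    .load();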
 
Example 28
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
private Table getTableAndResolveHadoopConfiguration(
    DataSourceOptions options, Configuration conf) {
  // Overwrite configurations from the Spark Context with configurations from the options.
  mergeIcebergHadoopConfs(conf, options.asMap());
  Table table = findTable(options, conf);
  // Set confs from table properties
  mergeIcebergHadoopConfs(conf, table.properties());
  // Re-apply the options so they take precedence over values that came from table properties.
  mergeIcebergHadoopConfs(conf, options.asMap());
  return table;
}
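
mergeIcebergHadoopConfs is not shown here. A plausible implementation sketch, consistent with how it is called above but not necessarily the project's actual code (the "hadoop." key prefix is an assumption), would copy prefixed options into the Configuration:

// Hypothetical sketch: copy options of the form "hadoop.<key>" into the Hadoop conf.
private static void mergeIcebergHadoopConfs(Configuration baseConf, Map<String, String> options) {
  options.keySet().stream()
      .filter(key -> key.startsWith("hadoop."))
      .forEach(key -> baseConf.set(key.substring("hadoop.".length()), options.get(key)));
}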
 
Example 29
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i),
          read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}
 
Example 30
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, unfiltered.planInputPartitions().size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
  }
}