Java Code Examples for org.apache.spark.sql.sources.v2.reader.DataSourceReader

The following examples show how to use org.apache.spark.sql.sources.v2.reader.DataSourceReader. These examples are extracted from open source projects; the originating project, source file, and license are noted above each example where known.
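For orientation: DataSourceReader is the entry point of Spark's DataSource V2 read path (the org.apache.spark.sql.sources.v2 API of Spark 2.3/2.4, removed in Spark 3.0). Spark obtains a reader from a DataSourceV2 source via createReader(...), asks it for the scan schema through readSchema(), and asks it to split the scan into work units through planInputPartitions() (Spark 2.4) or the older createDataReaderFactories()/DataReaderFactory API (Spark 2.3); both variants appear in the examples below. The following is a minimal, hypothetical sketch against the Spark 2.4 interface; the CountingReader and CountingPartition names are ours, not taken from any project cited here.

import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
import org.apache.spark.sql.sources.v2.reader.InputPartition;
import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Hypothetical minimal reader: emits the integers 0..4 from one partition.
public class CountingReader implements DataSourceReader {

  @Override
  public StructType readSchema() {
    // The schema Spark uses to interpret the rows this reader produces.
    return new StructType().add("value", DataTypes.IntegerType);
  }

  @Override
  public List<InputPartition<InternalRow>> planInputPartitions() {
    // One InputPartition per Spark task; a real source would return one
    // per file split, bucket, shard, and so on.
    return Collections.singletonList(new CountingPartition(5));
  }

  static class CountingPartition implements InputPartition<InternalRow> {
    private final int count;

    CountingPartition(int count) {
      this.count = count;
    }

    @Override
    public InputPartitionReader<InternalRow> createPartitionReader() {
      // Called on an executor after the partition is deserialized.
      return new InputPartitionReader<InternalRow>() {
        private int current = -1;

        @Override
        public boolean next() {
          current += 1;
          return current < count;
        }

        @Override
        public InternalRow get() {
          return new GenericInternalRow(new Object[] {current});
        }

        @Override
        public void close() throws IOException {
          // No resources to release in this sketch.
        }
      };
    }
  }
}

Each InputPartition becomes one Spark task, which is why several of the tests below assert on the size of the planned partition list; optional mix-in interfaces such as SupportsPushDownFilters let Spark hand filters to the reader before partitions are planned.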
Example 1
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);
  if (readSchema != null) {
    // convert() will fail if readSchema contains fields not in table.schema()
    SparkSchemaUtil.convert(table.schema(), readSchema);
    reader.pruneColumns(readSchema);
  }

  return reader;
}
 
Example 2
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example 3
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));

  List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
 
Example 4
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testInFilterForTimestamp() {
  File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new In("ts", new Timestamp[]{
      new Timestamp(instant("2017-12-22T00:00:00.123+00:00") / 1000),
      new Timestamp(instant("2017-12-22T09:20:44.294+00:00") / 1000),
      new Timestamp(instant("2017-12-22T00:34:00.184+00:00") / 1000),
      new Timestamp(instant("2017-12-21T15:15:16.230+00:00") / 1000),
      null
  }));

  Assert.assertEquals("Should create 1 task for 2017-12-21: 15", 1, reader.planInputPartitions().size());
}
 
Example 5
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example 6
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

  List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
 
Example 7
@Override
public DataSourceReader createReader(StructType schema, DataSourceOptions options) {
    SparkSession spark = getDefaultSparkSessionOrCreate();

    Injector injector = Guice.createInjector(
            new BigQueryClientModule(),
            new SparkBigQueryConnectorModule(spark, options, Optional.ofNullable(schema)));

    BigQueryDataSourceReader reader = injector.getInstance(BigQueryDataSourceReader.class);
    return reader;
}
 
Example 8
Source Project: flight-spark-source   Source File: DefaultSource.java    License: Apache License 2.0
public DataSourceReader createReader(DataSourceOptions dataSourceOptions) {
  Location defaultLocation = Location.forGrpcInsecure(
    dataSourceOptions.get("host").orElse("localhost"),
    dataSourceOptions.getInt("port", 47470)
  );
  String sql = dataSourceOptions.get("path").orElse("");
  FlightDataSourceReader.FactoryOptions options = new FlightDataSourceReader.FactoryOptions(
    defaultLocation,
    sql,
    dataSourceOptions.get("username").orElse("anonymous"),
    dataSourceOptions.get("password").orElse(null),
    dataSourceOptions.getBoolean("parallel", false), null);
  Broadcast<FlightDataSourceReader.FactoryOptions> bOptions = lazySparkContext().broadcast(options);
  return new FlightDataSourceReader(bOptions);
}
 
Example 9
Source Project: spark-data-sources   Source File: SimpleRowDataSource.java    License: MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    return new Reader(host, port);
}
 
Example 10
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
    return new Reader(host, port, table, partitions);
}
 
Example 11
Source Project: spark-data-sources   Source File: FlexibleRowDataSource.java    License: MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    return new Reader(host, port, table);
}
 
Example 12
Source Project: spark-data-sources   Source File: ParallelRowDataSource.java    License: MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
    return new Reader(host, port, table, partitions);
}
 
Example 13
Source Project: spark-data-sources   Source File: PartitioningRowDataSource.java    License: MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
    return new Reader(host, port, table, partitions);
}
 
Example 14
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i),
          read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}
 
Example 15
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, unfiltered.planInputPartitions().size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
  }
}
 
Example 16
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testInFilter() {
  File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new In("data", new String[]{"foo", "junction", "brush", null}));

  Assert.assertEquals(2, reader.planInputPartitions().size());
}
 
Example 17
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testPartitionedByDataStartsWithFilter() {
  File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new StringStartsWith("data", "junc"));

  Assert.assertEquals(1, reader.planInputPartitions().size());
}
 
Example 18
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testPartitionedByIdStartsWith() {
  File location = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new StringStartsWith("data", "junc"));

  Assert.assertEquals(1, reader.planInputPartitions().size());
}
 
Example 19
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, planTasks(unfiltered).size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
  }
}
 
Example 20
Source Project: spark-llap   Source File: HiveWarehouseConnector.java    License: Apache License 2.0
@Override public DataSourceReader createReader(DataSourceOptions options) {
  try {
    return getDataSourceReader(getOptions(options));
  } catch (IOException e) {
    LOG.error("Error creating {}", getClass().getName());
    LOG.error(ExceptionUtils.getStackTrace(e));
    throw new RuntimeException(e);
  }
}
 
Example 21
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return createReader(null, options);
}
 
Example 22
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
private void pushFilters(DataSourceReader reader, Filter... filters) {
  Assert.assertTrue(reader instanceof SupportsPushDownFilters);
  SupportsPushDownFilters filterable = (SupportsPushDownFilters) reader;
  filterable.pushFilters(filters);
}
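The helper above exercises only the consumer side of the pushdown contract. On the implementation side, a reader opts in by also implementing SupportsPushDownFilters. Here is a minimal, hypothetical sketch; FilteringCountingReader builds on the CountingReader sketch near the top of this page and is not from any project cited here.

import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.sources.v2.reader.SupportsPushDownFilters;

// Hypothetical reader that accepts pushed-down filters.
public class FilteringCountingReader extends CountingReader implements SupportsPushDownFilters {

  private Filter[] pushed = new Filter[0];

  @Override
  public Filter[] pushFilters(Filter[] filters) {
    this.pushed = filters;
    // Return the filters the source cannot evaluate so that Spark
    // re-applies them after the scan; returning none, as here, is only
    // safe if the source really does apply every pushed filter.
    return new Filter[0];
  }

  @Override
  public Filter[] pushedFilters() {
    // Reported back to Spark, e.g. for the physical plan description.
    return pushed;
  }
}

A real reader would consult the pushed filters in planInputPartitions() to prune work, which is exactly what the assertions on tasks.size() in the tests above verify.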
 
Example 23
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  Table table = findTable(options);
  return new Reader(table, lazyConf());
}
 
Example 24
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
private void pushFilters(DataSourceReader reader,
                         Expression... expressions) {
  Assert.assertTrue(reader instanceof SupportsPushDownCatalystFilters);
  SupportsPushDownCatalystFilters filterable = (SupportsPushDownCatalystFilters) reader;
  filterable.pushCatalystFilters(expressions);
}
 
Example 25
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
private List<DataReaderFactory<UnsafeRow>> planTasks(DataSourceReader reader) {
  Assert.assertTrue(reader instanceof SupportsScanUnsafeRow);
  SupportsScanUnsafeRow unsafeReader = (SupportsScanUnsafeRow) reader;
  return unsafeReader.createUnsafeRowReaderFactories();
}
 
Example 26
Source Project: beam   Source File: DatasetSourceBatch.java    License: Apache License 2.0
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return new DatasetReader<>(options);
}
 
Example 27
Source Project: spark-llap   Source File: HiveWarehouseConnector.java    License: Apache License 2.0
protected DataSourceReader getDataSourceReader(Map<String, String> params) throws IOException {
  return new HiveWarehouseDataSourceReader(params);
}
 
Example 28
Source Project: spark-llap   Source File: SimpleMockConnector.java    License: Apache License 2.0
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    return new SimpleMockDataSourceReader();
}
 
Example 29
Source Project: spark-llap   Source File: MockHiveWarehouseConnector.java    License: Apache License 2.0
@Override
protected DataSourceReader getDataSourceReader(Map<String, String> params) throws IOException {
  return new MockHiveWarehouseDataSourceReader(params);
}