org.apache.spark.sql.sources.v2.reader.DataSourceReader Java Examples

The following examples show how to use org.apache.spark.sql.sources.v2.reader.DataSourceReader. Each example is taken from an open-source project; the source file and license are noted above the code.
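
DataSourceReader is the entry point of the Spark DataSourceV2 read path: Spark asks it for the schema via readSchema() and for the units of work to distribute via planInputPartitions(). In Spark 2.3 the same hooks were named createDataReaderFactories() and DataReaderFactory, which is why some of the Iceberg examples below work with DataReaderFactory&lt;UnsafeRow&gt; while others use InputPartition&lt;InternalRow&gt;. Before the project-specific examples, here is a minimal, self-contained sketch against the Spark 2.4 form of the interface; the class and column names are illustrative only and do not come from any of the projects below.

import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
import org.apache.spark.sql.sources.v2.reader.InputPartition;
import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Toy reader exposing a single "id" column with the values 0..4.
public class SingleColumnReader implements DataSourceReader {

  @Override
  public StructType readSchema() {
    return new StructType().add("id", DataTypes.IntegerType);
  }

  @Override
  public List<InputPartition<InternalRow>> planInputPartitions() {
    // One partition is enough for this tiny data set; a real source would
    // return one InputPartition per file split, bucket, etc.
    return Collections.singletonList(new SingleColumnPartition());
  }

  static class SingleColumnPartition implements InputPartition<InternalRow> {
    @Override
    public InputPartitionReader<InternalRow> createPartitionReader() {
      return new InputPartitionReader<InternalRow>() {
        private int current = -1;

        @Override
        public boolean next() {
          current += 1;
          return current < 5;
        }

        @Override
        public InternalRow get() {
          return new GenericInternalRow(new Object[] {current});
        }

        @Override
        public void close() {
          // nothing to release
        }
      };
    }
  }
}

Each InputPartition is serialized and shipped to an executor, which calls createPartitionReader() and drives the reader with next()/get(); the driver only ever sees the list returned by planInputPartitions().
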
Example #1
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

  List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
 
Example #2
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example #3
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testInFilterForTimestamp() {
  File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new In("ts", new Timestamp[]{
      new Timestamp(instant("2017-12-22T00:00:00.123+00:00") / 1000),
      new Timestamp(instant("2017-12-22T09:20:44.294+00:00") / 1000),
      new Timestamp(instant("2017-12-22T00:34:00.184+00:00") / 1000),
      new Timestamp(instant("2017-12-21T15:15:16.230+00:00") / 1000),
      null
  }));

  Assert.assertEquals("Should create 1 task for 2017-12-21: 15", 1, reader.planInputPartitions().size());
}
 
Example #4
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);
  if (readSchema != null) {
    // convert() will fail if readSchema contains fields not in table.schema()
    SparkSchemaUtil.convert(table.schema(), readSchema);
    reader.pruneColumns(readSchema);
  }

  return reader;
}
 
Example #5
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example #6
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));

  List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
 
Example #7
Source File: BigQueryDataSourceV2.java    From spark-bigquery-connector with Apache License 2.0
@Override
public DataSourceReader createReader(StructType schema, DataSourceOptions options) {
    SparkSession spark = getDefaultSparkSessionOrCreate();

    Injector injector = Guice.createInjector(
            new BigQueryClientModule(),
            new SparkBigQueryConnectorModule(spark, options, Optional.ofNullable(schema)));

    BigQueryDataSourceReader reader = injector.getInstance(BigQueryDataSourceReader.class);
    return reader;
}
 
Example #8
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should create 4 read tasks",
      4, planTasks(unfiltered).size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
  }
}
 
Example #9
Source File: HiveWarehouseConnector.java    From spark-llap with Apache License 2.0
@Override public DataSourceReader createReader(DataSourceOptions options) {
  try {
    return getDataSourceReader(getOptions(options));
  } catch (IOException e) {
    LOG.error("Error creating {}", getClass().getName());
    LOG.error(ExceptionUtils.getStackTrace(e));
    throw new RuntimeException(e);
  }
}
 
Example #10
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testPartitionedByIdStartsWith() {
  File location = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new StringStartsWith("data", "junc"));

  Assert.assertEquals(1, reader.planInputPartitions().size());
}
 
Example #11
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testPartitionedByDataStartsWithFilter() {
  File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new StringStartsWith("data", "junc"));

  Assert.assertEquals(1, reader.planInputPartitions().size());
}
 
Example #12
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testInFilter() {
  File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new In("data", new String[]{"foo", "junction", "brush", null}));

  Assert.assertEquals(2, reader.planInputPartitions().size());
}
 
Example #13
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should create 4 read tasks",
      4, unfiltered.planInputPartitions().size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
  }
}
 
Example #14
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i),
          read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}
 
Example #15
Source File: PartitioningRowDataSource.java    From spark-data-sources with MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options data source options supplied by Spark (host, port, table, partitions)
 * @return a DataSourceReader backed by ExampleDB
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
    return new Reader(host, port, table, partitions);
}
 
Example #16
Source File: ParallelRowDataSource.java    From spark-data-sources with MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options data source options supplied by Spark (host, port, table, partitions)
 * @return a DataSourceReader backed by ExampleDB
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
    return new Reader(host, port, table, partitions);
}
 
Example #17
Source File: FlexibleRowDataSource.java    From spark-data-sources with MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options data source options supplied by Spark (host, port, table)
 * @return a DataSourceReader backed by ExampleDB
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    return new Reader(host, port, table);
}
 
Example #18
Source File: ParallelRowReadWriteDataSource.java    From spark-data-sources with MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options data source options supplied by Spark (host, port, table, partitions)
 * @return a DataSourceReader backed by ExampleDB
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
    return new Reader(host, port, table, partitions);
}
 
Example #19
Source File: SimpleRowDataSource.java    From spark-data-sources with MIT License
/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening from the supplied options.
 * @param options data source options supplied by Spark (host and port)
 * @return a DataSourceReader backed by ExampleDB
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    return new Reader(host, port);
}
 
Example #20
Source File: DefaultSource.java    From flight-spark-source with Apache License 2.0
public DataSourceReader createReader(DataSourceOptions dataSourceOptions) {
  Location defaultLocation = Location.forGrpcInsecure(
    dataSourceOptions.get("host").orElse("localhost"),
    dataSourceOptions.getInt("port", 47470)
  );
  String sql = dataSourceOptions.get("path").orElse("");
  FlightDataSourceReader.FactoryOptions options = new FlightDataSourceReader.FactoryOptions(
    defaultLocation,
    sql,
    dataSourceOptions.get("username").orElse("anonymous"),
    dataSourceOptions.get("password").orElse(null),
    dataSourceOptions.getBoolean("parallel", false), null);
  Broadcast<FlightDataSourceReader.FactoryOptions> bOptions = lazySparkContext().broadcast(options);
  return new FlightDataSourceReader(bOptions);
}
 
Example #21
Source File: SimpleMockConnector.java    From spark-llap with Apache License 2.0
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    return new SimpleMockDataSourceReader();
}
 
Example #22
Source File: HiveWarehouseConnector.java    From spark-llap with Apache License 2.0
protected DataSourceReader getDataSourceReader(Map<String, String> params) throws IOException {
  return new HiveWarehouseDataSourceReader(params);
}
 
Example #23
Source File: MockHiveWarehouseConnector.java    From spark-llap with Apache License 2.0
@Override
protected DataSourceReader getDataSourceReader(Map<String, String> params) throws IOException {
  return new MockHiveWarehouseDataSourceReader(params);
}
 
Example #24
Source File: DatasetSourceBatch.java    From beam with Apache License 2.0
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return new DatasetReader<>(options);
}
 
Example #25
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
private List<DataReaderFactory<UnsafeRow>> planTasks(DataSourceReader reader) {
  Assert.assertTrue(reader instanceof SupportsScanUnsafeRow);
  SupportsScanUnsafeRow unsafeReader = (SupportsScanUnsafeRow) reader;
  return unsafeReader.createUnsafeRowReaderFactories();
}
 
Example #26
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
private void pushFilters(DataSourceReader reader,
                         Expression... expressions) {
  Assert.assertTrue(reader instanceof SupportsPushDownCatalystFilters);
  SupportsPushDownCatalystFilters filterable = (SupportsPushDownCatalystFilters) reader;
  filterable.pushCatalystFilters(expressions);
}
 
Example #27
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  Table table = findTable(options);
  return new Reader(table, lazyConf());
}
 
Example #28
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
private void pushFilters(DataSourceReader reader, Filter... filters) {
  Assert.assertTrue(reader instanceof SupportsPushDownFilters);
  SupportsPushDownFilters filterable = (SupportsPushDownFilters) reader;
  filterable.pushFilters(filters);
}
 
Example #29
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return createReader(null, options);
}
 
Example #30
Source File: BigQueryDataSourceV2.java    From spark-bigquery-connector with Apache License 2.0
@Override
public DataSourceReader createReader(DataSourceOptions options) {
    return createReader(null, options);
}