Java Code Examples for org.apache.spark.sql.sources.v2.reader.DataReaderFactory

The following examples show how to use org.apache.spark.sql.sources.v2.reader.DataReaderFactory, part of the DataSourceV2 read API introduced in Spark 2.3 (the interface was renamed InputPartition in Spark 2.4). The examples are extracted from open source projects; the project, source file, and license are noted above each example.
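All of the examples follow the same pattern: a DataSourceReader builds one DataReaderFactory per partition of the data, Spark serializes each factory to an executor, and createDataReader() produces the DataReader that actually emits records. As a point of reference before the examples, here is a minimal, hypothetical factory; RangeDataReaderFactory is invented for illustration and does not appear in any of the projects below.

import java.io.IOException;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.sources.v2.reader.DataReader;
import org.apache.spark.sql.sources.v2.reader.DataReaderFactory;

public class RangeDataReaderFactory implements DataReaderFactory<Row> {
    private final int start;
    private final int end;

    public RangeDataReaderFactory(int start, int end) {
        this.start = start;
        this.end = end;
    }

    @Override
    public DataReader<Row> createDataReader() {
        // Runs on the executor, not the driver.
        return new DataReader<Row>() {
            private int current = start - 1;

            @Override
            public boolean next() throws IOException {
                current += 1;
                return current < end;
            }

            @Override
            public Row get() {
                return RowFactory.create(current);
            }

            @Override
            public void close() throws IOException {
                // nothing to release
            }
        };
    }
}

A DataSourceReader would then return one such factory per partition from createDataReaderFactories(), e.g. Arrays.asList(new RangeDataReaderFactory(0, 50), new RangeDataReaderFactory(50, 100)) for a two-partition scan.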
Example 1
Source Project: spark-data-sources   Source File: ParallelRowDataSource.java    License: MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    List<Split> splits = null;
    DBClientWrapper db = new DBClientWrapper(_host, _port);
    db.connect();
    try {
        if (_partitions == 0)
            splits = db.getSplits(_table);
        else
            splits = db.getSplits(_table, _partitions);
    } catch (UnknownTableException ute) {
        throw new RuntimeException(ute);
    } finally {
        db.disconnect();
    }
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : splits) {
        DataReaderFactory<Row> factory =
                new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    log.info("created " + factories.size() + " factories");
    return factories;
}
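SplitDataReaderFactory, used above, is not shown on this page. The following is a hypothetical sketch of its likely shape, not the project's actual source: the factory captures only serializable state, and the reader returned by createDataReader() does the I/O on the executor. The db.getRows(table, schema, split) call is an assumed method name, and imports for the project's Split and DBClientWrapper types are omitted.

import java.io.IOException;
import java.util.Iterator;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.reader.DataReader;
import org.apache.spark.sql.sources.v2.reader.DataReaderFactory;
import org.apache.spark.sql.types.StructType;

public class SplitDataReaderFactory implements DataReaderFactory<Row> {
    private final String host;
    private final int port;
    private final String table;
    private final StructType schema;
    private final Split split;

    public SplitDataReaderFactory(String host, int port, String table,
                                  StructType schema, Split split) {
        this.host = host;
        this.port = port;
        this.table = table;
        this.schema = schema;
        this.split = split;
    }

    @Override
    public DataReader<Row> createDataReader() {
        // Executor side: open a connection scoped to this reader.
        DBClientWrapper db = new DBClientWrapper(host, port);
        db.connect();
        // Hypothetical call; the real DBClientWrapper API may differ.
        Iterator<Row> rows = db.getRows(table, schema, split);
        return new DataReader<Row>() {
            @Override
            public boolean next() throws IOException {
                return rows.hasNext();
            }

            @Override
            public Row get() {
                return rows.next();
            }

            @Override
            public void close() throws IOException {
                db.disconnect();
            }
        };
    }
}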
 
Example 2
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example 3
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

  List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
 
Example 4
Source Project: spark-llap   Source File: HiveWarehouseDataSourceReader.java    License: Apache License 2.0
@Override public List<DataReaderFactory<ColumnarBatch>> createBatchDataReaderFactories() {
  try {
    boolean countStar = this.schema.length() == 0;
    String queryString = getQueryString(SchemaUtil.columnNames(schema), pushedFilters);
    List<DataReaderFactory<ColumnarBatch>> factories = new ArrayList<>();
    if (countStar) {
      LOG.info("Executing count with query: {}", queryString);
      factories.addAll(getCountStarFactories(queryString));
    } else {
      factories.addAll(getSplitsFactories(queryString));
    }
    return factories;
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
 
Example 5
Source Project: spark-llap   Source File: HiveWarehouseDataSourceReader.java    License: Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
  List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>();
  try {
    JobConf jobConf = JobUtil.createJobConf(options, query);
    LlapBaseInputFormat llapInputFormat = new LlapBaseInputFormat(false, Long.MAX_VALUE);
    // numSplits arg not currently supported, use 1 as dummy arg
    InputSplit[] splits = llapInputFormat.getSplits(jobConf, 1);
    for (InputSplit split : splits) {
      tasks.add(getDataReaderFactory(split, jobConf, getArrowAllocatorMax()));
    }
  } catch (IOException e) {
    LOG.error("Unable to submit query to HS2");
    throw new RuntimeException(e);
  }
  return tasks;
}
 
Example 6
Source Project: spark-data-sources   Source File: PartitioningRowDataSource.java    License: MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("reader factories requested for table [" + _table + "]");
    initialize();
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : _splits) {
        DataReaderFactory<Row> factory =
                new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    return factories;
}
 
Example 7
Source Project: iceberg   Source File: Reader.java    License: Apache License 2.0
@Override
public List<DataReaderFactory<UnsafeRow>> createUnsafeRowReaderFactories() {
  String tableSchemaString = SchemaParser.toJson(table.schema());
  String expectedSchemaString = SchemaParser.toJson(lazySchema());

  List<DataReaderFactory<UnsafeRow>> readTasks = Lists.newArrayList();
  for (CombinedScanTask task : tasks()) {
    readTasks.add(new ReadTask(task, tableSchemaString, expectedSchemaString, conf));
  }

  return readTasks;
}
 
Example 8
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, planTasks(unfiltered).size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
  }
}
 
Example 9
Source Project: spark-llap   Source File: HiveWarehouseDataSourceReader.java    License: Apache License 2.0
private List<DataReaderFactory<ColumnarBatch>> getCountStarFactories(String query) {
  List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>(100);
  long count = getCount(query);
  String numTasksString = HWConf.COUNT_TASKS.getFromOptionsMap(options);
  int numTasks = Integer.parseInt(numTasksString);
  // Spread the total across the tasks: the first numTasks - 1 readers each
  // report an equal share and the last reader reports the remainder, so the
  // shares sum back to count (note this assumes numTasks >= 2).
  long numPerTask = count / (numTasks - 1);
  long numLastTask = count % (numTasks - 1);
  for (int i = 0; i < (numTasks - 1); i++) {
    tasks.add(new CountDataReaderFactory(numPerTask));
  }
  tasks.add(new CountDataReaderFactory(numLastTask));
  return tasks;
}
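CountDataReaderFactory itself is not shown on this page. Here is a hypothetical sketch of the idea; note that the real spark-llap class produces ColumnarBatch readers, so emitting Row here is a simplification for illustration.

import java.io.IOException;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.sources.v2.reader.DataReader;
import org.apache.spark.sql.sources.v2.reader.DataReaderFactory;

public class CountDataReaderFactory implements DataReaderFactory<Row> {
    private final long numRows;

    public CountDataReaderFactory(long numRows) {
        this.numRows = numRows;
    }

    @Override
    public DataReader<Row> createDataReader() {
        return new DataReader<Row>() {
            private boolean consumed = false;

            @Override
            public boolean next() throws IOException {
                // Each task contributes exactly one row holding its share;
                // summing the shares reproduces the full count(*).
                if (consumed) {
                    return false;
                }
                consumed = true;
                return true;
            }

            @Override
            public Row get() {
                return RowFactory.create(numRows);
            }

            @Override
            public void close() throws IOException {
                // nothing to release
            }
        };
    }
}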
 
Example 10
Source Project: spark-data-sources   Source File: SimpleRowDataSource.java    License: MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("creating a single factory");
    return java.util.Arrays.asList(new SimpleDataReaderFactory(_host, _port));
}
 
Example 11
Source Project: spark-data-sources   Source File: FlexibleRowDataSource.java    License: MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("creating a single factory");
    return java.util.Collections.singletonList(
            new SimpleDataReaderFactory(_host, _port, _table, readSchema()));
}
 
Example 12
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
private List<DataReaderFactory<UnsafeRow>> planTasks(DataSourceReader reader) {
  // SupportsScanUnsafeRow is the DataSourceV2 mix-in that lets a reader
  // produce UnsafeRow directly instead of Row.
  Assert.assertTrue(reader instanceof SupportsScanUnsafeRow);
  SupportsScanUnsafeRow unsafeReader = (SupportsScanUnsafeRow) reader;
  return unsafeReader.createUnsafeRowReaderFactories();
}
 
Example 13
Source Project: spark-llap   Source File: HiveWarehouseDataSourceReader.java    License: Apache License 2.0
protected DataReaderFactory<ColumnarBatch> getDataReaderFactory(InputSplit split, JobConf jobConf, long arrowAllocatorMax) {
  return new HiveWarehouseDataReaderFactory(split, jobConf, arrowAllocatorMax);
}
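This protected method is a test seam: MockHiveWarehouseConnector (Examples 15 and 16 below) overrides it and getSplitsFactories to substitute mock factories.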
 
Example 14
Source Project: spark-llap   Source File: SimpleMockConnector.java    License: Apache License 2.0
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    return Arrays.asList(new SimpleMockDataReaderFactory());
}
 
Example 15
Source Project: spark-llap   Source File: MockHiveWarehouseConnector.java    License: Apache License 2.0
@Override
protected DataReaderFactory<ColumnarBatch> getDataReaderFactory(InputSplit split, JobConf jobConf, long arrowAllocatorMax) {
  return new MockHiveWarehouseDataReaderFactory(split, jobConf, arrowAllocatorMax);
}
 
Example 16
Source Project: spark-llap   Source File: MockHiveWarehouseConnector.java    License: Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
  return Lists.newArrayList(new MockHiveWarehouseDataReaderFactory(null, null, 0));
}