org.apache.spark.sql.sources.v2.reader.DataReaderFactory Java Examples

The following examples show how to use org.apache.spark.sql.sources.v2.reader.DataReaderFactory, drawn from several open-source projects. Each example names its source file, project, and license.
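Before the project examples, here is a minimal sketch of the contract they all implement, based on the Spark 2.3 DataSourceV2 API. The class name RangeReaderFactory and its row range are illustrative placeholders, not taken from any project below. A DataReaderFactory<T> is built on the driver, serialized to an executor, and asked there for a DataReader<T> that produces the rows of one partition:

import java.io.IOException;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.sources.v2.reader.DataReader;
import org.apache.spark.sql.sources.v2.reader.DataReaderFactory;

// One factory per partition. DataReaderFactory extends Serializable because
// Spark ships each factory to the executor that runs its read task.
public class RangeReaderFactory implements DataReaderFactory<Row> {
    private final int start;
    private final int end;

    public RangeReaderFactory(int start, int end) {
        this.start = start;
        this.end = end;
    }

    @Override
    public DataReader<Row> createDataReader() {
        return new DataReader<Row>() {
            private int current = start - 1;

            @Override
            public boolean next() {
                // advance the cursor; returning false ends the partition
                current += 1;
                return current < end;
            }

            @Override
            public Row get() {
                // return the row at the current cursor position
                return RowFactory.create(current);
            }

            @Override
            public void close() throws IOException {
                // release any connections or buffers held by this reader
            }
        };
    }
}

Each DataSourceReader in the examples below returns a list of such factories, and Spark schedules one read task per factory.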
Example #1
Source File: ParallelRowReadWriteDataSource.java    From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    List<Split> splits = null;
    DBClientWrapper db = new DBClientWrapper(_host, _port);
    db.connect();
    try {
        if (_partitions == 0)
            splits = db.getSplits(_table);
        else
            splits = db.getSplits(_table, _partitions);
    } catch (UnknownTableException ute) {
        throw new RuntimeException(ute);
    } finally {
        db.disconnect();
    }
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : splits) {
        DataReaderFactory<Row> factory =
                new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    log.info("created " + factories.size() + " factories");
    return factories;
}
 
Example #2
Source File: ParallelRowDataSource.java    From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    List<Split> splits = null;
    DBClientWrapper db = new DBClientWrapper(_host, _port);
    db.connect();
    try {
        if (_partitions == 0)
            splits = db.getSplits(_table);
        else
            splits = db.getSplits(_table, _partitions);
    } catch (UnknownTableException ute) {
        throw new RuntimeException(ute);
    } finally {
        db.disconnect();
    }
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : splits) {
        DataReaderFactory<Row> factory =
                new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    log.info("created " + factories.size() + " factories");
    return factories;
}
 
Example #3
Source File: HiveWarehouseDataSourceReader.java    From spark-llap with Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
  List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>();
  try {
    JobConf jobConf = JobUtil.createJobConf(options, query);
    LlapBaseInputFormat llapInputFormat = new LlapBaseInputFormat(false, Long.MAX_VALUE);
    // numSplits arg not currently supported; use 1 as a dummy value
    InputSplit[] splits = llapInputFormat.getSplits(jobConf, 1);
    for (InputSplit split : splits) {
      tasks.add(getDataReaderFactory(split, jobConf, getArrowAllocatorMax()));
    }
  } catch (IOException e) {
    LOG.error("Unable to submit query to HS2");
    throw new RuntimeException(e);
  }
  return tasks;
}
 
Example #4
Source File: HiveWarehouseDataSourceReader.java    From spark-llap with Apache License 2.0
@Override public List<DataReaderFactory<ColumnarBatch>> createBatchDataReaderFactories() {
  try {
    boolean countStar = this.schema.length() == 0;
    String queryString = getQueryString(SchemaUtil.columnNames(schema), pushedFilters);
    List<DataReaderFactory<ColumnarBatch>> factories = new ArrayList<>();
    if (countStar) {
      LOG.info("Executing count with query: {}", queryString);
      factories.addAll(getCountStarFactories(queryString));
    } else {
      factories.addAll(getSplitsFactories(queryString));
    }
    return factories;
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
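Note the dispatch: an empty read schema (schema.length() == 0) is how this reader recognizes a count(*) query, which it answers with the synthetic count factories of getCountStarFactories (Example #8) instead of planning LLAP splits.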
 
Example #5
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example #6
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

  List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
 
Example #7
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, planTasks(unfiltered).size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
  }
}
 
Example #8
Source File: HiveWarehouseDataSourceReader.java    From spark-llap with Apache License 2.0
private List<DataReaderFactory<ColumnarBatch>> getCountStarFactories(String query) {
  List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>(100);
  long count = getCount(query);
  String numTasksString = HWConf.COUNT_TASKS.getFromOptionsMap(options);
  int numTasks = Integer.parseInt(numTasksString);
  long numPerTask = count / (numTasks - 1);
  long numLastTask = count % (numTasks - 1);
  for (int i = 0; i < (numTasks - 1); i++) {
    tasks.add(new CountDataReaderFactory(numPerTask));
  }
  tasks.add(new CountDataReaderFactory(numLastTask));
  return tasks;
}
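The split arithmetic gives the first numTasks - 1 factories count / (numTasks - 1) rows each and the last factory the remainder: with count = 10 and numTasks = 3, the tasks emit 5, 5, and 0 rows. It also assumes numTasks is at least 2, since numTasks == 1 would divide by zero.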
 
Example #9
Source File: Reader.java    From iceberg with Apache License 2.0
@Override
public List<DataReaderFactory<UnsafeRow>> createUnsafeRowReaderFactories() {
  String tableSchemaString = SchemaParser.toJson(table.schema());
  String expectedSchemaString = SchemaParser.toJson(lazySchema());

  List<DataReaderFactory<UnsafeRow>> readTasks = Lists.newArrayList();
  for (CombinedScanTask task : tasks()) {
    readTasks.add(new ReadTask(task, tableSchemaString, expectedSchemaString, conf));
  }

  return readTasks;
}
 
Example #10
Source File: PartitioningRowDataSource.java    From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("reader factories requested for table [" + _table + "]");
    initialize();
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : _splits) {
        DataReaderFactory<Row> factory =
                new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    return factories;
}
 
Example #11
Source File: SimpleRowDataSource.java    From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("creating a single factory");
    return java.util.Arrays.asList(new SimpleDataReaderFactory(_host, _port));
}
 
Example #12
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
private List<DataReaderFactory<UnsafeRow>> planTasks(DataSourceReader reader) {
  Assert.assertTrue(reader instanceof SupportsScanUnsafeRow);
  SupportsScanUnsafeRow unsafeReader = (SupportsScanUnsafeRow) reader;
  return unsafeReader.createUnsafeRowReaderFactories();
}
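The cast succeeds because Iceberg's reader mixes in SupportsScanUnsafeRow, the DataSourceReader mix-in that replaces createDataReaderFactories() with createUnsafeRowReaderFactories() so the source can hand Spark UnsafeRow instances without a per-row copy.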
 
Example #13
Source File: HiveWarehouseDataSourceReader.java    From spark-llap with Apache License 2.0
protected DataReaderFactory<ColumnarBatch> getDataReaderFactory(InputSplit split, JobConf jobConf, long arrowAllocatorMax) {
  return new HiveWarehouseDataReaderFactory(split, jobConf, arrowAllocatorMax);
}
 
Example #14
Source File: FlexibleRowDataSource.java    From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("creating a single factory");
    return java.util.Collections.singletonList(
            new SimpleDataReaderFactory(_host, _port, _table, readSchema()));
}
 
Example #15
Source File: SimpleMockConnector.java    From spark-llap with Apache License 2.0
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    return Arrays.asList(new SimpleMockDataReaderFactory());
}
 
Example #16
Source File: MockHiveWarehouseConnector.java    From spark-llap with Apache License 2.0
@Override
protected DataReaderFactory<ColumnarBatch> getDataReaderFactory(InputSplit split, JobConf jobConf, long arrowAllocatorMax) {
  return new MockHiveWarehouseDataReaderFactory(split, jobConf, arrowAllocatorMax);
}
 
Example #17
Source File: MockHiveWarehouseConnector.java    From spark-llap with Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
  return Lists.newArrayList(new MockHiveWarehouseDataReaderFactory(null, null, 0));
}
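Finally, for context on how these factories get exercised: a DataSourceV2 implementation is addressed by class name through the standard reader API. A minimal sketch, assuming a hypothetical source class com.example.MyDataSource that implements ReadSupport and understands a table option:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("DataReaderFactoryExample")
                .master("local[*]")
                .getOrCreate();

        // Spark instantiates the source by class name, calls createReader(),
        // then createDataReaderFactories() to plan one read task per factory.
        Dataset<Row> df = spark.read()
                .format("com.example.MyDataSource") // hypothetical source class
                .option("table", "myTable")         // hypothetical option
                .load();

        df.show();
        spark.stop();
    }
}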