Java Code Examples for org.apache.spark.sql.DataFrameReader#load()

The following examples show how to use org.apache.spark.sql.DataFrameReader#load(). They are drawn from open-source projects; the project, source file, and license are noted above each example.
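As a quick orientation, here is a minimal sketch (with hypothetical paths and connection settings) of the two ways load() is typically called: with a path for file-based sources, or with no argument when the reader options themselves identify the data to read.

SparkSession spark = SparkSession.builder().appName("load-example").getOrCreate();

// Path-based source: configure a format, then pass the location to load(path)
Dataset<Row> parquetDf = spark.read().format("parquet").load("/tmp/example.parquet");

// Non-path source (e.g. JDBC): the options identify the data, so load() takes no argument
Dataset<Row> jdbcDf = spark.read().format("jdbc")
    .option("url", "jdbc:postgresql://localhost/testdb")   // hypothetical connection URL
    .option("dbtable", "public.users")                     // hypothetical table
    .load();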
Example 1
Source File: HiveWarehouseSessionImpl.java    From spark-llap with Apache License 2.0
public Dataset<Row> table(String sql) {
  DataFrameReader dfr = session().read().format(HIVE_WAREHOUSE_CONNECTOR_INTERNAL).option("table", sql);
  return dfr.load();
}
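
In spark-llap, the session wrapped by this class is typically obtained through the HiveWarehouseSession builder. A minimal usage sketch, assuming that documented builder and a hypothetical table name; table() sets the "table" option on the reader and then calls load() with no path, since the connector resolves the table itself:

HiveWarehouseSession hive = HiveWarehouseSession.session(spark).build();
Dataset<Row> callCenter = hive.table("call_center");  // hypothetical Hive table
callCenter.show();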
 
Example 2
Source File: HoodieIncrSource.java    From hudi with Apache License 2.0
@Override
public Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {

  DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.HOODIE_SRC_BASE_PATH));

  /*
   * DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH,
   * Config.HOODIE_SRC_PARTITION_FIELDS)); List<String> partitionFields =
   * props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",", new ArrayList<>()); PartitionValueExtractor
   * extractor = DataSourceUtils.createPartitionExtractor(props.getString( Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS,
   * Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS));
   */
  String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH);
  int numInstantsPerFetch = props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH);
  boolean readLatestOnMissingCkpt = props.getBoolean(Config.READ_LATEST_INSTANT_ON_MISSING_CKPT,
      Config.DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT);

  // Use begin Instant if set and non-empty
  Option<String> beginInstant =
      lastCkptStr.isPresent() ? lastCkptStr.get().isEmpty() ? Option.empty() : lastCkptStr : Option.empty();

  Pair<String, String> instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath,
      numInstantsPerFetch, beginInstant, readLatestOnMissingCkpt);

  if (instantEndpts.getKey().equals(instantEndpts.getValue())) {
    LOG.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey());
    return Pair.of(Option.empty(), instantEndpts.getKey());
  }

  // Do Incr pull. Set end instant if available
  DataFrameReader reader = sparkSession.read().format("org.apache.hudi")
      .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
      .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), instantEndpts.getLeft())
      .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), instantEndpts.getRight());

  Dataset<Row> source = reader.load(srcPath);

  /*
   * log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema());
   * 
   * StructType newSchema = new StructType(source.schema().fields()); for (String field : partitionFields) { newSchema
   * = newSchema.add(field, DataTypes.StringType, true); }
   * 
   * /** Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if
   * configured
   *
   * Dataset<Row> validated = source.map((MapFunction<Row, Row>) (Row row) -> { // _hoodie_instant_time String
   * instantTime = row.getString(0); IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(),
   * instantEndpts.getValue()); if (!partitionFields.isEmpty()) { // _hoodie_partition_path String hoodiePartitionPath
   * = row.getString(3); List<Object> partitionVals =
   * extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream() .map(o -> (Object)
   * o).collect(Collectors.toList()); ValidationUtils.checkArgument(partitionVals.size() == partitionFields.size(),
   * "#partition-fields != #partition-values-extracted"); List<Object> rowObjs = new
   * ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq())); rowObjs.addAll(partitionVals); return
   * RowFactory.create(rowObjs.toArray()); } return row; }, RowEncoder.apply(newSchema));
   * 
   * log.info("Validated Source Schema :" + validated.schema());
   */

  // Remove Hoodie meta columns except partition path from input source
  final Dataset<Row> src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream()
      .filter(x -> !x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new));
  // log.info("Final Schema from Source is :" + src.schema());
  return Pair.of(Option.of(src), instantEndpts.getRight());
}
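
Outside of the checkpointing plumbing above, the same reader configuration can be used directly: an incremental Hudi query is just a DataFrameReader with the incremental query type and a begin/end instant window, finished with load(basePath). A minimal sketch with hypothetical instants and base path, mirroring the options used in fetchNextBatch:

Dataset<Row> incremental = spark.read().format("org.apache.hudi")
    .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
    .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), "20200101000000")  // hypothetical begin instant
    .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), "20200102000000")    // hypothetical end instant
    .load("/data/hudi/trips");  // hypothetical Hudi base path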
 
Example 3
Source File: HiveWarehouseSessionImpl.java    From spark-llap with Apache License 2.0
public Dataset<Row> executeQuery(String sql) {
  DataFrameReader dfr = session().read().format(HIVE_WAREHOUSE_CONNECTOR_INTERNAL).option("query", sql);
  return dfr.load();
}
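
Example 3 mirrors Example 1 but passes the SQL text through the "query" option instead of "table", so the connector runs an arbitrary query rather than scanning a single table; load() is again called with no path. A short usage sketch with a hypothetical query, assuming the same HiveWarehouseSession as above:

Dataset<Row> result = hive.executeQuery("SELECT cs_item_sk, SUM(cs_quantity) FROM catalog_sales GROUP BY cs_item_sk");
result.show();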