Java Code Examples for org.apache.spark.sql.DataFrameReader

The following examples show how to use org.apache.spark.sql.DataFrameReader. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: hudi   Source File: CsvDFSSource.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Loads the given CSV files through Spark's {@link DataFrameReader} into a {@link Dataset} of {@link Row}.
 *
 * <p>For every key in {@code CSV_CONFIG_KEYS}, the value configured in {@code props} under
 * {@code CSV_SRC_CONFIG_PREFIX + key} (if any) is forwarded to the reader as an option. When
 * {@code sourceSchema} is set it is handed to the reader and "inferSchema" is set to "false";
 * otherwise "inferSchema" is set to "true".
 *
 * @param pathStr  The list of file paths, separated by ','.
 * @return  An {@link Option} holding the loaded {@link Dataset} of {@link Row}, or an empty
 *          {@link Option} when {@code pathStr} is not present.
 */
private Option<Dataset<Row>> fromFiles(Option<String> pathStr) {
  // Nothing to read: bail out before touching the Spark session.
  if (!pathStr.isPresent()) {
    return Option.empty();
  }

  final DataFrameReader csvReader = sparkSession.read().format("csv");

  // Forward only those CSV options that were actually configured.
  for (String csvOption : CSV_CONFIG_KEYS) {
    final String configured = props.getString(CSV_SRC_CONFIG_PREFIX + csvOption, null);
    if (configured == null) {
      continue;
    }
    csvReader.option(csvOption, configured);
  }

  // An explicit schema wins; schema inference is enabled only when none was supplied.
  final boolean schemaProvided = sourceSchema != null;
  if (schemaProvided) {
    csvReader.schema(sourceSchema);
  }
  csvReader.option("inferSchema", Boolean.toString(!schemaProvided));

  final String[] paths = pathStr.get().split(",");
  return Option.of(csvReader.load(paths));
}
 
Example 2
Source Project: hudi   Source File: HoodieIncrSource.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Fetches the next incremental batch from the Hudi table located at {@code Config.HOODIE_SRC_BASE_PATH}.
 *
 * <p>The begin/end instants are computed by {@code IncrSourceHelper.calculateBeginAndEndInstants}. If both
 * are equal there is nothing new to read, and an empty batch is returned together with the begin instant.
 * Otherwise an incremental query between the two instants is loaded, all Hoodie meta columns except the
 * partition path are dropped, and the result is returned together with the end instant.
 *
 * @param lastCkptStr  Last checkpoint; an absent or empty value is treated as "no begin instant".
 * @param sourceLimit  Not used by this method.
 * @return  Pair of the (optional) batch and the checkpoint string to continue from.
 */
@Override
public Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {

  DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.HOODIE_SRC_BASE_PATH));

  /*
   * Disabled partition-field handling, kept for reference:
   *
   * DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH,
   * Config.HOODIE_SRC_PARTITION_FIELDS)); List<String> partitionFields =
   * props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",", new ArrayList<>()); PartitionValueExtractor
   * extractor = DataSourceUtils.createPartitionExtractor(props.getString( Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS,
   * Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS));
   */
  final String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH);
  final int numInstantsPerFetch =
      props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH);
  final boolean readLatestOnMissingCkpt = props.getBoolean(
      Config.READ_LATEST_INSTANT_ON_MISSING_CKPT, Config.DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT);

  // Only a present, non-empty checkpoint counts as a begin instant.
  final boolean hasCheckpoint = lastCkptStr.isPresent() && !lastCkptStr.get().isEmpty();
  final Option<String> beginInstant = hasCheckpoint ? lastCkptStr : Option.empty();

  final Pair<String, String> instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(
      sparkContext, srcPath, numInstantsPerFetch, beginInstant, readLatestOnMissingCkpt);
  final String beginInstantTime = instantEndpts.getLeft();
  final String endInstantTime = instantEndpts.getRight();

  // Identical endpoints mean there are no new instants to pull.
  if (beginInstantTime.equals(endInstantTime)) {
    LOG.warn("Already caught up. Begin Checkpoint was :" + beginInstantTime);
    return Pair.of(Option.empty(), beginInstantTime);
  }

  // Incremental pull bounded by the computed begin and end instants.
  final DataFrameReader incrementalReader = sparkSession.read()
      .format("org.apache.hudi")
      .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
      .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), beginInstantTime)
      .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), endInstantTime);
  final Dataset<Row> source = incrementalReader.load(srcPath);

  /*
   * Disabled validation / partition-field generation, kept for reference:
   *
   * log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema());
   * 
   * StructType newSchema = new StructType(source.schema().fields()); for (String field : partitionFields) { newSchema
   * = newSchema.add(field, DataTypes.StringType, true); }
   * 
   * /** Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if
   * configured
   *
   * Dataset<Row> validated = source.map((MapFunction<Row, Row>) (Row row) -> { // _hoodie_instant_time String
   * instantTime = row.getString(0); IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(),
   * instantEndpts.getValue()); if (!partitionFields.isEmpty()) { // _hoodie_partition_path String hoodiePartitionPath
   * = row.getString(3); List<Object> partitionVals =
   * extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream() .map(o -> (Object)
   * o).collect(Collectors.toList()); ValidationUtils.checkArgument(partitionVals.size() == partitionFields.size(),
   * "#partition-fields != #partition-values-extracted"); List<Object> rowObjs = new
   * ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq())); rowObjs.addAll(partitionVals); return
   * RowFactory.create(rowObjs.toArray()); } return row; }, RowEncoder.apply(newSchema));
   * 
   * log.info("Validated Source Schema :" + validated.schema());
   */

  // Strip every Hoodie meta column from the batch, keeping only the partition path column.
  final String[] metaColumnsToDrop = HoodieRecord.HOODIE_META_COLUMNS.stream()
      .filter(column -> !column.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD))
      .toArray(String[]::new);
  final Dataset<Row> src = source.drop(metaColumnsToDrop);
  // log.info("Final Schema from Source is :" + src.schema());
  return Pair.of(Option.of(src), endInstantTime);
}
 
Example 3
Source Project: spark-llap   Source File: HiveWarehouseSessionImpl.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Loads a {@link Dataset} through the {@code HIVE_WAREHOUSE_CONNECTOR_INTERNAL} data source format,
 * passing the given statement as the reader's "query" option.
 *
 * @param sql  Statement text handed to the connector via the "query" option.
 * @return  The {@link Dataset} of {@link Row} produced by the reader's {@code load()} call.
 */
public Dataset<Row> executeQuery(String sql) {
  return session()
      .read()
      .format(HIVE_WAREHOUSE_CONNECTOR_INTERNAL)
      .option("query", sql)
      .load();
}
 
Example 4
Source Project: spark-llap   Source File: HiveWarehouseSessionImpl.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Loads a {@link Dataset} through the {@code HIVE_WAREHOUSE_CONNECTOR_INTERNAL} data source format,
 * passing the given value as the reader's "table" option.
 *
 * @param sql  Value handed to the connector via the "table" option. NOTE(review): despite the
 *             parameter name this is presumably a table name rather than a SQL statement; confirm
 *             against callers.
 * @return  The {@link Dataset} of {@link Row} produced by the reader's {@code load()} call.
 */
public Dataset<Row> table(String sql) {
  return session()
      .read()
      .format(HIVE_WAREHOUSE_CONNECTOR_INTERNAL)
      .option("table", sql)
      .load();
}
 
Example 5
Source Project: SparkDemo   Source File: DataFrameCreate.java    License: MIT License 3 votes vote down vote up
/**
 * Demo entry point: reads a JSON file into a {@link Dataset} of {@link Row} via a
 * {@link DataFrameReader} and prints it with {@code show()}.
 *
 * @param args  Command-line arguments; not used.
 */
public static void main(String[] args) {
	// try-with-resources guarantees the Spark context is closed even if reading
	// or showing the data throws; previously close() was skipped on failure.
	try (JavaSparkContext sc = SparkUtils.getLocalSparkContext(DataFrameCreate.class)) {
		// Create a DataFrame by reading JSON
		SQLContext sqlContext = new SQLContext(sc);

		DataFrameReader dataFrameReader = sqlContext.read();
		Dataset<Row> dataset = dataFrameReader.json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

		dataset.show();
	}
}