Java Code Examples for org.apache.spark.api.java.JavaRDD#context()

The following examples show how to use org.apache.spark.api.java.JavaRDD#context(). The source file and originating project are noted above each example.
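For reference, JavaRDD#context() returns the underlying SparkContext that created the RDD; the common pattern in the examples below is to wrap that context in a new JavaSparkContext so further RDDs can be created alongside the input. A minimal, self-contained sketch (application name and data are illustrative):

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ContextExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("context-example").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3));

        // context() exposes the SparkContext that owns this RDD;
        // re-wrapping it gives a JavaSparkContext without passing one around explicitly.
        JavaSparkContext sameContext = new JavaSparkContext(numbers.context());
        JavaRDD<String> labels = sameContext.parallelize(Arrays.asList("a", "b", "c"));

        System.out.println(labels.count()); // 3
        jsc.stop();
    }
}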
Example 1
Source File: TransformTranslator.java    From beam with Apache License 2.0
private static <InputT, AccumT, OutputT>
    TransformEvaluator<Combine.Globally<InputT, OutputT>> combineGlobally() {
  return new TransformEvaluator<Combine.Globally<InputT, OutputT>>() {

    @Override
    public void evaluate(Combine.Globally<InputT, OutputT> transform, EvaluationContext context) {
      final PCollection<InputT> input = context.getInput(transform);
      final Coder<InputT> iCoder = context.getInput(transform).getCoder();
      final Coder<OutputT> oCoder = context.getOutput(transform).getCoder();
      final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT> combineFn =
          (CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT>)
              CombineFnUtil.toFnWithContext(transform.getFn());
      final WindowedValue.FullWindowedValueCoder<OutputT> wvoCoder =
          WindowedValue.FullWindowedValueCoder.of(
              oCoder, windowingStrategy.getWindowFn().windowCoder());
      final boolean hasDefault = transform.isInsertDefault();

      final SparkCombineFn<InputT, InputT, AccumT, OutputT> sparkCombineFn =
          SparkCombineFn.globally(
              combineFn,
              context.getSerializableOptions(),
              TranslationUtils.getSideInputs(transform.getSideInputs(), context),
              windowingStrategy);
      final Coder<AccumT> aCoder;
      try {
        aCoder = combineFn.getAccumulatorCoder(context.getPipeline().getCoderRegistry(), iCoder);
      } catch (CannotProvideCoderException e) {
        throw new IllegalStateException("Could not determine coder for accumulator", e);
      }

      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<InputT>> inRdd =
          ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();

      JavaRDD<WindowedValue<OutputT>> outRdd;

      SparkCombineFn.WindowedAccumulator<InputT, InputT, AccumT, ?> accumulated =
          GroupCombineFunctions.combineGlobally(inRdd, sparkCombineFn, aCoder, windowingStrategy);

      if (!accumulated.isEmpty()) {
        Iterable<WindowedValue<OutputT>> output = sparkCombineFn.extractOutput(accumulated);
        outRdd =
            context
                .getSparkContext()
                .parallelize(CoderHelpers.toByteArrays(output, wvoCoder))
                .map(CoderHelpers.fromByteFunction(wvoCoder));
      } else {
        // handle empty input RDD, which will naturally skip the entire execution
        // as Spark will not run on empty RDDs.
        JavaSparkContext jsc = new JavaSparkContext(inRdd.context());
        if (hasDefault) {
          OutputT defaultValue = combineFn.defaultValue();
          outRdd =
              jsc.parallelize(Lists.newArrayList(CoderHelpers.toByteArray(defaultValue, oCoder)))
                  .map(CoderHelpers.fromByteFunction(oCoder))
                  .map(WindowedValue::valueInGlobalWindow);
        } else {
          outRdd = jsc.emptyRDD();
        }
      }

      context.putDataset(transform, new BoundedDataset<>(outRdd));
    }

    @Override
    public String toNativeString() {
      return "aggregate(..., new <fn>(), ...)";
    }
  };
}
 
Example 2
Source File: CountCumSum.java    From deeplearning4j with Apache License 2.0
public CountCumSum(JavaRDD<AtomicLong> sentenceCountRDD) {
    this.sentenceCountRDD = sentenceCountRDD;
    this.sc = new JavaSparkContext(sentenceCountRDD.context());
}
 
Example 3
Source File: SparkWord2Vec.java    From deeplearning4j with Apache License 2.0
public void fitSentences(JavaRDD<String> sentences) {
    /**
     * Basically all we want here is tokenization, to get JavaRDD<Sequence<VocabWord>> out of Strings, and then we just go for SeqVec
     */

    validateConfiguration();

    final JavaSparkContext context = new JavaSparkContext(sentences.context());

    broadcastEnvironment(context);

    JavaRDD<Sequence<VocabWord>> seqRdd = sentences.map(new TokenizerFunction(configurationBroadcast));

    // now since we have new rdd - just pass it to SeqVec
    super.fitSequences(seqRdd);
}
 
Example 4
Source File: DataFrames.java    From DataVec with Apache License 2.0
/**
 * Creates a DataFrame from an RDD of writables, given a schema.
 *
 * @param schema the schema to use
 * @param data   the data to convert
 * @return the dataframe object
 */
public static DataRowsFacade toDataFrame(Schema schema, JavaRDD<List<Writable>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());
    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.map(new ToRow(schema));
    return dataRows(sqlContext.createDataFrame(rows, fromSchema(schema)));
}
 
Example 5
Source File: DataFrames.java    From DataVec with Apache License 2.0
/**
 * Convert the given sequence data set to a DataFrame.<br>
 * <b>Note</b>: The resulting DataFrame has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(DataRowsFacade)}
 *
 * @param schema Schema for the data
 * @param data   Sequence data to convert to a DataFrame
 * @return The dataframe object
 */
public static DataRowsFacade toDataFrameSequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());

    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.flatMap(new SequenceToRows(schema));
    return dataRows(sqlContext.createDataFrame(rows, fromSchemaSequence(schema)));
}
 
Example 6
Source File: DataFrames.java    From deeplearning4j with Apache License 2.0
/**
 * Creates a DataFrame from an RDD of writables, given a schema.
 *
 * @param schema the schema to use
 * @param data   the data to convert
 * @return the dataframe object
 */
public static Dataset<Row> toDataFrame(Schema schema, JavaRDD<List<Writable>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());
    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.map(new ToRow(schema));
    return sqlContext.createDataFrame(rows, fromSchema(schema));
}
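
A hypothetical usage sketch for the toDataFrame variant above, assuming the DataVec Schema/Writable classes and the DataFrames helper shown in the example (package locations, column names, and the local-mode setup are assumptions and may vary between versions):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.datavec.api.transform.schema.Schema;     // assumed package
import org.datavec.api.writable.DoubleWritable;     // assumed package
import org.datavec.api.writable.Writable;           // assumed package
import org.datavec.spark.transform.DataFrames;      // assumed package

public class ToDataFrameUsage {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("toDataFrame-usage").setMaster("local[*]"));

        // A two-column schema and a tiny RDD of matching records (column names are illustrative).
        Schema schema = new Schema.Builder()
                .addColumnDouble("x")
                .addColumnDouble("y")
                .build();
        JavaRDD<List<Writable>> data = sc.parallelize(Arrays.asList(
                Arrays.<Writable>asList(new DoubleWritable(1.0), new DoubleWritable(2.0)),
                Arrays.<Writable>asList(new DoubleWritable(3.0), new DoubleWritable(4.0))));

        // toDataFrame recovers the SparkContext from the RDD via context(),
        // so only the schema and the data need to be passed in.
        Dataset<Row> df = DataFrames.toDataFrame(schema, data);
        df.show();

        sc.stop();
    }
}

Because the helper calls data.context() internally, callers never hand it a context explicitly; this is the same design used by each of the DataFrames methods in Examples 4 through 7.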
 
Example 7
Source File: DataFrames.java    From deeplearning4j with Apache License 2.0
/**
 * Convert the given sequence data set to a DataFrame.<br>
 * <b>Note</b>: The resulting DataFrame has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(Dataset<Row>)}
 *
 * @param schema Schema for the data
 * @param data   Sequence data to convert to a DataFrame
 * @return The dataframe object
 */
public static Dataset<Row> toDataFrameSequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());

    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.flatMap(new SequenceToRows(schema));
    return sqlContext.createDataFrame(rows, fromSchemaSequence(schema));
}