Java Code Examples for org.apache.spark.SparkContext#getOrCreate()

The following examples show how to use org.apache.spark.SparkContext#getOrCreate(). They are drawn from open source projects; the source file and license for each are noted above the example.
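Before the project-specific examples, a minimal sketch of the basic pattern may help: SparkContext.getOrCreate(SparkConf) returns the currently active SparkContext if one exists, otherwise it creates a new one from the supplied configuration, and the no-argument overload returns the active context (creating one with a default configuration if needed). The application name and master below are placeholder values chosen only for illustration.

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

public class GetOrCreateSketch {
    public static void main(String[] args) {
        // Placeholder configuration; adjust the app name and master for your environment.
        SparkConf conf = new SparkConf()
                .setAppName("getOrCreate-example")
                .setMaster("local[*]");

        // Returns the active SparkContext, or creates one from this configuration.
        SparkContext sc = SparkContext.getOrCreate(conf);

        // A later call without a configuration returns the same active context.
        SparkContext same = SparkContext.getOrCreate();
        assert sc == same;

        sc.stop();
    }
}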
Example 1
Source File: SparkMaster.java    From GeoTriples with Apache License 2.0
/**
 * Convert the input Dataset into RDF triples and store the results.
 * The conversion takes place per partition using the mapPartitions Spark transformation.
 * @param mapping_list list of TripleMaps
 */
private void convert_partition(ArrayList<TriplesMap> mapping_list){
    SparkContext sc = SparkContext.getOrCreate();

    Pair<ArrayList<TriplesMap>, List<String>> transformation_info = new Pair<>(mapping_list, Arrays.asList(reader.getHeaders()));
    ClassTag<Pair<ArrayList<TriplesMap>, List<String>>> classTag_pair = scala.reflect.ClassTag$.MODULE$.apply(Pair.class);
    Broadcast<Pair<ArrayList<TriplesMap>, List<String>>> bd_info = sc.broadcast(transformation_info, classTag_pair);

    rowRDD
        .mapPartitions(
        (Iterator<Row> rows_iter) -> {
            ArrayList<TriplesMap> p_mapping_list = bd_info.value().getKey();
            List<String> p_header = bd_info.value().getValue();
            RML_Converter rml_converter = new RML_Converter(p_mapping_list, p_header);
            rml_converter.start();
            rml_converter.registerFunctions();
            Iterator<String> triples = rml_converter.convertPartition(rows_iter);

            rml_converter.stop();
            return triples;
        })
        .saveAsTextFile(outputDir);
}
 
Example 2
Source File: SparkMaster.java    From GeoTriples with Apache License 2.0
/**
 * Convert the input Dataset into RDF triples and store the results.
 * The conversion takes place per row using the map Spark transformation.
 * @param mapping_list list of TripleMaps
 */
private void convert_row(ArrayList<TriplesMap> mapping_list){

    SparkContext sc = SparkContext.getOrCreate();

    RML_Converter rml_converter = new RML_Converter(mapping_list, Arrays.asList(reader.getHeaders()));
    ClassTag<RML_Converter> classTagRML_Converter = scala.reflect.ClassTag$.MODULE$.apply(RML_Converter.class);
    Broadcast<RML_Converter> bc_converter = sc.broadcast(rml_converter, classTagRML_Converter);

    ClassTag<HashMap<URI, Function>> classTag_hashMap = scala.reflect.ClassTag$.MODULE$.apply(HashMap.class);
    Broadcast<HashMap<URI, Function>> bc_functionsHashMap = sc.broadcast(FunctionFactory.availableFunctions, classTag_hashMap);
    rowRDD
        .map((row) ->  {
            FunctionFactory.availableFunctions = bc_functionsHashMap.value();
            return bc_converter.value().convertRow(row);
        } )
        .saveAsTextFile(outputDir);
}
 
Example 3
Source File: SparkTextFileBoundedSourceVertex.java    From incubator-nemo with Apache License 2.0
@Override
protected Iterator<String> initializeIterator() {
  // for setting up the same environment in the executors.
  final SparkContext sparkContext = SparkContext.getOrCreate(sparkConf);

  // Spark does lazy evaluation: it doesn't load the full data in rdd, but only the partition it is asked for.
  final RDD<String> rdd = sparkContext.textFile(inputPath, numPartitions);
  final Iterable<String> iterable = () -> JavaConverters.asJavaIteratorConverter(
    rdd.iterator(rdd.getPartitions()[partitionIndex], TaskContext$.MODULE$.empty())).asJava();
  return iterable.iterator();
}
 
Example 4
Source File: ConverterFactory.java    From jpmml-sparkml with GNU Affero General Public License v3.0
static
public void checkVersion(){
	SparkContext sparkContext = SparkContext.getOrCreate();

	int[] version = parseVersion(sparkContext.version());

	if(!Arrays.equals(ConverterFactory.VERSION, version)){
		throw new IllegalArgumentException("Expected Apache Spark ML version " + formatVersion(ConverterFactory.VERSION) + ", got version " + formatVersion(version) + " (" + sparkContext.version() + ")");
	}
}
 
Example 5
Source File: Spark1Shims.java    From zeppelin with Apache License 2.0
public void setupSparkListener(final String master,
                               final String sparkWebUrl,
                               final InterpreterContext context) {
  SparkContext sc = SparkContext.getOrCreate();
  sc.addSparkListener(new JobProgressListener(sc.getConf()) {
    @Override
    public void onJobStart(SparkListenerJobStart jobStart) {
      if (sc.getConf().getBoolean("spark.ui.enabled", true) &&
          !Boolean.parseBoolean(properties.getProperty("zeppelin.spark.ui.hidden", "false"))) {
        buildSparkJobUrl(master, sparkWebUrl, jobStart.jobId(), jobStart.properties(), context);
      }
    }
  });
}
 
Example 6
Source File: Spark3Shims.java    From zeppelin with Apache License 2.0
public void setupSparkListener(final String master,
                               final String sparkWebUrl,
                               final InterpreterContext context) {
  SparkContext sc = SparkContext.getOrCreate();
  sc.addSparkListener(new SparkListener() {
    @Override
    public void onJobStart(SparkListenerJobStart jobStart) {

      if (sc.getConf().getBoolean("spark.ui.enabled", true) &&
          !Boolean.parseBoolean(properties.getProperty("zeppelin.spark.ui.hidden", "false"))) {
        buildSparkJobUrl(master, sparkWebUrl, jobStart.jobId(), jobStart.properties(), context);
      }
    }
  });
}
 
Example 7
Source File: Spark2Shims.java    From zeppelin with Apache License 2.0
public void setupSparkListener(final String master,
                               final String sparkWebUrl,
                               final InterpreterContext context) {
  SparkContext sc = SparkContext.getOrCreate();
  sc.addSparkListener(new SparkListener() {
    @Override
    public void onJobStart(SparkListenerJobStart jobStart) {

      if (sc.getConf().getBoolean("spark.ui.enabled", true) &&
          !Boolean.parseBoolean(properties.getProperty("zeppelin.spark.ui.hidden", "false"))) {
        buildSparkJobUrl(master, sparkWebUrl, jobStart.jobId(), jobStart.properties(), context);
      }
    }
  });
}
 
Example 8
Source File: Spark.java    From tinkerpop with Apache License 2.0
public static SparkContext create(final SparkConf sparkConf) {
    if (isContextNullOrStopped()) {
        sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
        CONTEXT = SparkContext.getOrCreate(sparkConf);
    }
    return CONTEXT;
}
 
Example 9
Source File: Spark.java    From tinkerpop with Apache License 2.0
public static SparkContext recreateStopped() {
    if (null == CONTEXT)
        throw new IllegalStateException("The Spark context has not been created.");
    if (!CONTEXT.isStopped())
        throw new IllegalStateException("The Spark context is not stopped.");
    CONTEXT = SparkContext.getOrCreate(CONTEXT.getConf());
    return CONTEXT;
}
 
Example 10
Source File: LocalPropertyTest.java    From tinkerpop with Apache License 2.0
@Test
public void shouldSetThreadLocalProperties() throws Exception {
    final String testName = "ThreadLocalProperties";
    final String rddName = TestHelper.makeTestDataDirectory(LocalPropertyTest.class, UUID.randomUUID().toString());
    final Configuration configuration = new BaseConfiguration();
    configuration.setProperty("spark.master", "local[4]");
    configuration.setProperty("spark.serializer", GryoSerializer.class.getCanonicalName());
    configuration.setProperty(Graph.GRAPH, HadoopGraph.class.getName());
    configuration.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, SparkHadoopGraphProvider.PATHS.get("tinkerpop-modern-v3d0.kryo"));
    configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, GryoInputFormat.class.getCanonicalName());
    configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER, PersistedOutputRDD.class.getCanonicalName());
    configuration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, rddName);
    configuration.setProperty(Constants.GREMLIN_HADOOP_JARS_IN_DISTRIBUTED_CACHE, false);
    configuration.setProperty(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, true);
    configuration.setProperty("spark.jobGroup.id", "22");
    Graph graph = GraphFactory.open(configuration);
    graph.compute(SparkGraphComputer.class)
            .result(GraphComputer.ResultGraph.NEW)
            .persist(GraphComputer.Persist.EDGES)
            .program(TraversalVertexProgram.build()
                    .traversal(graph.traversal().withComputer(Computer.compute(SparkGraphComputer.class)),
                            "gremlin-groovy",
                            "g.V()").create(graph)).submit().get();
    ////////
    SparkConf sparkConfiguration = new SparkConf();
    sparkConfiguration.setAppName(testName);
    ConfUtil.makeHadoopConfiguration(configuration).forEach(entry -> sparkConfiguration.set(entry.getKey(), entry.getValue()));
    JavaSparkContext sparkContext = new JavaSparkContext(SparkContext.getOrCreate(sparkConfiguration));
    JavaSparkStatusTracker statusTracker = sparkContext.statusTracker();
    assertTrue(statusTracker.getJobIdsForGroup("22").length >= 1);
    assertTrue(Spark.hasRDD(Constants.getGraphLocation(rddName)));
    ///////
    configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, PersistedInputRDD.class.getCanonicalName());
    configuration.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, rddName);
    configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER, null);
    configuration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null);

    // just a note that this value should have always been set to true, but from the initial commit was false.
    // interestingly the last assertion had always passed up to spark 2.3.x when it started to fail. apparently
    // that assertion should likely have never passed, so it stands to reason that there was a bug in spark in
    // 2.2.x that was resolved for 2.3.x....that's my story and i'm sticking to it.
    configuration.setProperty(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, true);
    configuration.setProperty("spark.jobGroup.id", "44");
    graph = GraphFactory.open(configuration);
    graph.compute(SparkGraphComputer.class)
            .result(GraphComputer.ResultGraph.NEW)
            .persist(GraphComputer.Persist.NOTHING)
            .program(TraversalVertexProgram.build()
                    .traversal(graph.traversal().withComputer(SparkGraphComputer.class),
                            "gremlin-groovy",
                            "g.V()").create(graph)).submit().get();
    ///////
    assertTrue(statusTracker.getJobIdsForGroup("44").length >= 1);
}