Java Code Examples for org.apache.spark.sql.SparkSession#stop()

The following examples show how to use org.apache.spark.sql.SparkSession#stop(). The original project and source file for each example are identified in the header above its listing.
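Before the project examples, here is a minimal sketch of the underlying pattern: create (or reuse) a session with getOrCreate(), do the work, and call stop() in a finally block so the driver's resources and the underlying SparkContext are released even when the job throws. The app name, master setting, and workload below are illustrative and not taken from any of the projects.

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("StopSessionSketch")  // illustrative name
    .master("local[*]")            // illustrative; often supplied by spark-submit instead
    .getOrCreate();

  try {
    // any work against the session goes here
    spark.range(10).show();
  } finally {
    // always release the session, even if the work above throws
    spark.stop();
  }
}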
Example 1
Source File: JavaRandomForestClassifierExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaRandomForestClassifierExample")
    .getOrCreate();

  // $example on$
  // Load and parse the data file, converting it to a DataFrame.
  Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

  // Index labels, adding metadata to the label column.
  // Fit on whole dataset to include all labels in index.
  StringIndexerModel labelIndexer = new StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data);
  // Automatically identify categorical features, and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(data);

  // Split the data into training and test sets (30% held out for testing)
  Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a RandomForest model.
  RandomForestClassifier rf = new RandomForestClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures");

  // Convert indexed labels back to original labels.
  IndexToString labelConverter = new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexer.labels());

  // Chain indexers and forest in a Pipeline
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter});

  // Train model. This also runs the indexers.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("predictedLabel", "label", "features").show(5);

  // Select (prediction, true label) and compute test error
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1.0 - accuracy));

  RandomForestClassificationModel rfModel = (RandomForestClassificationModel)(model.stages()[2]);
  System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
  // $example off$

  spark.stop();
}
 
Example 2
Source File: JavaSQLTransformerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaSQLTransformerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 1.0, 3.0),
    RowFactory.create(2, 2.0, 5.0)
  );
  StructType schema = new StructType(new StructField[] {
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("v2", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  SQLTransformer sqlTrans = new SQLTransformer().setStatement(
    "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");

  sqlTrans.transform(df).show();
  // $example off$

  spark.stop();
}
 
Example 3
Source File: TestSparkDataFile.java    From iceberg with Apache License 2.0
@AfterClass
public static void stopSpark() {
  SparkSession currentSpark = TestSparkDataFile.spark;
  TestSparkDataFile.spark = null;
  TestSparkDataFile.sparkContext = null;
  currentSpark.stop();
}
 
Example 4
Source File: JavaLinearRegressionWithElasticNetExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaLinearRegressionWithElasticNetExample")
    .getOrCreate();

  // $example on$
  // Load training data.
  Dataset<Row> training = spark.read().format("libsvm")
    .load("data/mllib/sample_linear_regression_data.txt");

  LinearRegression lr = new LinearRegression()
    .setMaxIter(10)
    .setRegParam(0.3)
    .setElasticNetParam(0.8);

  // Fit the model.
  LinearRegressionModel lrModel = lr.fit(training);

  // Print the coefficients and intercept for linear regression.
  System.out.println("Coefficients: "
    + lrModel.coefficients() + " Intercept: " + lrModel.intercept());

  // Summarize the model over the training set and print out some metrics.
  LinearRegressionTrainingSummary trainingSummary = lrModel.summary();
  System.out.println("numIterations: " + trainingSummary.totalIterations());
  System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
  trainingSummary.residuals().show();
  System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
  System.out.println("r2: " + trainingSummary.r2());
  // $example off$

  spark.stop();
}
 
Example 5
Source File: FromRowsAndSchema.java    From learning-spark-with-java with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("DataFrame-FromRowsAndSchema")
        .master("local[4]")
        .getOrCreate();

    List<Row> customerRows = Arrays.asList(
        RowFactory.create(1, "Widget Co", 120000.00, 0.00, "AZ"),
        RowFactory.create(2, "Acme Widgets", 410500.00, 500.00, "CA"),
        RowFactory.create(3, "Widgetry", 410500.00, 200.00, "CA"),
        RowFactory.create(4, "Widgets R Us", 410500.00, 0.0, "CA"),
        RowFactory.create(5, "Ye Olde Widgete", 500.00, 0.0, "MA")
    );

    List<StructField> fields = Arrays.asList(
        DataTypes.createStructField("id", DataTypes.IntegerType, true),
        DataTypes.createStructField("name", DataTypes.StringType, true),
        DataTypes.createStructField("sales", DataTypes.DoubleType, true),
        DataTypes.createStructField("discount", DataTypes.DoubleType, true),
        DataTypes.createStructField("state", DataTypes.StringType, true)
    );
    StructType customerSchema = DataTypes.createStructType(fields);

    Dataset<Row> customerDF =
        spark.createDataFrame(customerRows, customerSchema);

    System.out.println("*** the schema created");
    customerDF.printSchema();

    System.out.println("*** the data");
    customerDF.show();

    System.out.println("*** just the rows from CA");
    customerDF.filter(col("state").equalTo("CA")).show();

    spark.stop();
}
 
Example 6
Source File: JavaMinHashLSHExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMinHashLSHExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})),
    RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})),
    RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0}))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("keys", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  MinHashLSH mh = new MinHashLSH()
    .setNumHashTables(1)
    .setInputCol("keys")
    .setOutputCol("values");

  MinHashLSHModel model = mh.fit(dataFrame);
  model.transform(dataFrame).show();
  // $example off$

  spark.stop();
}
 
Example 7
Source File: JavaTfIdfExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaTfIdfExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0.0, "Hi I heard about Spark"),
    RowFactory.create(0.0, "I wish Java could use case classes"),
    RowFactory.create(1.0, "Logistic regression models are neat")
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
  });
  Dataset<Row> sentenceData = spark.createDataFrame(data, schema);

  Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
  Dataset<Row> wordsData = tokenizer.transform(sentenceData);

  int numFeatures = 20;
  HashingTF hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("rawFeatures")
    .setNumFeatures(numFeatures);

  Dataset<Row> featurizedData = hashingTF.transform(wordsData);
  // alternatively, CountVectorizer can also be used to get term frequency vectors

  IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
  IDFModel idfModel = idf.fit(featurizedData);

  Dataset<Row> rescaledData = idfModel.transform(featurizedData);
  rescaledData.select("label", "features").show();
  // $example off$

  spark.stop();
}
 
Example 8
Source File: JavaVectorSlicerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaVectorSlicerExample")
    .getOrCreate();

  // $example on$
  Attribute[] attrs = new Attribute[]{
    NumericAttribute.defaultAttr().withName("f1"),
    NumericAttribute.defaultAttr().withName("f2"),
    NumericAttribute.defaultAttr().withName("f3")
  };
  AttributeGroup group = new AttributeGroup("userFeatures", attrs);

  List<Row> data = Lists.newArrayList(
    RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
    RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
  );

  Dataset<Row> dataset =
    spark.createDataFrame(data, (new StructType()).add(group.toStructField()));

  VectorSlicer vectorSlicer = new VectorSlicer()
    .setInputCol("userFeatures").setOutputCol("features");

  vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
  // or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{"f2", "f3"})

  Dataset<Row> output = vectorSlicer.transform(dataset);
  output.show(false);
  // $example off$

  spark.stop();
}
 
Example 9
Source File: JavaLogisticRegressionWithElasticNetExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaLogisticRegressionWithElasticNetExample")
    .getOrCreate();

  // $example on$
  // Load training data
  Dataset<Row> training = spark.read().format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt");

  LogisticRegression lr = new LogisticRegression()
    .setMaxIter(10)
    .setRegParam(0.3)
    .setElasticNetParam(0.8);

  // Fit the model
  LogisticRegressionModel lrModel = lr.fit(training);

  // Print the coefficients and intercept for logistic regression
  System.out.println("Coefficients: "
    + lrModel.coefficients() + " Intercept: " + lrModel.intercept());

  // We can also use the multinomial family for binary classification
  LogisticRegression mlr = new LogisticRegression()
          .setMaxIter(10)
          .setRegParam(0.3)
          .setElasticNetParam(0.8)
          .setFamily("multinomial");

  // Fit the model
  LogisticRegressionModel mlrModel = mlr.fit(training);

  // Print the coefficients and intercepts for logistic regression with multinomial family
  System.out.println("Multinomial coefficients: " + lrModel.coefficientMatrix()
    + "\nMultinomial intercepts: " + mlrModel.interceptVector());
  // $example off$

  spark.stop();
}
 
Example 10
Source File: JavaNGramExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaNGramExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
    RowFactory.create(1, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
    RowFactory.create(2, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField(
      "words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
  });

  Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema);

  NGram ngramTransformer = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams");

  Dataset<Row> ngramDataFrame = ngramTransformer.transform(wordDataFrame);
  ngramDataFrame.select("ngrams").show(false);
  // $example off$

  spark.stop();
}
 
Example 11
Source File: TestSparkTableUtilWithInMemoryCatalog.java    From iceberg with Apache License 2.0
@AfterClass
public static void stopSpark() {
  SparkSession currentSpark = TestSparkTableUtilWithInMemoryCatalog.spark;
  TestSparkTableUtilWithInMemoryCatalog.spark = null;
  currentSpark.stop();
}
 
Example 12
Source File: SparkSessionUtil.java    From jpmml-sparkml with GNU Affero General Public License v3.0
static
public SparkSession destroySparkSession(SparkSession sparkSession){
	sparkSession.stop();

	return null;
}
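Because the helper returns null, a caller can stop the session and clear its own reference in one statement. A minimal usage sketch follows; the static field and JUnit @AfterClass hook are assumptions for illustration, not part of jpmml-sparkml.

private static SparkSession sparkSession;  // hypothetical field holding the shared test session

@AfterClass
public static void tearDownSparkSession() {
  // stops the session and nulls the reference in a single assignment
  sparkSession = SparkSessionUtil.destroySparkSession(sparkSession);
}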
 
Example 13
Source File: TestOrcScan.java    From iceberg with Apache License 2.0
@AfterClass
public static void stopSpark() {
  SparkSession spark = TestOrcScan.spark;
  TestOrcScan.spark = null;
  spark.stop();
}
 
Example 14
Source File: JReadPartitionAware_Mismatch.java    From spark-data-sources with MIT License
public static void main(String[] args)
        throws IOException, InterruptedException,
        ExistingTableException, UnknownTableException
{

    final String serverHost = "localhost";
    final int serverPort = 50199;

    DBServer server = new DBServer(serverPort);
    server.start();

    System.out.println("*** Example database server started");

    //
    // Since this DataSource doesn't support writing, we need to populate
    // ExampleDB with some data.
    //

    Schema schema = new Schema();
    schema.addColumn("g", Schema.ColumnType.STRING);
    schema.addColumn("u", Schema.ColumnType.INT64);


    DBClient client = new DBClient(serverHost, serverPort);
    //
    // This time the table is not clustered on any column
    //
    client.createTable("myTable", schema);

    List<edb.common.Row> toInsert = new ArrayList<>();
    for (int i = 0; i < 20; i++) {
        edb.common.Row r = new edb.common.Row();
        //
        // String column with four distinct values for clustering
        //
        r.addField(new edb.common.Row.StringField("g", "G_" + (i % 4)));
        r.addField(new edb.common.Row.Int64Field("u", i * 100));

        toInsert.add(r);
    }

    client.bulkInsert("myTable", toInsert);

    System.out.println("*** Example database server populated with data");

    //
    // By default this data source supports creating Datasets with four partitions.
    //
    String dataSourceName = "datasources.PartitioningRowDataSource";

    SparkSession spark = SparkSession
            .builder()
            .appName("JReadPartitionAware-Mismatch")
            .master("local[4]")
            .getOrCreate();

    //
    // This is where we read from our DataSource. Notice how we use the
    // fully qualified class name and provide the information needed to connect to
    // ExampleDB using options. We specify two partitions so that each can be expected
    // to contain two clusters. But the table wasn't set up with the column clustered, so
    // a shuffle will be needed.
    //
    Dataset<Row> data = spark.read()
            .format(dataSourceName)
            .option("host", serverHost)
            .option("port", serverPort)
            .option("table", "myTable")
            .option("partitions", 2) // number of partitions specified here
            .load();

    System.out.println("*** Schema: ");
    data.printSchema();

    System.out.println("*** Data: ");
    data.show();

    RDDUtils.analyze(data);

    Dataset<Row> aggregated = data.groupBy(col("g")).agg(sum(col("u")));

    //
    // Note: since a shuffle was required, the resulting table has the usual default
    // number of partitions -- 200 as of Spark 2.3.0
    //
    System.out.println("*** Query result: ");
    aggregated.show();

    RDDUtils.analyze(aggregated);

    spark.stop();

    server.stop();
}
 
Example 15
Source File: TestSparkDataWrite.java    From iceberg with Apache License 2.0
@AfterClass
public static void stopSpark() {
  SparkSession currentSpark = TestSparkDataWrite.spark;
  TestSparkDataWrite.spark = null;
  currentSpark.stop();
}
 
Example 16
Source File: TestAvroScan.java    From iceberg with Apache License 2.0
@AfterClass
public static void stopSpark() {
  SparkSession currentSpark = TestAvroScan.spark;
  TestAvroScan.spark = null;
  currentSpark.stop();
}
 
Example 17
Source File: TestAvroScan.java    From iceberg with Apache License 2.0
@AfterClass
public static void stopSpark() {
  SparkSession spark = TestAvroScan.spark;
  TestAvroScan.spark = null;
  spark.stop();
}
 
Example 18
Source File: Basic.java    From learning-spark-with-java with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Pairs-Basic")
      .master("local[4]")
      .getOrCreate();

  JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

  List<Tuple2<String, Integer>> pairs =
      Arrays.asList(
          new Tuple2<>("1",9), new Tuple2<>("1",2), new Tuple2<>("1",1),
          new Tuple2<>("2",3), new Tuple2<>("2",4), new Tuple2<>("3",1),
          new Tuple2<>("3",5), new Tuple2<>("6",2), new Tuple2<>("6",1),
          new Tuple2<>("6",4), new Tuple2<>("8",1));

  // a randomly partitioned pair RDD
  JavaPairRDD<String, Integer> pairsRDD = sc.parallelizePairs(pairs, 4);

  System.out.println("*** the original pairs");
  pairsRDD.foreach(i -> System.out.println(i));

  //
  // Pairs can be collected as a Map, but this only works well if the
  // keys are unique. Here they aren't, so an arbitrary value is chosen for each key:
  //
  Map<String, Integer> pairsAsMap = pairsRDD.collectAsMap();
  System.out.println("*** the pretty useless map");
  System.out.println(pairsAsMap);

  // let's say we just want the pair with minimum value for each key
  // we can use one of the handy methods in PairRDDFunctions. To reduce we need
  // only supply a single function to combine all the values for each key -- the result
  // has to have the same type as the values
  JavaPairRDD<String, Integer> reducedRDD = pairsRDD.reduceByKey(Math::min);

  System.out.println("*** the reduced pairs");
  reducedRDD.foreach(i -> System.out.println(i));

  // the reduced pairs have unique keys so collecting to a map works a lot better
  Map<String, Integer> reducedAsMap = reducedRDD.collectAsMap();
  System.out.println("*** the reduced pairs as a map");
  System.out.println(reducedAsMap);

  // folding is a little more general: we get to specify the identity value:
  // say 0 for adding and 1 for multiplying
  JavaPairRDD<String, Integer> foldedRDD =
      pairsRDD.foldByKey(1, (x, y) -> x * y);

  System.out.println("*** the folded pairs");
  foldedRDD.foreach(i -> System.out.println(i));

  // Combining is more general: you can produce values of a different type, which is very powerful.
  // You need to provide three functions: the first converts an individual value to the new type, the second
  // incorporates an additional value into the result, and the third combines intermediate results, which is
  // used during execution to avoid excessive communication between partitions. The first function is applied
  // to the first value seen for each key in a partition, and the second to each additional value of that key.
  // Below is a pretty classical example of its use: compute a per-key average by first computing the sum and count
  // for each key and then dividing.
  JavaPairRDD<String, Tuple2<Integer, Integer>> combinedRDD =
      pairsRDD.combineByKey(
          value -> new Tuple2<>(value, 1),
          (sumAndCount, value) -> new Tuple2<>(sumAndCount._1() + value, sumAndCount._2() + 1),
          (sumAndCount1, sumAndCount2) ->
              new Tuple2<>(sumAndCount1._1() + sumAndCount2._1(), sumAndCount1._2() + sumAndCount2._2())
      );

  JavaPairRDD<String, Double> averageRDD =
      combinedRDD.mapValues(sumAndCount -> (double) sumAndCount._1() / sumAndCount._2());

  System.out.println("*** the average pairs");
  averageRDD.foreach(i -> System.out.println(i));

  // The dividing could be done just by calling map, but in Java this requires a lot of conversion between the
  // two kinds of RDD and ends up *VERY* cumbersome.
  JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> tupleCombinedRDD =
      JavaRDD.fromRDD(combinedRDD.rdd(), combinedRDD.classTag());
  JavaRDD<Tuple2<String, Double>> tupleDividedRDD = tupleCombinedRDD.map(keyAndsumAndCount ->
      new Tuple2<>(keyAndsumAndCount._1(), (double) keyAndsumAndCount._2()._1() / keyAndsumAndCount._2()._2()));
  JavaPairRDD<String, Double> averageRDDtheHardWay = JavaPairRDD.fromJavaRDD(tupleDividedRDD);

  // remember these won't necessarily come out in the same order, so they may not obviously be
  // the same as above
  System.out.println("*** the average pairs the hard way");
  averageRDDtheHardWay.foreach(i -> System.out.println(i));

  spark.stop();
}
 
Example 19
Source File: TestSnapshotSelection.java    From iceberg with Apache License 2.0
@AfterClass
public static void stopSpark() {
  SparkSession currentSpark = TestSnapshotSelection.spark;
  TestSnapshotSelection.spark = null;
  currentSpark.stop();
}
 
Example 20
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@AfterClass
public static void stopSpark() {
  SparkSession currentSpark = TestFilteredScan.spark;
  TestFilteredScan.spark = null;
  currentSpark.stop();
}