Java Code Examples for org.apache.spark.sql.Dataset#select()

The following examples show how to use org.apache.spark.sql.Dataset#select(). Each example is taken from an open-source project; the source file, project, and license are listed above each snippet.
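Before looking at the project code, here is a minimal, self-contained sketch of the three main ways to call select(): by column name, by Column expression, and via selectExpr(). The tiny dataset and column names below are made up purely for illustration.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;

public class SelectVariants {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("SelectVariants")
        .getOrCreate();

    // Hypothetical two-column dataset used only for illustration.
    Dataset<Row> df = spark.range(5).selectExpr("id", "id * 2 AS doubled");

    // 1) select by column name (String varargs)
    df.select("id", "doubled").show();

    // 2) select by Column expressions (allows renaming, arithmetic, etc.)
    df.select(col("id"), col("doubled").plus(1).as("doubledPlusOne")).show();

    // 3) selectExpr takes SQL expression strings
    df.selectExpr("id", "doubled + 1 AS doubledPlusOne").show();

    spark.stop();
  }
}
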
Example 1
Source File: ProjectRestrictMapFunction.java    From spliceengine with GNU Affero General Public License v3.0
    @Override
    public Pair<Dataset<Row>, OperationContext> nativeTransformation(Dataset<Row> input, OperationContext context) {
        ProjectRestrictOperation op = (ProjectRestrictOperation) operationContext.getOperation();
        Dataset<Row> df = null;
        // TODO:  Enable the commented try-catch block after regression testing.
        //        This would be a safeguard against unanticipated exceptions:
        //             org.apache.spark.sql.catalyst.parser.ParseException
        //             org.apache.spark.sql.AnalysisException
        //    ... which may occur if the Splice parser fails to detect a
        //        SQL expression which SparkSQL does not support.
        if (op.hasExpressions()) {
//      try {
            df = input.selectExpr(op.getExpressions());
            return Pair.newPair(df, context);
//        }
//        catch (Exception e) {
//        }
        }
        int[] mapping = op.projectMapping;
        Column[] columns = new Column[mapping.length];
        for (int i = 0; i < mapping.length; ++i) {
            columns[i] = input.col("c" + (mapping[i] - 1));
        }
        df = input.select(columns);
        return Pair.newPair(df, context);
    }
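
A follow-up note on the selectExpr() branch above: when only some of the projected columns are SQL expressions, select() can mix plain Column references with org.apache.spark.sql.functions.expr(). The column names in this sketch (c0, c1) are assumptions for illustration, not part of the Splice Machine code.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.expr;

public class MixedProjection {
    // Roughly equivalent to input.selectExpr("c0", "upper(c1) AS c1_upper"),
    // but keeps c0 as a plain Column reference.
    public static Dataset<Row> project(Dataset<Row> input) {
        return input.select(
                col("c0"),
                expr("upper(c1) AS c1_upper"));
    }
}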
 
Example 2
Source File: SparkPredictionServiceRunner.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
public Dataset<Row> run(Dataset dataset) {

        //only use configured variables for pipeline
        Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(this.sparkRunnerConfig);
        List<String> predictionVars = configuration.getModelPredictionConfiguration().getPredictionVariables();
        List<Column> usedColumns = new ArrayList<>();
        for(String var : predictionVars) {
            usedColumns.add(new Column(var));
        }
        dataset = dataset.select(BpmnaiUtils.getInstance().asSeq(usedColumns));

        // go through the pipeline elements and define the processing steps to run
        final PreprocessingRunner preprocessingRunner = new PreprocessingRunner();

        for(PipelineStep ps : pipelineManager.getOrderedPipeline()) {
            preprocessingRunner.addPreprocessorStep(ps);
        }

        // Run processing runner
        Dataset<Row> resultDataset = preprocessingRunner.run(dataset, this.sparkRunnerConfig);

        writeConfig();

        return resultDataset;
    }
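
The example above relies on a project-specific BpmnaiUtils helper to turn the Java List<Column> into a Scala Seq. If no such helper is at hand, converting the list to an array and using the Column... overload of select() achieves the same projection. This is a hedged sketch of just that step; the method and variable names are hypothetical.

import java.util.List;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class ColumnListSelect {
    // Selects the named columns without converting to a Scala Seq.
    public static Dataset<Row> selectColumns(Dataset<Row> dataset, List<String> names) {
        Column[] columns = names.stream()
                .map(dataset::col)
                .toArray(Column[]::new);
        return dataset.select(columns);
    }
}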
 
Example 3
Source File: ParseJSONDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  String parsedStructTemporaryFieldName = "__parsed_json";

  Dataset<Row> dependency = dependencies.get(stepName);

  Dataset<Row> parsed = dependency.select(
      functions.from_json(new Column(fieldName), schema, options).as(parsedStructTemporaryFieldName));

  if (asStruct) {
    return parsed.withColumnRenamed(parsedStructTemporaryFieldName, structFieldName);
  }
  else {
    for (StructField parsedField : schema.fields()) {
      parsed = parsed.withColumn(
          parsedField.name(), new Column(parsedStructTemporaryFieldName + "." + parsedField.name()));
    }

    return parsed.drop(parsedStructTemporaryFieldName);
  }
}
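
When the parsed struct should simply be flattened rather than renamed field by field, selecting the struct with a ".*" suffix expands all of its fields in one call. The sketch below makes the same assumptions as the deriver above (a string column holding JSON and a known StructType schema), but it is not part of Envelope.

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.StructType;

public class FlattenParsedJson {
    // Parses a JSON string column and expands every parsed field into a top-level column.
    public static Dataset<Row> parseAndFlatten(Dataset<Row> input, String jsonField, StructType schema) {
        Dataset<Row> parsed = input.select(
                functions.from_json(new Column(jsonField), schema).as("__parsed_json"));
        // "__parsed_json.*" selects all fields of the struct at once.
        return parsed.select("__parsed_json.*");
    }
}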
 
Example 4
Source File: AbstractConceptMaps.java    From bunsen with Apache License 2.0
/**
 * Writes mapping records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param mappings a dataset of mapping records
 * @param tableName the table to write them to
 */
private static void writeMappingsToTable(Dataset<Mapping> mappings,
    String tableName) {

  // Note the last two columns here must be the partitioned-by columns
  // in order and in lower case for Spark to properly match
  // them to the partitions.
  Dataset<Row> orderedColumnDataset =
      mappings.select("sourceValueSet",
          "targetValueSet",
          "sourceSystem",
          "sourceValue",
          "targetSystem",
          "targetValue",
          "equivalence",
          "conceptmapuri",
          "conceptmapversion");

  orderedColumnDataset
      .write()
      .insertInto(tableName);
}
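
Because insertInto() resolves columns by position rather than by name, hard-coding the column order (as above) can silently break if the table definition changes. A hedged alternative, not taken from Bunsen, is to read the target table's column order at runtime and build the select() from it:

import java.util.Arrays;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class PositionalInsert {
    // Reorders the dataset's columns to match the target table, then appends the rows.
    // insertInto() matches columns by position, so the select() order must equal the table's.
    public static void insertByTableOrder(SparkSession spark, Dataset<Row> data, String tableName) {
        String[] tableColumns = spark.table(tableName).columns();
        Dataset<Row> ordered = data.select(
                tableColumns[0],
                Arrays.copyOfRange(tableColumns, 1, tableColumns.length));
        ordered.write().insertInto(tableName);
    }
}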
 
Example 5
Source File: AdvancedSearchDataset.java    From mmtf-spark with Apache License 2.0
private static Dataset<Row> getEntityToChainId() throws IOException {
    // get entityID to strandId mapping
    String query = "SELECT pdbid, entity_id, pdbx_strand_id FROM entity_poly";
    Dataset<Row> mapping = PdbjMineDataset.getDataset(query);

    // split one-to-many relationships into multiple records, e.g., 'A,B' -> [A, B] -> explode into separate rows
    mapping = mapping.withColumn("chainId", split(mapping.col("pdbx_strand_id"), ","));
    mapping = mapping.withColumn("chainId", explode(col("chainId")));

    // create a structureChainId column, e.g. 1XYZ + A -> 1XYZ.A
    mapping = mapping.withColumn("structureChainId", concat_ws(".", col("structureId"), col("chainId")));

    return mapping.select("entity_id", "structureId", "structureChainId");
}
 
Example 6
Source File: RDDConverterUtilsExt.java    From systemds with Apache License 2.0
public static Dataset<Row> projectColumns(Dataset<Row> df, ArrayList<String> columns) {
	ArrayList<String> columnToSelect = new ArrayList<String>();
	for(int i = 1; i < columns.size(); i++) {
		columnToSelect.add(columns.get(i));
	}
	return df.select(columns.get(0), scala.collection.JavaConversions.asScalaBuffer(columnToSelect).toList());
}
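
Note that scala.collection.JavaConversions is deprecated (and removed in Scala 2.13), so the conversion above may not compile against newer Spark builds. The same projection can be written with only the Java-friendly select(String, String...) overload; this sketch assumes the list contains at least one column name.

import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class ProjectColumnsAlternative {
    // Selects the listed columns using the String varargs overload of select().
    public static Dataset<Row> projectColumns(Dataset<Row> df, List<String> columns) {
        // The first name goes into the required argument, the rest into the varargs.
        String[] rest = columns.subList(1, columns.size()).toArray(new String[0]);
        return df.select(columns.get(0), rest);
    }
}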
 
Example 7
Source File: SelectDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  dependencyCheck(dependencies);
  Dataset<Row> sourceStep = dependencies.get(stepName);
  if (useIncludeFields){
      if (!Arrays.asList(sourceStep.columns()).containsAll(includeFields)){
          throw new RuntimeException("Columns specified in " + INCLUDE_FIELDS + " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
      }
      String firstCol = includeFields.get(0);
      includeFields.remove(0);
      return sourceStep.select(firstCol, includeFields.toArray(new String[0]));
  } else {
      if (!Arrays.asList(sourceStep.columns()).containsAll(excludeFields)){
          throw new RuntimeException("Columns specified in " + EXCLUDE_FIELDS + " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
      }
      return sourceStep.drop(JavaConverters.collectionAsScalaIterableConverter(excludeFields).asScala().toSeq());
  }
}
 
Example 8
Source File: Hierarchies.java    From bunsen with Apache License 2.0
/**
 * Writes ancestor records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param ancestors a dataset of ancestor records
 * @param tableName the table to write them to
 */
private static void writeAncestorsToTable(Dataset<Ancestor> ancestors, String tableName) {

  Dataset<Row> orderedColumnDataset = ancestors.select("descendantSystem",
      "descendantValue",
      "ancestorSystem",
      "ancestorValue",
      "uri",
      "version");

  orderedColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
 
Example 9
Source File: AbstractValueSets.java    From bunsen with Apache License 2.0
/**
 * Writes value records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param values a dataset of value records
 * @param tableName the table to write them to
 */
private static void writeValuesToTable(Dataset<Value> values, String tableName) {

  // Note the last two columns here must be the partitioned-by columns in order and in lower case
  // for Spark to properly match them to the partitions
  Dataset<Row> orderColumnDataset = values.select("system",
      "version",
      "value",
      "valueseturi",
      "valuesetversion");

  orderColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
 
Example 10
Source File: SparkMLHouses.java    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

                System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

                // * the schema can be written on disk, and read from disk
                // * the schema is not mandatory to be complete, it can contain only the needed fields    
                StructType HOUSES_SCHEMA = 
                       new StructType()
                           .add("House", LongType, true)
                           .add("Taxes", LongType, true)
                           .add("Bedrooms", LongType, true)
                           .add("Baths", FloatType, true)
                           .add("Quadrant", LongType, true)
                           .add("NW", StringType, true)
                           .add("Price($)", LongType, false)
                           .add("Size(sqft)", LongType, false)
                           .add("lot", LongType, true);

                final SparkConf conf = new SparkConf()
                    .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                    .setAppName(APPLICATION_NAME)
                    .set("spark.sql.caseSensitive", CASE_SENSITIVE);

                SparkSession sparkSession = SparkSession.builder()
                    .config(conf)
                    .getOrCreate();

                Dataset<Row> housesDF = sparkSession.read()
                     .schema(HOUSES_SCHEMA)
                     .json(HOUSES_FILE_PATH);
             
                // Gathering Data				
                Dataset<Row> gatheredDF = housesDF.select(col("Taxes"), 
                    col("Bedrooms"), col("Baths"),
                    col("Size(sqft)"), col("Price($)"));
                
                // Data Preparation  
                Dataset<Row> labelDF = gatheredDF.withColumnRenamed("Price($)", "label");
                
                Imputer imputer = new Imputer()
                    // .setMissingValue(1.0d)
                    .setInputCols(new String[] { "Baths" })
                    .setOutputCols(new String[] { "~Baths~" });

                VectorAssembler assembler = new VectorAssembler()
                    .setInputCols(new String[] { "Taxes", "Bedrooms", "~Baths~", "Size(sqft)" })
                    .setOutputCol("features");
                
                // Choosing a Model               
                LinearRegression linearRegression = new LinearRegression();
                linearRegression.setMaxIter(1000);

                Pipeline pipeline = new Pipeline()
                                .setStages(new PipelineStage[] {
                                    imputer, assembler, linearRegression 
                                });

                // Training The Data
                Dataset<Row>[] splitDF = labelDF.randomSplit(new double[] { 0.8, 0.2 });

                Dataset<Row> trainDF = splitDF[0];
                Dataset<Row> evaluationDF = splitDF[1];

                PipelineModel pipelineModel = pipeline.fit(trainDF);
                
                // Evaluation 
                Dataset<Row> predictionsDF = pipelineModel.transform(evaluationDF);

                predictionsDF.show(false);

                Dataset<Row> forEvaluationDF = predictionsDF.select(col("label"), 
                    col("prediction"));

                RegressionEvaluator evaluteR2 = new RegressionEvaluator().setMetricName("r2");
                RegressionEvaluator evaluteRMSE = new RegressionEvaluator().setMetricName("rmse");

                double r2 = evaluteR2.evaluate(forEvaluationDF);
                double rmse = evaluteRMSE.evaluate(forEvaluationDF);

                logger.info("---------------------------");
                logger.info("R2 =" + r2);
                logger.info("RMSE =" + rmse);
                logger.info("---------------------------");
        }
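
One detail worth calling out in the example above: column names such as "Price($)" and "Size(sqft)" contain characters that are not valid in SQL identifiers. They work as-is with col() and select(String...), but inside selectExpr() or SQL strings they must be wrapped in backticks. A small self-contained sketch (the data is invented):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;

public class SpecialColumnNames {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("SpecialColumnNames")
                .getOrCreate();

        // Tiny stand-in for the houses dataset; only the column names matter here.
        Dataset<Row> houses = spark.range(3)
                .selectExpr("id AS `Price($)`", "id * 100 AS `Size(sqft)`");

        // col() accepts the raw name directly.
        houses.select(col("Price($)"), col("Size(sqft)")).show();

        // selectExpr() needs backticks around names that are not plain identifiers.
        houses.selectExpr("`Price($)` AS label", "`Size(sqft)`").show();

        spark.stop();
    }
}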
 
Example 11
Source File: AdvancedSearchDataset.java    From mmtf-spark with Apache License 2.0
/**
 * Runs an RCSB PDB Advanced Search web service using an XML query description.
 * The returned dataset contains the following fields dependent on the query type:
 * <pre> 
 *   structureId, e.g., 1STP
 *   structureChainId, e.g., 4HHB.A
 *   ligandId, e.g., HEM
 * </pre>
 *   
 * @param xmlQuery RCSB PDB Advanced Query XML
 * @return dataset of ids
 * @throws IOException
 */
public static Dataset<Row> getDataset(String xmlQuery) throws IOException {
    // run advanced query
    List<String> results = AdvancedQueryService.postQuery(xmlQuery);

    // convert list of lists to a dataframe
    SparkSession spark = SparkSession.builder().getOrCreate();

    // handle 3 types of results based on the length of the id string:
    //   structureId: 4 characters (e.g., 4HHB)
    //   structureEntityId: > 4 characters (e.g., 4HHB:1)
    //   ligandId: < 4 characters (e.g., HEM)
    Dataset<Row> ds = null;
    if (results.size() > 0) {
        if (results.get(0).length() > 4) {
            ds = spark.createDataset(results, Encoders.STRING()).toDF("structureEntityId");
        
            // if results contain an entity id, e.g., 101M:1, then map entityId to structureChainId
            ds = ds.withColumn("structureId", substring_index(col("structureEntityId"), ":", 1));
            ds = ds.withColumn("entityId", substring_index(col("structureEntityId"), ":", -1));
          
            Dataset<Row> mapping = getEntityToChainId();
            ds = ds.join(mapping, ds.col("structureId").equalTo(mapping.col("structureId")).and(ds.col("entityId").equalTo(mapping.col("entity_id"))));
        
            ds = ds.select("structureChainId");
        } else if (results.get(0).length() < 4) {
            ds = spark.createDataset(results, Encoders.STRING()).toDF("ligandId");
        } else {
            ds = spark.createDataset(results, Encoders.STRING()).toDF("structureId");
        }
    }

    return ds;
}
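
After the join above, both inputs contribute a structureId column; selecting "structureChainId" is safe because that name is unambiguous, but selecting "structureId" directly would raise an ambiguous-reference error. A hedged sketch of the usual disambiguation, qualifying the column through the Dataset it came from (column names follow the example above):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class JoinDisambiguation {
    // Joins two datasets that share a "structureId" column and selects unambiguously.
    public static Dataset<Row> selectAfterJoin(Dataset<Row> ids, Dataset<Row> mapping) {
        Dataset<Row> joined = ids.join(mapping,
                ids.col("structureId").equalTo(mapping.col("structureId")));
        // Qualify the shared column with the Dataset it originates from.
        return joined.select(ids.col("structureId"), mapping.col("structureChainId"));
    }
}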
 
Example 12
Source File: JavaMultilayerPerceptronClassifierExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMultilayerPerceptronClassifierExample")
    .getOrCreate();

  // $example on$
  // Load training data
  String path = "data/mllib/sample_multiclass_classification_data.txt";
  Dataset<Row> dataFrame = spark.read().format("libsvm").load(path);

  // Split the data into train and test
  Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
  Dataset<Row> train = splits[0];
  Dataset<Row> test = splits[1];

  // specify layers for the neural network:
  // input layer of size 4 (features), two intermediate of size 5 and 4
  // and output of size 3 (classes)
  int[] layers = new int[] {4, 5, 4, 3};

  // create the trainer and set its parameters
  MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier()
    .setLayers(layers)
    .setBlockSize(128)
    .setSeed(1234L)
    .setMaxIter(100);

  // train the model
  MultilayerPerceptronClassificationModel model = trainer.fit(train);

  // compute accuracy on the test set
  Dataset<Row> result = model.transform(test);
  Dataset<Row> predictionAndLabels = result.select("prediction", "label");
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setMetricName("accuracy");

  System.out.println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels));
  // $example off$

  spark.stop();
}
 
Example 13
Source File: JavaEstimatorTransformerParamExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaEstimatorTransformerParamExample")
    .getOrCreate();

  // $example on$
  // Prepare training data.
  List<Row> dataTraining = Arrays.asList(
      RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
      RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
      RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
      RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> training = spark.createDataFrame(dataTraining, schema);

  // Create a LogisticRegression instance. This instance is an Estimator.
  LogisticRegression lr = new LogisticRegression();
  // Print out the parameters, documentation, and any default values.
  System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");

  // We may set parameters using setter methods.
  lr.setMaxIter(10).setRegParam(0.01);

  // Learn a LogisticRegression model. This uses the parameters stored in lr.
  LogisticRegressionModel model1 = lr.fit(training);
  // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
  // we can view the parameters it used during fit().
  // This prints the parameter (name: value) pairs, where names are unique IDs for this
  // LogisticRegression instance.
  System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

  // We may alternatively specify parameters using a ParamMap.
  ParamMap paramMap = new ParamMap()
    .put(lr.maxIter().w(20))  // Specify 1 Param.
    .put(lr.maxIter(), 30)  // This overwrites the original maxIter.
    .put(lr.regParam().w(0.1), lr.threshold().w(0.55));  // Specify multiple Params.

  // One can also combine ParamMaps.
  ParamMap paramMap2 = new ParamMap()
    .put(lr.probabilityCol().w("myProbability"));  // Change output column name
  ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);

  // Now learn a new model using the paramMapCombined parameters.
  // paramMapCombined overrides all parameters set earlier via lr.set* methods.
  LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
  System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

  // Prepare test documents.
  List<Row> dataTest = Arrays.asList(
      RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
      RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
      RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5))
  );
  Dataset<Row> test = spark.createDataFrame(dataTest, schema);

  // Make predictions on test documents using the Transformer.transform() method.
  // LogisticRegression.transform will only use the 'features' column.
  // Note that model2.transform() outputs a 'myProbability' column instead of the usual
  // 'probability' column since we renamed the lr.probabilityCol parameter previously.
  Dataset<Row> results = model2.transform(test);
  Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
  for (Row r: rows.collectAsList()) {
    System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
      + ", prediction=" + r.get(3));
  }
  // $example off$

  spark.stop();
}
 
Example 14
Source File: SparkMultiClassClassifier.java    From mmtf-spark with Apache License 2.0
/**
 * Dataset must at least contain the following two columns:
 * label: the class labels
 * features: feature vector
 * @param data
 * @return map with metrics
 */
public Map<String,String> fit(Dataset<Row> data) {
	int classCount = (int)data.select(label).distinct().count();

	StringIndexerModel labelIndexer = new StringIndexer()
	  .setInputCol(label)
	  .setOutputCol("indexedLabel")
	  .fit(data);

	// Split the data into training and test sets (30% held out for testing)
	Dataset<Row>[] splits = data.randomSplit(new double[] {1.0-testFraction, testFraction}, seed);
	Dataset<Row> trainingData = splits[0];
	Dataset<Row> testData = splits[1];
	
	String[] labels = labelIndexer.labels();
	
	System.out.println();
	System.out.println("Class\tTrain\tTest");
	for (String l: labels) {
		System.out.println(l + "\t" + trainingData.select(label).filter(label + " = '" + l + "'").count()
				+ "\t" 
				+ testData.select(label).filter(label + " = '" + l + "'").count());
	}
	
	// Set input columns
	predictor
	.setLabelCol("indexedLabel")
	.setFeaturesCol("features");

	// Convert indexed labels back to original labels.
	IndexToString labelConverter = new IndexToString()
	  .setInputCol("prediction")
	  .setOutputCol("predictedLabel")
	  .setLabels(labelIndexer.labels());

	// Chain indexers and forest in a Pipeline
	Pipeline pipeline = new Pipeline()
	  .setStages(new PipelineStage[] {labelIndexer, predictor, labelConverter});

	// Train model. This also runs the indexers.
	PipelineModel model = pipeline.fit(trainingData);

	// Make predictions.
	Dataset<Row> predictions = model.transform(testData).cache();
	
	// Display some sample predictions
	System.out.println();
	System.out.println("Sample predictions: " + predictor.getClass().getSimpleName());

	predictions.sample(false, 0.1, seed).show(25);	

	predictions = predictions.withColumnRenamed(label, "stringLabel");
	predictions = predictions.withColumnRenamed("indexedLabel", label);
	
	// collect metrics
	Dataset<Row> pred = predictions.select("prediction", label);
	Map<String, String> metrics = new LinkedHashMap<>();
	metrics.put("Method", predictor.getClass().getSimpleName());

	if (classCount == 2) {
		BinaryClassificationMetrics b = new BinaryClassificationMetrics(pred);
		metrics.put("AUC", Float.toString((float)b.areaUnderROC()));
	}

	MulticlassMetrics m = new MulticlassMetrics(pred);
	metrics.put("F", Float.toString((float)m.weightedFMeasure()));
	metrics.put("Accuracy", Float.toString((float)m.accuracy()));
	metrics.put("Precision", Float.toString((float)m.weightedPrecision()));
	metrics.put("Recall", Float.toString((float)m.weightedRecall()));
	metrics.put("False Positive Rate", Float.toString((float)m.weightedFalsePositiveRate()));
	metrics.put("True Positive Rate", Float.toString((float)m.weightedTruePositiveRate()));
	metrics.put("", "\nConfusion Matrix\n"
			+ Arrays.toString(labels) + "\n"
			+ m.confusionMatrix().toString());

	return metrics;
}
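
A side note on the per-class counts printed above: each select(label).filter(...).count() call launches a separate Spark job per label and per split. If that becomes noticeable, a single groupBy on the label column produces the same distribution in one pass; a minimal sketch, applied to whichever split is of interest:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class ClassDistribution {
    // Prints the number of rows per class label in a single Spark job.
    public static void printClassCounts(Dataset<Row> data, String labelColumn) {
        data.select(labelColumn)
            .groupBy(labelColumn)
            .count()
            .show();
    }
}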
 
Example 15
Source File: ProteinFoldDatasetCreator.java    From mmtf-spark with Apache License 2.0
/**
 * @param args args[0]: output path for the dataset
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfReducedPath();
    
	if (args.length != 1) {
		System.err.println("Usage: " + ProteinFoldDatasetCreator.class.getSimpleName() + " <dataset output file");
		System.exit(1);
	}

	long start = System.nanoTime();

	SparkConf conf = new SparkConf()
			.setMaster("local[*]")
			.setAppName(ProteinFoldDatasetCreator.class.getSimpleName());
	JavaSparkContext sc = new JavaSparkContext(conf);
	
	// read MMTF Hadoop sequence file and create a non-redundant Pisces 
	// subset (<= 20% sequence identity) of L-protein chains
	int sequenceIdentity = 20;
	double resolution = 3.0;
	
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
			.readSequenceFile(path, sc)
			.flatMapToPair(new StructureToPolymerChains())
			.filter(new Pisces(sequenceIdentity, resolution));

	// get secondary structure content
	Dataset<Row> data = SecondaryStructureExtractor.getDataset(pdb);

	// classify chains by secondary structure type
	double minThreshold = 0.05;
	double maxThreshold = 0.15;
	data = addProteinFoldType(data, minThreshold, maxThreshold);

	// create a binary classification dataset
	data = data.filter("foldType = 'alpha' OR foldType = 'beta'").cache();

	// create a three-state classification model (alpha, beta, alpha+beta)
	//		data = data.filter("foldType != 'other'").cache();

	// add Word2Vec encoded feature vector
	ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
	int n = 2;
	int windowSize = 11;
	int vectorSize = 50;
	data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);	

	data.printSchema();
	data.show(25);
	
	// keep only a subset of relevant fields for further processing
	data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features");

	data.write().mode("overwrite").format("parquet").save(args[0]);
	
	long end = System.nanoTime();

	System.out.println((end-start)/1E9 + " sec");
}
 
Example 16
Source File: UDFExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
	//Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
	System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
	 //Build a Spark Session	
      SparkSession sparkSession = SparkSession
      .builder()
      .master("local")
	  .config("spark.sql.warehouse.dir","file:///E:/hadoop/warehouse")
      .appName("EdgeBuilder")
      .getOrCreate();
      Logger rootLogger = LogManager.getRootLogger();
	  rootLogger.setLevel(Level.WARN); 
	// Read the CSV data
		 Dataset<Row> emp_ds = sparkSession.read()
				 .format("com.databricks.spark.csv")
   		         .option("header", "true")
   		         .option("inferSchema", "true")
   		         .load("src/main/resources/employee.txt");    
    		
	    UDF2 calcDays=new CalcDaysUDF();
	  //Registering the UDFs in Spark Session created above      
	    sparkSession.udf().register("calcDays", calcDays, DataTypes.LongType);
	    
	    emp_ds.createOrReplaceTempView("emp_ds");
	    
	    emp_ds.printSchema();
	    emp_ds.show();
	    
	    sparkSession.sql("select calcDays(hiredate,'dd-MM-yyyy') from emp_ds").show();   
	    //Instantiate UDAF
	    AverageUDAF calcAvg= new AverageUDAF();
	    //Register UDAF to SparkSession
	    sparkSession.udf().register("calAvg", calcAvg);
	    //Use UDAF
	    sparkSession.sql("select deptno,calAvg(salary) from emp_ds group by deptno ").show(); 
	   
	    //
	    TypeSafeUDAF typeSafeUDAF=new TypeSafeUDAF();
	    
	    Dataset<Employee> emf = emp_ds.as(Encoders.bean(Employee.class));
	    emf.printSchema();
	    emf.show();
	    
	    TypedColumn<Employee, Double> averageSalary = typeSafeUDAF.toColumn().name("averageTypeSafe");
	    Dataset<Double> result = emf.select(averageSalary);
	   result.show();
	    

}
 
Example 17
Source File: JavaEstimatorTransformerParamExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
   SparkSession spark = SparkSession
     .builder().master("local").config("spark.sql.warehouse.dir", "file:///C:/Users/sumit.kumar/Downloads/bin/warehouse")
     .appName("JavaEstimatorTransformerParamExample")
     .getOrCreate();
   Logger rootLogger = LogManager.getRootLogger();
   rootLogger.setLevel(Level.WARN);
   // $example on$
   // Prepare training data.
   List<Row> dataTraining = Arrays.asList(
       RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
       RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
       RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
       RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5))
   );
   StructType schema = new StructType(new StructField[]{
       new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
       new StructField("features", new VectorUDT(), false, Metadata.empty())
   });
   Dataset<Row> training = spark.createDataFrame(dataTraining, schema);

   // Create a LogisticRegression instance. This instance is an Estimator.
   LogisticRegression lr = new LogisticRegression();
   // Print out the parameters, documentation, and any default values.
   System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");

   // We may set parameters using setter methods.
   lr.setMaxIter(10).setRegParam(0.01);

   // Learn a LogisticRegression model. This uses the parameters stored in lr.
   LogisticRegressionModel model1 = lr.fit(training);
   // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
   // we can view the parameters it used during fit().
   // This prints the parameter (name: value) pairs, where names are unique IDs for this
   // LogisticRegression instance.
   System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

   // We may alternatively specify parameters using a ParamMap.
   ParamMap paramMap = new ParamMap()
     .put(lr.maxIter().w(20))  // Specify 1 Param.
     .put(lr.maxIter(), 30)  // This overwrites the original maxIter.
     .put(lr.regParam().w(0.1), lr.threshold().w(0.55));  // Specify multiple Params.

   // One can also combine ParamMaps.
   ParamMap paramMap2 = new ParamMap()
     .put(lr.probabilityCol().w("myProbability"));  // Change output column name
   ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);

   // Now learn a new model using the paramMapCombined parameters.
   // paramMapCombined overrides all parameters set earlier via lr.set* methods.
   LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
   System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

   // Prepare test documents.
   List<Row> dataTest = Arrays.asList(
       RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
       RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
       RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5))
   );
   Dataset<Row> test = spark.createDataFrame(dataTest, schema);

   // Make predictions on test documents using the Transformer.transform() method.
   // LogisticRegression.transform will only use the 'features' column.
   // Note that model2.transform() outputs a 'myProbability' column instead of the usual
   // 'probability' column since we renamed the lr.probabilityCol parameter previously.
   Dataset<Row> results = model2.transform(test);
   Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
   for (Row r: rows.collectAsList()) {
     System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
       + ", prediction=" + r.get(3));
   }
   // $example off$

   spark.stop();
 }
 
Example 18
Source File: BikeRentalPrediction.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
	SparkSession sparkSession = SparkSession
			.builder()
			.master("local")
			.config("spark.sql.warehouse.dir",
					"file:///E:/sumitK/Hadoop/warehouse")
			.appName("BikeRentalPrediction").getOrCreate();
	Logger rootLogger = LogManager.getRootLogger();
	rootLogger.setLevel(Level.WARN);
	// We use the sparkSession.read method to read the data and set a few options:
	//   'format': specifies the Spark CSV data source
	//   'header': set to true to indicate that the first line of the CSV data file is a header
	// The file is called 'hour.csv'.
	Dataset<Row> ds=sparkSession.read()
			  .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
			  .option("header", "true")
			  .load("E:\\sumitK\\Hadoop\\Bike-Sharing-Dataset\\hour.csv");
	
	ds.cache();
	
	ds.select("season").show();;
	
	ds.show();
	
	System.out.println("Our dataset has rows :: "+ ds.count());
	
	Dataset<Row> df = ds.drop("instant").drop("dteday").drop("casual").drop("registered");
	df.printSchema();
	//col("...") is preferable to df.col("...")
	Dataset<Row> dformatted = df.select(col("season").cast(DataTypes.IntegerType),
			                            col("yr").cast(DataTypes.IntegerType),
										col("mnth").cast(DataTypes.IntegerType),
										col("hr").cast(DataTypes.IntegerType),
										col("holiday").cast(DataTypes.IntegerType),
										col("weekday").cast(DataTypes.IntegerType),
										col("workingday").cast(DataTypes.IntegerType),
										col("weathersit").cast(DataTypes.IntegerType),
										col("temp").cast(DataTypes.IntegerType),
										col("atemp").cast(DataTypes.IntegerType),
										col("hum").cast(DataTypes.IntegerType),
										col("windspeed").cast(DataTypes.IntegerType),
										col("cnt").cast(DataTypes.IntegerType));
	
	
	dformatted.printSchema();
	Dataset<Row>[] data = dformatted.randomSplit(new double[]{0.7, 0.3});
	System.out.println("We have training examples count :: " + data[0].count() + " and test examples count :: " + data[1].count());

	// remove the 'cnt' column and form the feature-column name array
	String[] featuresCols = dformatted.drop("cnt").columns();

	for (String str : featuresCols) {
		System.out.println(str + " :: ");
	}

	// This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
	VectorAssembler vectorAssembler = new VectorAssembler().setInputCols(featuresCols).setOutputCol("rawFeatures");
	// This identifies categorical features and indexes them.
	VectorIndexer vectorIndexer = new VectorIndexer().setInputCol("rawFeatures").setOutputCol("features").setMaxCategories(4);

	// Takes the "features" column and learns to predict "cnt"
	GBTRegressor gbt = new GBTRegressor().setLabelCol("cnt");

	// Define a grid of hyperparameters to test:
	//   - maxDepth: max depth of each decision tree in the GBT ensemble
	//   - maxIter: iterations, i.e., number of trees in each GBT ensemble
	// In this example we keep these values small. In practice, to get the highest accuracy,
	// you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
	ParamMap[] paramGrid = new ParamGridBuilder()
			.addGrid(gbt.maxDepth(), new int[]{2, 5})
			.addGrid(gbt.maxIter(), new int[]{10, 100})
			.build();
	// We define an evaluation metric. This tells CrossValidator how well we are doing
	// by comparing the true labels with predictions.
	RegressionEvaluator evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol(gbt.getLabelCol()).setPredictionCol(gbt.getPredictionCol());

	// Declare the CrossValidator, which runs model tuning for us.
	CrossValidator cv = new CrossValidator().setEstimator(gbt).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid);

	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{vectorAssembler, vectorIndexer, cv});
			
	PipelineModel pipelineModel=pipeline.fit(data[0]);
	
	Dataset<Row> predictions = pipelineModel.transform(data[1]);
	
	predictions.show();
	//predictions.select("cnt", "prediction", *featuresCols);
}
 
Example 19
Source File: SparkMLScoringOnline.java    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {
 
      System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

      // * the schema can be written on disk, and read from disk
      // * the schema is not mandatory to be complete, it can contain only the needed fields    
      StructType RSVP_SCHEMA = new StructType()                                
              .add("event",
                      new StructType()
                              .add("event_id", StringType, true)
                              .add("event_name", StringType, true)
                              .add("event_url", StringType, true)
                              .add("time", LongType, true))
              .add("group",
                      new StructType()
                              .add("group_city", StringType, true)
                              .add("group_country", StringType, true)
                              .add("group_id", LongType, true)
                              .add("group_lat", DoubleType, true)
                              .add("group_lon", DoubleType, true)
                              .add("group_name", StringType, true)
                              .add("group_state", StringType, true)
                              .add("group_topics", DataTypes.createArrayType(
                                      new StructType()
                                              .add("topicName", StringType, true)
                                              .add("urlkey", StringType, true)), true)
                              .add("group_urlname", StringType, true))
              .add("guests", LongType, true)
              .add("member",
                      new StructType()
                              .add("member_id", LongType, true)
                              .add("member_name", StringType, true)                                
                              .add("photo", StringType, true))
              .add("mtime", LongType, true)
              .add("response", StringType, true)
              .add("rsvp_id", LongType, true)
              .add("venue",
                      new StructType()
                              .add("lat", DoubleType, true)
                              .add("lon", DoubleType, true)
                              .add("venue_id", LongType, true)
                              .add("venue_name", StringType, true))
              .add("visibility", StringType, true);

      final SparkConf conf = new SparkConf()
              .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
              .setAppName(APPLICATION_NAME)
              .set("spark.sql.caseSensitive", CASE_SENSITIVE);

      SparkSession spark = SparkSession
              .builder()
              .config(conf)
              .getOrCreate();

      PipelineModel pipelineModel = PipelineModel.load(MODEL_FOLDER_PATH);
     
      Dataset<Row> meetupStream = spark.readStream()
              .format(KAFKA_FORMAT)
              .option("kafka.bootstrap.servers", KAFKA_BROKERS)
              .option("subscribe", KAFKA_TOPIC)
              .load();

      Dataset<Row> gatheredDF = meetupStream.select(
              from_json(col("value").cast("string"), RSVP_SCHEMA).alias("rsvp"))
              .alias("meetup")
              .select("meetup.*");

      Dataset<Row> filteredDF = gatheredDF.filter(e -> !e.anyNull());

      Dataset<Row> preparedDF = filteredDF.select(
              col("rsvp.group.group_city"),
              col("rsvp.group.group_lat"),
              col("rsvp.group.group_lon"),
              col("rsvp.response"));
		                
      preparedDF.printSchema();
   
      Dataset<Row> predictionDF = pipelineModel.transform(preparedDF);
      
      StreamingQuery query = predictionDF.writeStream()                
              .format(JSON_FORMAT)
              .option("path", RESULT_FOLDER_PATH)
              .option("checkpointLocation", CHECKPOINT_LOCATION)
              .trigger(Trigger.ProcessingTime(QUERY_INTERVAL_SECONDS))
              .option("truncate", false)
              .start();

      query.awaitTermination();
  }