org.apache.spark.ml.PipelineModel Java Examples
The following examples show how to use
org.apache.spark.ml.PipelineModel.
You can vote up the examples you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example #1
Source File: EntitySalienceSpark.java From ambiverse-nlu with Apache License 2.0 | 6 votes |
@Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); synchronized (EntitySalienceSpark.class) { SparkConf conf = new SparkConf() .setAppName("EntitySalienceTagger") .set("spark.driver.allowMultipleContexts","true") .setMaster("local"); jsc = new JavaSparkContext(conf); //Load the training model //trainingModel = PipelineModel.load(modelPath); trainingModel = (PipelineModel) jsc.objectFile(modelPath).first(); jsc.close(); jsc.stop(); } }
Example #2
Source File: JavaRandomForestClassifierExample.java From SparkDemo with MIT License | 6 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaRandomForestClassifierExample") .getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. StringIndexerModel labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") .fit(data); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. VectorIndexerModel featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") .setMaxCategories(4) .fit(data); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a RandomForest model. RandomForestClassifier rf = new RandomForestClassifier() .setLabelCol("indexedLabel") .setFeaturesCol("indexedFeatures"); // Convert indexed labels back to original labels. IndexToString labelConverter = new IndexToString() .setInputCol("prediction") .setOutputCol("predictedLabel") .setLabels(labelIndexer.labels()); // Chain indexers and forest in a Pipeline Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter}); // Train model. This also runs the indexers. PipelineModel model = pipeline.fit(trainingData); // Make predictions. Dataset<Row> predictions = model.transform(testData); // Select example rows to display. 
predictions.select("predictedLabel", "label", "features").show(5); // Select (prediction, true label) and compute test error MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") .setMetricName("accuracy"); double accuracy = evaluator.evaluate(predictions); System.out.println("Test Error = " + (1.0 - accuracy)); RandomForestClassificationModel rfModel = (RandomForestClassificationModel)(model.stages()[2]); System.out.println("Learned classification forest model:\n" + rfModel.toDebugString()); // $example off$ spark.stop(); }
Example #3
Source File: CMMModel.java From vn.vitk with GNU General Public License v3.0 | 5 votes |
@Override public CMMModel load(String path) { org.apache.spark.ml.util.DefaultParamsReader.Metadata metadata = DefaultParamsReader.loadMetadata(path, sc(), CMMModel.class.getName()); String pipelinePath = new Path(path, "pipelineModel").toString(); PipelineModel pipelineModel = PipelineModel.load(pipelinePath); String dataPath = new Path(path, "data").toString(); DataFrame df = sqlContext().read().format("parquet").load(dataPath); Row row = df.select("markovOrder", "weights", "tagDictionary").head(); // load the Markov order MarkovOrder order = MarkovOrder.values()[row.getInt(0)-1]; // load the weight vector Vector w = row.getAs(1); // load the tag dictionary @SuppressWarnings("unchecked") scala.collection.immutable.HashMap<String, WrappedArray<Integer>> td = (scala.collection.immutable.HashMap<String, WrappedArray<Integer>>)row.get(2); Map<String, Set<Integer>> tagDict = new HashMap<String, Set<Integer>>(); Iterator<Tuple2<String, WrappedArray<Integer>>> iterator = td.iterator(); while (iterator.hasNext()) { Tuple2<String, WrappedArray<Integer>> tuple = iterator.next(); Set<Integer> labels = new HashSet<Integer>(); scala.collection.immutable.List<Integer> list = tuple._2().toList(); for (int i = 0; i < list.size(); i++) labels.add(list.apply(i)); tagDict.put(tuple._1(), labels); } // build a CMM model CMMModel model = new CMMModel(pipelineModel, w, order, tagDict); DefaultParamsReader.getAndSetParams(model, metadata); return model; }
Example #4
Source File: CMMModel.java From vn.vitk with GNU General Public License v3.0 | 5 votes |
/**
 * Creates a conditional Markov model.
 *
 * @param pipelineModel fitted pre-processing pipeline; the cast at index 1
 *                      expects a CountVectorizerModel and at index 2 a
 *                      StringIndexerModel
 * @param weights learned weight vector
 * @param markovOrder order of the Markov model
 * @param tagDictionary map from word to the set of admissible tag indices
 */
public CMMModel(PipelineModel pipelineModel, Vector weights, MarkovOrder markovOrder,
    Map<String, Set<Integer>> tagDictionary) {
  this.pipelineModel = pipelineModel;
  this.contextExtractor = new ContextExtractor(markovOrder, Constants.REGEXP_FILE);
  this.weights = weights;
  // Tag labels come from the StringIndexerModel at pipeline index 2.
  this.tags = ((StringIndexerModel) (pipelineModel.stages()[2])).labels();
  // Invert the CountVectorizerModel vocabulary (index 1) into a feature -> index map.
  String[] vocabulary = ((CountVectorizerModel) (pipelineModel.stages()[1])).vocabulary();
  featureMap = new HashMap<String, Integer>();
  for (int i = 0; i < vocabulary.length; i++) {
    featureMap.put(vocabulary[i], i);
  }
  this.tagDictionary = tagDictionary;
}
Example #5
Source File: SparkMLHouses.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 5 votes |
/**
 * Trains and evaluates a linear-regression model for house prices: reads the
 * JSON dataset, imputes missing bath counts, assembles the feature vector,
 * fits a pipeline on an 80/20 split, and logs R2/RMSE on the hold-out set.
 */
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

  System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

  // * the schema can be written on disk, and read from disk
  // * the schema is not mandatory to be complete, it can contain only the needed fields
  StructType HOUSES_SCHEMA = new StructType()
      .add("House", LongType, true)
      .add("Taxes", LongType, true)
      .add("Bedrooms", LongType, true)
      .add("Baths", FloatType, true)
      .add("Quadrant", LongType, true)
      .add("NW", StringType, true)
      .add("Price($)", LongType, false)
      .add("Size(sqft)", LongType, false)
      .add("lot", LongType, true);

  final SparkConf conf = new SparkConf()
      .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
      .setAppName(APPLICATION_NAME)
      .set("spark.sql.caseSensitive", CASE_SENSITIVE);

  SparkSession sparkSession = SparkSession.builder()
      .config(conf)
      .getOrCreate();

  Dataset<Row> rawHouses = sparkSession.read()
      .schema(HOUSES_SCHEMA)
      .json(HOUSES_FILE_PATH);

  // Gathering Data: keep only the columns the model needs.
  Dataset<Row> selected = rawHouses.select(col("Taxes"), col("Bedrooms"),
      col("Baths"), col("Size(sqft)"), col("Price($)"));

  // Data Preparation: the regression target must be named "label".
  Dataset<Row> labeled = selected.withColumnRenamed("Price($)", "label");

  // Impute missing bath counts (setMissingValue(1.0d) could customize the marker).
  Imputer imputer = new Imputer()
      .setInputCols(new String[] { "Baths" })
      .setOutputCols(new String[] { "~Baths~" });

  VectorAssembler assembler = new VectorAssembler()
      .setInputCols(new String[] { "Taxes", "Bedrooms", "~Baths~", "Size(sqft)" })
      .setOutputCol("features");

  // Choosing a Model
  LinearRegression linearRegression = new LinearRegression();
  linearRegression.setMaxIter(1000);

  Pipeline pipeline = new Pipeline()
      .setStages(new PipelineStage[] { imputer, assembler, linearRegression });

  // Training The Data: 80/20 train/evaluation split.
  Dataset<Row>[] parts = labeled.randomSplit(new double[] { 0.8, 0.2 });
  Dataset<Row> trainDF = parts[0];
  Dataset<Row> evaluationDF = parts[1];

  PipelineModel pipelineModel = pipeline.fit(trainDF);

  // Evaluation
  Dataset<Row> predictionsDF = pipelineModel.transform(evaluationDF);
  predictionsDF.show(false);

  Dataset<Row> forEvaluationDF = predictionsDF.select(col("label"), col("prediction"));

  RegressionEvaluator r2Evaluator = new RegressionEvaluator().setMetricName("r2");
  RegressionEvaluator rmseEvaluator = new RegressionEvaluator().setMetricName("rmse");

  double r2 = r2Evaluator.evaluate(forEvaluationDF);
  double rmse = rmseEvaluator.evaluate(forEvaluationDF);

  logger.info("---------------------------");
  logger.info("R2 =" + r2);
  logger.info("RMSE =" + rmse);
  logger.info("---------------------------");
}
Example #6
Source File: TransitionBasedParserMLP.java From vn.vitk with GNU General Public License v3.0 | 5 votes |
/**
 * Creates a transition-based parser using a MLP transition classifier.
 *
 * @param jsc Spark context used to deserialize the classifier
 * @param classifierFileName directory holding "data" (the MLP) and "pipelineModel"
 * @param featureFrame feature frame used by the parser
 */
public TransitionBasedParserMLP(JavaSparkContext jsc, String classifierFileName, FeatureFrame featureFrame) {
  this.featureFrame = featureFrame;
  // The classifier itself is stored under the "data" sub-directory...
  this.classifier = TransitionClassifier.load(jsc, new Path(classifierFileName, "data").toString());
  // ...and the pre-processing pipeline under "pipelineModel".
  this.pipelineModel = PipelineModel.load(new Path(classifierFileName, "pipelineModel").toString());
  // Transition names come from the StringIndexerModel at pipeline index 2.
  this.transitionName = ((StringIndexerModel) pipelineModel.stages()[2]).labels();
  // Invert the CountVectorizerModel vocabulary (index 1) into a feature -> index map.
  String[] vocabulary = ((CountVectorizerModel) (pipelineModel.stages()[1])).vocabulary();
  this.featureMap = new HashMap<String, Integer>();
  for (int i = 0; i < vocabulary.length; i++) {
    this.featureMap.put(vocabulary[i], i);
  }
}
Example #7
Source File: SparkMLDeriver.java From envelope with Apache License 2.0 | 5 votes |
/**
 * Applies the configured Spark ML pipeline model to this step's dependency data.
 * The model is loaded lazily from {@code modelPath} on first use.
 * NOTE(review): the lazy null-check is not synchronized — assumes single-threaded
 * deriver invocation; confirm against the framework's threading model.
 */
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  if (model == null) {
    // First call: deserialize the fitted pipeline from its saved location.
    model = PipelineModel.load(modelPath);
  }
  return model.transform(getData(dependencies));
}
Example #8
Source File: RandomForestRegressionModelInfoAdapterBridgeTest.java From spark-transformers with Apache License 2.0 | 5 votes |
@Test public void testRandomForestRegressionWithPipeline() { // Load the data stored in LIBSVM format as a DataFrame. DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/regression_test.libsvm"); // Split the data into training and test sets (30% held out for testing) DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3}); DataFrame trainingData = splits[0]; DataFrame testData = splits[1]; // Train a RandomForest model. RandomForestRegressionModel regressionModel = new RandomForestRegressor() .setFeaturesCol("features").fit(trainingData); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{regressionModel}); // Train model. This also runs the indexer. PipelineModel sparkPipeline = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipeline, null); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); Row[] sparkOutput = sparkPipeline.transform(testData).select("features", "prediction").collect(); //compare predictions for (Row row : sparkOutput) { Vector v = (Vector) row.get(0); double actual = row.getDouble(1); Map<String, Object> inputData = new HashMap<String, Object>(); inputData.put(transformer.getInputKeys().iterator().next(), v.toArray()); transformer.transform(inputData); double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next()); assertEquals(actual, predicted, EPSILON); } }
Example #9
Source File: TestSparkMLDeriver.java From envelope with Apache License 2.0 | 5 votes |
private void generateAndSaveModel(String savePath) throws IOException { // Sourced from the Spark ML documentation and examples StructType trainingSchema = DataTypes.createStructType(Lists.newArrayList( DataTypes.createStructField("id", DataTypes.LongType, false), DataTypes.createStructField("text", DataTypes.StringType, false), DataTypes.createStructField("label", DataTypes.DoubleType, false) )); Dataset<Row> training = Contexts.getSparkSession().createDataFrame(Lists.newArrayList( RowFactory.create(0L, "a b c d e spark", 1.0), RowFactory.create(1L, "b d", 0.0), RowFactory.create(2L, "spark f g h", 1.0), RowFactory.create(3L, "hadoop mapreduce", 0.0) ), trainingSchema); Tokenizer tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words"); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.001); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); PipelineModel model = pipeline.fit(training); model.write().overwrite().save(savePath); }
Example #10
Source File: PipelineModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 5 votes |
/**
 * Builds a {@link PipelineModelInfo} by adapting every stage of the given
 * {@link PipelineModel} through the adapter registered for its concrete class.
 *
 * @param from the fitted Spark pipeline model to convert
 * @param df   the DataFrame handed through to each stage adapter
 * @return an exportable description of all pipeline stages, in order
 */
@Override
public PipelineModelInfo getModelInfo(final PipelineModel from, final DataFrame df) {
    final PipelineModelInfo modelInfo = new PipelineModelInfo();
    // Hoist the stages() call out of the loop (the original re-read
    // from.stages() on every iteration) and use Java-style array syntax
    // instead of the C-style "ModelInfo stages[]".
    final Transformer[] sparkStages = from.stages();
    final ModelInfo[] stages = new ModelInfo[sparkStages.length];
    for (int i = 0; i < sparkStages.length; i++) {
        Transformer sparkModel = sparkStages[i];
        stages[i] = ModelInfoAdapterFactory.getAdapter(sparkModel.getClass()).adapt(sparkModel, df);
    }
    modelInfo.setStages(stages);
    return modelInfo;
}
Example #11
Source File: RFormulaModelConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Registers the features produced by the RFormula's internal pipeline with the
 * encoder, first aliasing the resolved target column to the model's label
 * column when the two names differ.
 */
@Override
public void registerFeatures(SparkMLEncoder encoder){
  RFormulaModel transformer = getTransformer();
  ResolvedRFormula resolvedFormula = transformer.resolvedFormula();

  String targetCol = resolvedFormula.label();
  String labelCol = transformer.getLabelCol();
  if(!(targetCol).equals(labelCol)){
    // Make the target column's features reachable under the label column name too.
    List<Feature> features = encoder.getFeatures(targetCol);
    encoder.putFeatures(labelCol, features);
  }

  ConverterFactory converterFactory = encoder.getConverterFactory();

  // Every stage of the internal pipeline must convert to a FeatureConverter.
  PipelineModel pipelineModel = transformer.pipelineModel();
  for(Transformer stage : pipelineModel.stages()){
    TransformerConverter<?> converter = converterFactory.newConverter(stage);
    if(converter instanceof FeatureConverter){
      FeatureConverter<?> featureConverter = (FeatureConverter<?>)converter;
      featureConverter.registerFeatures(encoder);
    } else {
      throw new IllegalArgumentException("Expected a subclass of " + FeatureConverter.class.getName() + ", got " + (converter != null ? ("class " + (converter.getClass()).getName()) : null));
    }
  }
}
Example #12
Source File: PipelineModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 5 votes |
/**
 * Builds a {@link PipelineModelInfo} by adapting every stage of the given
 * {@link PipelineModel} through the adapter registered for its concrete class.
 *
 * @param from the fitted Spark pipeline model to convert
 * @return an exportable description of all pipeline stages, in order
 */
@Override
public PipelineModelInfo getModelInfo(final PipelineModel from) {
    final PipelineModelInfo modelInfo = new PipelineModelInfo();
    // Hoist the stages() call out of the loop (the original re-read
    // from.stages() on every iteration) and use Java-style array syntax
    // instead of the C-style "ModelInfo stages[]".
    final Transformer[] sparkStages = from.stages();
    final ModelInfo[] stages = new ModelInfo[sparkStages.length];
    for (int i = 0; i < sparkStages.length; i++) {
        Transformer sparkModel = sparkStages[i];
        stages[i] = ModelInfoAdapterFactory.getAdapter(sparkModel.getClass()).adapt(sparkModel);
    }
    modelInfo.setStages(stages);
    return modelInfo;
}
Example #13
Source File: PMMLBuilder.java From jpmml-sparkml with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Configures model verification: transforms the given dataset with the fitted
 * pipeline and records the input/transformed pair with the given tolerances.
 *
 * @param dataset rows to verify against
 * @param precision comparison precision
 * @param zeroThreshold threshold below which values are treated as zero
 * @return this builder, for chaining
 */
public PMMLBuilder verify(Dataset<Row> dataset, double precision, double zeroThreshold){
  Dataset<Row> transformedDataset = getPipelineModel().transform(dataset);
  Verification verification = new Verification(dataset, transformedDataset)
      .setPrecision(precision)
      .setZeroThreshold(zeroThreshold);
  return setVerification(verification);
}
Example #14
Source File: DecisionTreeClassificationModelBridgePipelineTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testDecisionTreeClassificationWithPipeline() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/classification_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; StringIndexer indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex"); // Train a DecisionTree model. DecisionTreeClassifier classificationModel = new DecisionTreeClassifier() .setLabelCol("labelIndex") .setFeaturesCol("features"); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{indexer, classificationModel}); // Train model. This also runs the indexer. PipelineModel sparkPipeline = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipeline); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); List<Row> output = sparkPipeline.transform(testData).select("features", "label","prediction","rawPrediction").collectAsList(); //compare predictions for (Row row : output) { Map<String, Object> data_ = new HashMap<>(); double [] actualRawPrediction = ((DenseVector) row.get(3)).toArray(); data_.put("features", ((SparseVector) row.get(0)).toArray()); data_.put("label", (row.get(1)).toString()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")); assertEquals((double)data_.get("prediction"), (double)row.get(2), EPSILON); assertArrayEquals((double[]) data_.get("rawPrediction"), actualRawPrediction, EPSILON); } }
Example #15
Source File: TransitionClassifier.java From vn.vitk with GNU General Public License v3.0 | 4 votes |
/** * Trains a transition classifier on the data frame. * @param jsc * @param graphs * @param featureFrame * @param classifierFileName * @param numHiddenUnits * @return a transition classifier. */ public Transformer trainMLP(JavaSparkContext jsc, List<DependencyGraph> graphs, FeatureFrame featureFrame, String classifierFileName, int numHiddenUnits) { // create a SQLContext this.sqlContext = new SQLContext(jsc); // extract a data frame from these graphs DataFrame dataset = toDataFrame(jsc, graphs, featureFrame); // create a processing pipeline and fit it to the data frame Pipeline pipeline = createPipeline(); PipelineModel pipelineModel = pipeline.fit(dataset); DataFrame trainingData = pipelineModel.transform(dataset); // cache the training data for better performance trainingData.cache(); if (verbose) { trainingData.show(false); } // compute the number of different labels, which is the maximum element // in the 'label' column. trainingData.registerTempTable("dfTable"); Row row = sqlContext.sql("SELECT MAX(label) as maxValue from dfTable").first(); int numLabels = (int)row.getDouble(0); numLabels++; int vocabSize = ((CountVectorizerModel)(pipelineModel.stages()[1])).getVocabSize(); // default is a two-layer MLP int[] layers = {vocabSize, numLabels}; // if user specify a hidden layer, use a 3-layer MLP: if (numHiddenUnits > 0) { layers = new int[3]; layers[0] = vocabSize; layers[1] = numHiddenUnits; layers[2] = numLabels; } MultilayerPerceptronClassifier classifier = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(128) .setSeed(1234L) .setTol((Double)params.getOrDefault(params.getTolerance())) .setMaxIter((Integer)params.getOrDefault(params.getMaxIter())); MultilayerPerceptronClassificationModel model = classifier.fit(trainingData); // compute precision on the training data // DataFrame result = model.transform(trainingData); DataFrame predictionAndLabel = result.select("prediction", "label"); MulticlassClassificationEvaluator evaluator = new 
MulticlassClassificationEvaluator().setMetricName("precision"); if (verbose) { System.out.println("N = " + trainingData.count()); System.out.println("D = " + vocabSize); System.out.println("K = " + numLabels); System.out.println("H = " + numHiddenUnits); System.out.println("training precision = " + evaluator.evaluate(predictionAndLabel)); } // save the trained MLP to a file // String classifierPath = new Path(classifierFileName, "data").toString(); jsc.parallelize(Arrays.asList(model), 1).saveAsObjectFile(classifierPath); // save the pipeline model to sub-directory "pipelineModel" // try { String pipelinePath = new Path(classifierFileName, "pipelineModel").toString(); pipelineModel.write().overwrite().save(pipelinePath); } catch (IOException e) { e.printStackTrace(); } return model; }
Example #16
Source File: PipelineModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 4 votes |
/**
 * Returns the Spark model class this adapter converts.
 *
 * @return {@code PipelineModel.class}
 */
@Override public Class<PipelineModel> getSource() { return PipelineModel.class; }
Example #17
Source File: DecisionTreeRegressionModelBridgePipelineTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testDecisionTreeRegressionPrediction() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/regression_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; StringIndexer indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex").setHandleInvalid("skip"); DecisionTreeRegressor regressionModel = new DecisionTreeRegressor().setLabelCol("labelIndex").setFeaturesCol("features"); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{indexer, regressionModel}); PipelineModel sparkPipeline = pipeline.fit(trainingData); byte[] exportedModel = ModelExporter.export(sparkPipeline); Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); List<Row> output = sparkPipeline.transform(testData).select("features", "prediction", "label").collectAsList(); //compare predictions for (Row row : output) { Map<String, Object> data_ = new HashMap<>(); data_.put("features", ((SparseVector) row.get(0)).toArray()); data_.put("label", (row.get(2)).toString()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")); assertEquals((double)data_.get("prediction"), (double)row.get(1), EPSILON); } }
Example #18
Source File: GradientBoostClassificationModelPipelineTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testGradientBoostClassification() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/binary_classification_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); StringIndexer indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex"); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a RandomForest model. GBTClassifier classificationModel = new GBTClassifier().setLabelCol("labelIndex") .setFeaturesCol("features");; Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{indexer, classificationModel}); PipelineModel sparkPipeline = pipeline.fit(trainingData); // Export this model byte[] exportedModel = ModelExporter.export(sparkPipeline); // Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); List<Row> sparkOutput = sparkPipeline.transform(testData).select("features", "prediction", "label").collectAsList(); // compare predictions for (Row row : sparkOutput) { Map<String, Object> data_ = new HashMap<>(); data_.put("features", ((SparseVector) row.get(0)).toArray()); data_.put("label", (row.get(2)).toString()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")+" ,"+row.get(1)); assertEquals((double) data_.get("prediction"), (double) row.get(1), EPSILON); } }
Example #19
Source File: ConverterUtil.java From jpmml-sparkml with GNU Affero General Public License v3.0 | 4 votes |
/**
 * Legacy entry point that is no longer supported: it always throws, with a
 * message (built by {@code formatMessage}) pointing callers from
 * {@code toPMMLByteArray} to {@code buildByteArray}.
 *
 * @throws UnsupportedOperationException always
 */
static public byte[] toPMMLByteArray(StructType schema, PipelineModel pipelineModel, Map<String, ? extends Map<String, ?>> options){ throw new UnsupportedOperationException(formatMessage("toPMMLByteArray", "buildByteArray")); }
Example #20
Source File: PipelineBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testPipeline() { // Prepare training documents, which are labeled. StructType schema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), createStructField("label", DoubleType, false) }); Dataset<Row> trainingData = spark.createDataFrame(Arrays.asList( cr(0L, "a b c d e spark", 1.0), cr(1L, "b d", 0.0), cr(2L, "spark f g h", 1.0), cr(3L, "hadoop mapreduce", 0.0) ), schema); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression. RegexTokenizer tokenizer = new RegexTokenizer() .setInputCol("text") .setOutputCol("words") .setPattern("\\s") .setGaps(true) .setToLowercase(false); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.01); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{tokenizer, hashingTF, lr}); // Fit the pipeline to training documents. 
PipelineModel sparkPipelineModel = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipelineModel); System.out.println(new String(exportedModel)); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); //prepare test data StructType testSchema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), }); Dataset<Row> testData = spark.createDataFrame(Arrays.asList( cr(4L, "spark i j k"), cr(5L, "l m n"), cr(6L, "mapreduce spark"), cr(7L, "apache hadoop") ), testSchema); //verify that predictions for spark pipeline and exported pipeline are the same List<Row> predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collectAsList(); for (Row r : predictions) { System.out.println(r); double sparkPipelineOp = r.getDouble(3); Map<String, Object> data = new HashMap<String, Object>(); data.put("text", r.getString(1)); transformer.transform(data); double exportedPipelineOp = (double) data.get("prediction"); double exportedPipelineProb = (double) data.get("probability"); assertEquals(sparkPipelineOp, exportedPipelineOp, 0.01); } }
Example #21
Source File: PipelineModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 4 votes |
/**
 * Returns the Spark model class this adapter converts.
 *
 * @return {@code PipelineModel.class}
 */
@Override public Class<PipelineModel> getSource() { return PipelineModel.class; }
Example #22
Source File: DecisionTreeRegressionModelBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testDecisionTreeRegressionWithPipeline() { // Load the data stored in LIBSVM format as a DataFrame. DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/regression_test.libsvm"); // Split the data into training and test sets (30% held out for testing) DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3}); DataFrame trainingData = splits[0]; DataFrame testData = splits[1]; // Train a DecisionTree model. DecisionTreeRegressor dt = new DecisionTreeRegressor() .setFeaturesCol("features"); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{dt}); // Train model. This also runs the indexer. PipelineModel sparkPipeline = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipeline, null); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); Row[] sparkOutput = sparkPipeline.transform(testData).select("features", "prediction").collect(); //compare predictions for (Row row : sparkOutput) { Vector v = (Vector) row.get(0); double actual = row.getDouble(1); Map<String, Object> inputData = new HashMap<String, Object>(); inputData.put(transformer.getInputKeys().iterator().next(), v.toArray()); transformer.transform(inputData); double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next()); assertEquals(actual, predicted, EPSILON); } }
Example #23
Source File: PipelineBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testPipeline() { // Prepare training documents, which are labeled. StructType schema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), createStructField("label", DoubleType, false) }); DataFrame trainingData = sqlContext.createDataFrame(Arrays.asList( cr(0L, "a b c d e spark", 1.0), cr(1L, "b d", 0.0), cr(2L, "spark f g h", 1.0), cr(3L, "hadoop mapreduce", 0.0) ), schema); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression. RegexTokenizer tokenizer = new RegexTokenizer() .setInputCol("text") .setOutputCol("words") .setPattern("\\s") .setGaps(true) .setToLowercase(false); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.01); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{tokenizer, hashingTF, lr}); // Fit the pipeline to training documents. 
PipelineModel sparkPipelineModel = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipelineModel, trainingData); System.out.println(new String(exportedModel)); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); //prepare test data StructType testSchema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), }); DataFrame testData = sqlContext.createDataFrame(Arrays.asList( cr(4L, "spark i j k"), cr(5L, "l m n"), cr(6L, "mapreduce spark"), cr(7L, "apache hadoop") ), testSchema); //verify that predictions for spark pipeline and exported pipeline are the same Row[] predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collect(); for (Row r : predictions) { System.out.println(r); double sparkPipelineOp = r.getDouble(3); Map<String, Object> data = new HashMap<String, Object>(); data.put("text", r.getString(1)); transformer.transform(data); double exportedPipelineOp = (double) data.get("prediction"); double exportedPipelineProb = (double) data.get("probability"); assertEquals(sparkPipelineOp, exportedPipelineOp, EPSILON); } }
Example #24
Source File: RandomForestClassificationModelInfoAdapterBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
/**
 * Round-trips a {StringIndexer -> RandomForestClassifier} pipeline through
 * ModelExporter/ModelImporter and checks that the imported transformer reproduces
 * Spark's prediction, raw prediction, and class probabilities for every test row.
 */
@Test
public void testRandomForestClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    // Split the data into training and test sets (30% held out for testing).
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Map string labels to numeric indices so the classifier can consume them.
    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");

    // Train a RandomForest model. (FIX: the original comment said "DecisionTree",
    // but a RandomForestClassifier is configured here.)
    RandomForestClassifier classifier = new RandomForestClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setPredictionCol("prediction")
            .setRawPredictionCol("rawPrediction")
            .setProbabilityCol("probability");

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classifier});

    // Train model. This also runs the indexer.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    // Export this model (no DataFrame is passed to the exporter here).
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);

    // Import and get Transformer.
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkPipeline.transform(testData)
            .select("label", "features", "prediction", "rawPrediction", "probability").collect();

    // Compare predictions row by row: the imported transformer must reproduce
    // Spark's prediction, raw scores, and probabilities within EPSILON.
    for (Row row : sparkOutput) {
        // Column positions: 0=label, 1=features, 2=prediction, 3=rawPrediction, 4=probability.
        Vector v = (Vector) row.get(1);
        double actual = row.getDouble(2);
        double [] actualProbability = ((Vector) row.get(4)).toArray();
        double[] actualRaw = ((Vector) row.get(3)).toArray();

        // Feed the same features/label through the imported transformer.
        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put("features", v.toArray());
        inputData.put("label", row.get(0).toString());
        transformer.transform(inputData);
        double predicted = (double) inputData.get("prediction");
        double[] probability = (double[]) inputData.get("probability");
        double[] rawPrediction = (double[]) inputData.get("rawPrediction");

        assertEquals(actual, predicted, EPSILON);
        assertArrayEquals(actualProbability, probability, EPSILON);
        assertArrayEquals(actualRaw, rawPrediction, EPSILON);
    }
}
Example #25
Source File: DecisionTreeClassificationModelBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testDecisionTreeClassificationWithPipeline() { // Load the data stored in LIBSVM format as a DataFrame. DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm"); // Split the data into training and test sets (30% held out for testing) DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3}); DataFrame trainingData = splits[0]; DataFrame testData = splits[1]; StringIndexer indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex"); // Train a DecisionTree model. DecisionTreeClassifier classificationModel = new DecisionTreeClassifier() .setLabelCol("labelIndex") .setFeaturesCol("features"); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{indexer, classificationModel}); // Train model. This also runs the indexer. PipelineModel sparkPipeline = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipeline, null); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); Row[] sparkOutput = sparkPipeline.transform(testData).select("label", "features", "prediction").collect(); //compare predictions for (Row row : sparkOutput) { Vector v = (Vector) row.get(1); double actual = row.getDouble(2); Map<String, Object> inputData = new HashMap<String, Object>(); inputData.put("features", v.toArray()); inputData.put("label", row.get(0).toString()); transformer.transform(inputData); double predicted = (double) inputData.get("prediction"); assertEquals(actual, predicted, EPSILON); } }
Example #26
Source File: FillNAValuesTransformerBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void shouldWorkCorrectlyWithPipeline() { //Prepare test data DataFrame df = getDataFrame(); Row[] originalData = df.orderBy("id").select("id", "a", "b", "c", "d").collect(); //prepare transformation pipeline FillNAValuesTransformer fillNAValuesTransformer = new FillNAValuesTransformer(); fillNAValuesTransformer.setNAValueMap( getFillNAMap() ); Pipeline pipeline = new Pipeline(); pipeline.setStages(new PipelineStage[]{fillNAValuesTransformer}); PipelineModel model = pipeline.fit(df); //predict Row[] sparkOutput = model.transform(df).orderBy("id").select("id", "a", "b", "c", "d").collect(); //export byte[] exportedModel = ModelExporter.export(model, df); Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); //verify correctness assertTrue(transformer.getInputKeys().size() == 4); assertTrue(transformer.getInputKeys().containsAll(Arrays.asList("a", "b", "c", "d"))); assertTrue(transformer.getOutputKeys().size() == 4); assertTrue(transformer.getOutputKeys().containsAll(Arrays.asList("a", "b", "c", "d"))); for( int i=0; i < originalData.length; i++) { Map<String, Object> input = new HashMap<String, Object>(); input.put("a", originalData[i].get(1)); input.put("b", originalData[i].get(2)); input.put("c", originalData[i].get(3)); input.put("d", originalData[i].get(4)); transformer.transform(input); assertEquals(sparkOutput[i].get(1), input.get("a")); assertEquals(sparkOutput[i].get(2), input.get("b")); assertEquals(sparkOutput[i].get(3), input.get("c")); assertEquals(sparkOutput[i].get(4), input.get("d")); } }
Example #27
Source File: TransformerBuilder.java From jpmml-evaluator-spark with GNU Affero General Public License v3.0 | 4 votes |
public Transformer build(){ Evaluator evaluator = getEvaluator(); PMMLTransformer pmmlTransformer = new PMMLTransformer(evaluator, this.columnProducers); if(this.exploded){ ColumnExploder columnExploder = new ColumnExploder(pmmlTransformer.getOutputCol()); ColumnPruner columnPruner = new ColumnPruner(new Set.Set1<>(pmmlTransformer.getOutputCol())); PipelineModel pipelineModel = new PipelineModel(null, new Transformer[]{pmmlTransformer, columnExploder, columnPruner}); return pipelineModel; } return pmmlTransformer; }
Example #28
Source File: JavaDecisionTreeClassificationExample.java From SparkDemo with MIT License | 4 votes |
/**
 * End-to-end decision-tree classification example: index labels and features,
 * train a DecisionTreeClassifier in a Pipeline, score held-out data, and print
 * the test error and the learned tree. The $example on$/off$ markers delimit
 * the snippet extracted into the Spark documentation.
 */
public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("JavaDecisionTreeClassificationExample")
            .getOrCreate();

    // $example on$
    // Load the data stored in LIBSVM format as a DataFrame.
    Dataset<Row> data = spark
            .read()
            .format("libsvm")
            .load("data/mllib/sample_libsvm_data.txt");

    // Index labels, adding metadata to the label column.
    // Fit on whole dataset to include all labels in index.
    StringIndexerModel labelIndexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("indexedLabel")
            .fit(data);

    // Automatically identify categorical features, and index them.
    VectorIndexerModel featureIndexer = new VectorIndexer()
            .setInputCol("features")
            .setOutputCol("indexedFeatures")
            .setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
            .fit(data);

    // Split the data into training and test sets (30% held out for testing).
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    // Train a DecisionTree model.
    DecisionTreeClassifier dt = new DecisionTreeClassifier()
            .setLabelCol("indexedLabel")
            .setFeaturesCol("indexedFeatures");

    // Convert indexed labels back to original labels.
    IndexToString labelConverter = new IndexToString()
            .setInputCol("prediction")
            .setOutputCol("predictedLabel")
            .setLabels(labelIndexer.labels());

    // Chain indexers and tree in a Pipeline.
    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter});

    // Train model. This also runs the indexers.
    PipelineModel model = pipeline.fit(trainingData);

    // Make predictions.
    Dataset<Row> predictions = model.transform(testData);

    // Select example rows to display.
    predictions.select("predictedLabel", "label", "features").show(5);

    // Select (prediction, true label) and compute test error.
    MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
            .setLabelCol("indexedLabel")
            .setPredictionCol("prediction")
            .setMetricName("accuracy");
    double accuracy = evaluator.evaluate(predictions);
    System.out.println("Test Error = " + (1.0 - accuracy));

    // stages()[2] is the trained DecisionTree stage of the pipeline
    // (0=labelIndexer, 1=featureIndexer, 2=dt, 3=labelConverter).
    DecisionTreeClassificationModel treeModel =
            (DecisionTreeClassificationModel) (model.stages()[2]);
    System.out.println("Learned classification tree model:\n" + treeModel.toDebugString());
    // $example off$

    spark.stop();
}
Example #29
Source File: SparkMLScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 4 votes |
/**
 * Online ML scoring on a Kafka stream: loads a previously-trained PipelineModel
 * from disk, parses Meetup RSVP events from Kafka as JSON, scores each event
 * with the model, and writes the predictions as JSON files via Structured
 * Streaming.
 *
 * @throws InterruptedException    if the driver thread is interrupted
 * @throws StreamingQueryException if the streaming query terminates with an error
 */
public static void main(String[] args) throws InterruptedException, StreamingQueryException {
    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // * the schema can be written on disk, and read from disk
    // * the schema is not mandatory to be complete, it can contain only the needed fields
    StructType RSVP_SCHEMA = new StructType()
            .add("event", new StructType()
                    .add("event_id", StringType, true)
                    .add("event_name", StringType, true)
                    .add("event_url", StringType, true)
                    .add("time", LongType, true))
            .add("group", new StructType()
                    .add("group_city", StringType, true)
                    .add("group_country", StringType, true)
                    .add("group_id", LongType, true)
                    .add("group_lat", DoubleType, true)
                    .add("group_lon", DoubleType, true)
                    .add("group_name", StringType, true)
                    .add("group_state", StringType, true)
                    .add("group_topics", DataTypes.createArrayType(
                            new StructType()
                                    .add("topicName", StringType, true)
                                    .add("urlkey", StringType, true)), true)
                    .add("group_urlname", StringType, true))
            .add("guests", LongType, true)
            .add("member", new StructType()
                    .add("member_id", LongType, true)
                    .add("member_name", StringType, true)
                    .add("photo", StringType, true))
            .add("mtime", LongType, true)
            .add("response", StringType, true)
            .add("rsvp_id", LongType, true)
            .add("venue", new StructType()
                    .add("lat", DoubleType, true)
                    .add("lon", DoubleType, true)
                    .add("venue_id", LongType, true)
                    .add("venue_name", StringType, true))
            .add("visibility", StringType, true);

    final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME)
            .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession spark = SparkSession
            .builder()
            .config(conf)
            .getOrCreate();

    // Load the previously-trained pipeline from disk for online scoring.
    PipelineModel pipelineModel = PipelineModel.load(MODEL_FOLDER_PATH);

    // Subscribe to the Kafka topic as an unbounded streaming DataFrame.
    Dataset<Row> meetupStream = spark.readStream()
            .format(KAFKA_FORMAT)
            .option("kafka.bootstrap.servers", KAFKA_BROKERS)
            .option("subscribe", KAFKA_TOPIC)
            .load();

    // Parse the Kafka "value" bytes as JSON into the RSVP schema, then flatten
    // the parsed struct into top-level columns via select("meetup.*").
    Dataset<Row> gatheredDF = meetupStream.select(
            (from_json(col("value").cast("string"), RSVP_SCHEMA))
                    .alias("rsvp"))
            .alias("meetup")
            .select("meetup.*");

    // Drop rows with any null field before feeding the model.
    Dataset<Row> filteredDF = gatheredDF.filter(e -> !e.anyNull());

    // Keep only the feature columns the model expects plus the response.
    Dataset<Row> preparedDF = filteredDF.select(
            col("rsvp.group.group_city"),
            col("rsvp.group.group_lat"),
            col("rsvp.group.group_lon"),
            col("rsvp.response")
    );

    preparedDF.printSchema();

    Dataset<Row> predictionDF = pipelineModel.transform(preparedDF);

    // Write predictions as JSON files, checkpointed, triggered at a fixed interval.
    StreamingQuery query = predictionDF.writeStream()
            .format(JSON_FORMAT)
            .option("path", RESULT_FOLDER_PATH)
            .option("checkpointLocation", CHECKPOINT_LOCATION)
            .trigger(Trigger.ProcessingTime(QUERY_INTERVAL_SECONDS))
            .option("truncate", false)
            .start();

    // Block until the streaming query terminates.
    query.awaitTermination();
}
Example #30
Source File: EntitySalienceTestingSparkRunner.java From ambiverse-nlu with Apache License 2.0 | 4 votes |
/**
 * Evaluates a previously-trained entity-salience PipelineModel on a set of
 * SCAS-serialized documents: configures Spark with Kryo serialization,
 * distributes the feature-extraction cache files to the executors, loads the
 * model from an object file, and writes evaluation metrics to the output path.
 *
 * @return 0 on success
 * @throws Exception if Spark setup, model loading, or evaluation fails
 */
@Override
protected int run() throws Exception {
    SparkConf sparkConf = new SparkConf()
            .setAppName("EntitySalienceTrainingSparkRunner")
            .set("spark.hadoop.validateOutputSpecs", "false")
            //.set("spark.yarn.executor.memoryOverhead", "4096")
            .set("spark.rdd.compress", "true")
            .set("spark.core.connection.ack.wait.timeout", "600")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            //.set("spark.kryo.registrationRequired", "true")
            // Register the classes shipped across the wire so Kryo can serialize them.
            .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
                    InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
                    VectorIndexer.class})
            ;//setMaster("local"); //Remove this if you run it on the server.

    TrainingSettings trainingSettings = new TrainingSettings();

    if(defaultConf != null) {
        trainingSettings.setAidaDefaultConf(defaultConf);
    }

    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // Total parallelism across the cluster = executors * cores per executor.
    int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
            * Integer.parseInt(sc.getConf().get("spark.executor.cores"));
    // int totalCores = 2;

    //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);

    trainingSettings.setPositiveInstanceScalingFactor(1);

    // Ship the lexical cache files (and DB/Cassandra config) to every executor
    // when the feature extractor needs annotation + entity-salience features.
    if(trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
        sc.addFile(trainingSettings.getBigramCountCache());
        sc.addFile(trainingSettings.getKeywordCountCache());
        sc.addFile(trainingSettings.getWordContractionsCache());
        sc.addFile(trainingSettings.getWordExpansionsCache());
        if (trainingSettings.getAidaDefaultConf().equals("db")) {
            sc.addFile(trainingSettings.getDatabaseAida());
        } else {
            sc.addFile(trainingSettings.getCassandraConfig());
        }
    }

    SQLContext sqlContext = new SQLContext(sc);

    // Over-partition relative to the core count to keep all executors busy.
    int partitionNumber = 3 * totalCores;
    //Read training documents serialized as SCAS
    JavaPairRDD<Text, SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber);

    // Instantiate a training spark runner.
    TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();

    // Load the previously-trained pipeline model from an object file.
    PipelineModel trainingModel = (PipelineModel) sc.objectFile(model).first();

    //Evaluate the model and write down the evaluation metrics.
    trainingSparkRunner.evaluate(sc, sqlContext, documents, trainingModel, trainingSettings, output+"/"+sc.getConf().getAppId()+"/");

    return 0;
}