org.apache.spark.ml.feature.RegexTokenizer Java Examples
The following examples show how to use org.apache.spark.ml.feature.RegexTokenizer.
The original project and source file are noted above each example.
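For orientation, here is a minimal, self-contained sketch of the RegexTokenizer API that the examples below exercise. The local SparkSession, column names, and sample sentences are illustrative assumptions, not taken from any of the listed projects.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RegexTokenizerSketch {

    public static void main(String[] args) {
        // Local session for illustration only
        SparkSession spark = SparkSession.builder()
            .appName("RegexTokenizerSketch")
            .master("local[*]")
            .getOrCreate();

        StructType schema = new StructType(new StructField[]{
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
        });
        List<Row> data = Arrays.asList(
            RowFactory.create("Spark ML makes tokenization simple"),
            RowFactory.create("Logistic,regression,models,are,neat")
        );
        Dataset<Row> df = spark.createDataFrame(data, schema);

        // With gaps = true the pattern matches separators; with gaps = false it matches the tokens themselves.
        RegexTokenizer tokenizer = new RegexTokenizer()
            .setInputCol("sentence")
            .setOutputCol("words")
            .setPattern("\\W+")
            .setGaps(true)
            .setToLowercase(true)
            .setMinTokenLength(1);

        tokenizer.transform(df).show(false);

        spark.stop();
    }
}

The gaps, pattern, minTokenLength, and toLowercase parameters shown here are the same ones the converter and bridge examples below read back via the corresponding getters.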
Example #1
Source File: RegexTokenizerConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
    RegexTokenizer transformer = getTransformer();

    // Only splitter mode (gaps = true) with the default minimum token length of 1 can be converted
    if(!transformer.getGaps()){
        throw new IllegalArgumentException("Expected splitter mode, got token matching mode");
    } // End if

    if(transformer.getMinTokenLength() != 1){
        throw new IllegalArgumentException("Expected 1 as minimum token length, got " + transformer.getMinTokenLength() + " as minimum token length");
    }

    Feature feature = encoder.getOnlyFeature(transformer.getInputCol());

    Field<?> field = feature.getField();

    // If lowercasing is enabled, wrap the input field in a LOWERCASE apply expression
    if(transformer.getToLowercase()){
        Apply apply = PMMLUtil.createApply(PMMLFunctions.LOWERCASE, feature.ref());

        field = encoder.createDerivedField(FeatureUtil.createName("lowercase", feature), OpType.CATEGORICAL, DataType.STRING, apply);
    }

    return Collections.singletonList(new DocumentFeature(encoder, field, transformer.getPattern()));
}
Example #2
Source File: RegexTokenizerModelInfoAdapter.java From spark-transformers with Apache License 2.0
@Override
public RegexTokenizerModelInfo getModelInfo(final RegexTokenizer from) {
    final RegexTokenizerModelInfo modelInfo = new RegexTokenizerModelInfo();
    modelInfo.setMinTokenLength(from.getMinTokenLength());
    modelInfo.setGaps(from.getGaps());
    modelInfo.setPattern(from.getPattern());
    modelInfo.setToLowercase(from.getToLowercase());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
Example #3
Source File: RegexTokenizerModelInfoAdapter.java From spark-transformers with Apache License 2.0
@Override
public RegexTokenizerModelInfo getModelInfo(final RegexTokenizer from, final DataFrame df) {
    final RegexTokenizerModelInfo modelInfo = new RegexTokenizerModelInfo();
    modelInfo.setMinTokenLength(from.getMinTokenLength());
    modelInfo.setGaps(from.getGaps());
    modelInfo.setPattern(from.getPattern());
    modelInfo.setToLowercase(from.getToLowercase());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
Example #4
Source File: JavaTokenizerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaTokenizerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, "Hi I heard about Spark"),
        RowFactory.create(1, "I wish Java could use case classes"),
        RowFactory.create(2, "Logistic,regression,models,are,neat")
    );

    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });

    Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema);

    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");

    RegexTokenizer regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W");  // alternatively .setPattern("\\w+").setGaps(false);

    spark.udf().register("countTokens", new UDF1<WrappedArray, Integer>() {
        @Override
        public Integer call(WrappedArray words) {
            return words.size();
        }
    }, DataTypes.IntegerType);

    Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
    tokenized.select("sentence", "words")
        .withColumn("tokens", callUDF("countTokens", col("words")))
        .show(false);

    Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
    regexTokenized.select("sentence", "words")
        .withColumn("tokens", callUDF("countTokens", col("words")))
        .show(false);
    // $example off$

    spark.stop();
}
Example #5
Source File: RegexTokenizerConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0
public RegexTokenizerConverter(RegexTokenizer transformer){
    super(transformer);
}
Example #6
Source File: RegexTokenizerModelInfoAdapter.java From spark-transformers with Apache License 2.0
@Override
public Class getSource() {
    return RegexTokenizer.class;
}
Example #7
Source File: PipelineBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testPipeline() {
    // Prepare training documents, which are labeled.
    StructType schema = createStructType(new StructField[]{
        createStructField("id", LongType, false),
        createStructField("text", StringType, false),
        createStructField("label", DoubleType, false)
    });
    Dataset<Row> trainingData = spark.createDataFrame(Arrays.asList(
        cr(0L, "a b c d e spark", 1.0),
        cr(1L, "b d", 0.0),
        cr(2L, "spark f g h", 1.0),
        cr(3L, "hadoop mapreduce", 0.0)
    ), schema);

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression.
    RegexTokenizer tokenizer = new RegexTokenizer()
        .setInputCol("text")
        .setOutputCol("words")
        .setPattern("\\s")
        .setGaps(true)
        .setToLowercase(false);
    HashingTF hashingTF = new HashingTF()
        .setNumFeatures(1000)
        .setInputCol(tokenizer.getOutputCol())
        .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
        .setMaxIter(10)
        .setRegParam(0.01);
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{tokenizer, hashingTF, lr});

    // Fit the pipeline to training documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);

    // Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipelineModel);
    System.out.println(new String(exportedModel));

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Prepare test data
    StructType testSchema = createStructType(new StructField[]{
        createStructField("id", LongType, false),
        createStructField("text", StringType, false),
    });
    Dataset<Row> testData = spark.createDataFrame(Arrays.asList(
        cr(4L, "spark i j k"),
        cr(5L, "l m n"),
        cr(6L, "mapreduce spark"),
        cr(7L, "apache hadoop")
    ), testSchema);

    // Verify that predictions for the Spark pipeline and the exported pipeline are the same
    List<Row> predictions = sparkPipelineModel.transform(testData)
        .select("id", "text", "probability", "prediction").collectAsList();
    for (Row r : predictions) {
        System.out.println(r);
        double sparkPipelineOp = r.getDouble(3);
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("text", r.getString(1));
        transformer.transform(data);
        double exportedPipelineOp = (double) data.get("prediction");
        double exportedPipelineProb = (double) data.get("probability");
        assertEquals(sparkPipelineOp, exportedPipelineOp, 0.01);
    }
}
Example #8
Source File: RegexTokenizerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testRegexTokenizer() {
    // prepare data
    StructType schema = createStructType(new StructField[]{
        createStructField("rawText", StringType, false),
    });
    List<Row> trainingData = Arrays.asList(
        cr("Test of tok."),
        cr("Te,st. punct")
    );
    Dataset<Row> dataset = spark.createDataFrame(trainingData, schema);

    // train model in spark
    RegexTokenizer sparkModel = new RegexTokenizer()
        .setInputCol("rawText")
        .setOutputCol("tokens")
        .setPattern("\\s")
        .setGaps(true)
        .setToLowercase(false)
        .setMinTokenLength(3);

    // Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    List<Row> pairs = sparkModel.transform(dataset).select("rawText", "tokens").collectAsList();

    for (Row row : pairs) {
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), row.getString(0));
        transformer.transform(data);

        String[] output = (String[]) data.get(sparkModel.getOutputCol());
        Object sparkOp = row.get(1);
        System.out.println(ArrayUtils.toString(output));
        System.out.println(row.get(1));
    }
}
Example #9
Source File: PipelineBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testPipeline() {
    // Prepare training documents, which are labeled.
    StructType schema = createStructType(new StructField[]{
        createStructField("id", LongType, false),
        createStructField("text", StringType, false),
        createStructField("label", DoubleType, false)
    });
    DataFrame trainingData = sqlContext.createDataFrame(Arrays.asList(
        cr(0L, "a b c d e spark", 1.0),
        cr(1L, "b d", 0.0),
        cr(2L, "spark f g h", 1.0),
        cr(3L, "hadoop mapreduce", 0.0)
    ), schema);

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression.
    RegexTokenizer tokenizer = new RegexTokenizer()
        .setInputCol("text")
        .setOutputCol("words")
        .setPattern("\\s")
        .setGaps(true)
        .setToLowercase(false);
    HashingTF hashingTF = new HashingTF()
        .setNumFeatures(1000)
        .setInputCol(tokenizer.getOutputCol())
        .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
        .setMaxIter(10)
        .setRegParam(0.01);
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{tokenizer, hashingTF, lr});

    // Fit the pipeline to training documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);

    // Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipelineModel, trainingData);
    System.out.println(new String(exportedModel));

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Prepare test data
    StructType testSchema = createStructType(new StructField[]{
        createStructField("id", LongType, false),
        createStructField("text", StringType, false),
    });
    DataFrame testData = sqlContext.createDataFrame(Arrays.asList(
        cr(4L, "spark i j k"),
        cr(5L, "l m n"),
        cr(6L, "mapreduce spark"),
        cr(7L, "apache hadoop")
    ), testSchema);

    // Verify that predictions for the Spark pipeline and the exported pipeline are the same
    Row[] predictions = sparkPipelineModel.transform(testData)
        .select("id", "text", "probability", "prediction").collect();
    for (Row r : predictions) {
        System.out.println(r);
        double sparkPipelineOp = r.getDouble(3);
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("text", r.getString(1));
        transformer.transform(data);
        double exportedPipelineOp = (double) data.get("prediction");
        double exportedPipelineProb = (double) data.get("probability");
        assertEquals(sparkPipelineOp, exportedPipelineOp, EPSILON);
    }
}
Example #10
Source File: RegexTokenizerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testRegexTokenizer() {
    // prepare data
    StructType schema = createStructType(new StructField[]{
        createStructField("rawText", StringType, false),
    });
    List<Row> trainingData = Arrays.asList(
        cr("Test of tok."),
        cr("Te,st. punct")
    );
    DataFrame dataset = sqlContext.createDataFrame(trainingData, schema);

    // train model in spark
    RegexTokenizer sparkModel = new RegexTokenizer()
        .setInputCol("rawText")
        .setOutputCol("tokens")
        .setPattern("\\s")
        .setGaps(true)
        .setToLowercase(false)
        .setMinTokenLength(3);

    // Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel, dataset);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] pairs = sparkModel.transform(dataset).select("rawText", "tokens").collect();

    for (Row row : pairs) {
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), row.getString(0));
        transformer.transform(data);

        String[] output = (String[]) data.get(sparkModel.getOutputCol());
        Object sparkOp = row.get(1);
        System.out.println(ArrayUtils.toString(output));
        System.out.println(row.get(1));
    }
}