org.apache.spark.ml.feature.RegexTokenizer Java Examples

The following examples show how to use org.apache.spark.ml.feature.RegexTokenizer. They are taken from open-source projects; each example lists its source file and license.
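Before the project-specific examples, here is a minimal sketch of the core RegexTokenizer API. The DataFrame name, column names, and the chosen pattern are illustrative assumptions rather than values taken from the examples below.

import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Assumes a Dataset<Row> named "sentences" with a string column "text".
RegexTokenizer regexTokenizer = new RegexTokenizer()
    .setInputCol("text")        // raw string column to tokenize
    .setOutputCol("words")      // output column that will hold the token array
    .setPattern("\\W")          // with gaps=true, the pattern marks the delimiters
    .setGaps(true)              // false would make the pattern match the tokens themselves
    .setMinTokenLength(1)       // tokens shorter than this are dropped
    .setToLowercase(true);      // lowercase the text before tokenizing

Dataset<Row> tokenized = regexTokenizer.transform(sentences);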
Example #1
Source File: RegexTokenizerConverter.java    From jpmml-sparkml with GNU Affero General Public License v3.0
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
	RegexTokenizer transformer = getTransformer();

	// Only splitter ("gaps") mode is supported, i.e. the pattern marks the delimiters between tokens
	if(!transformer.getGaps()){
		throw new IllegalArgumentException("Expected splitter mode, got token matching mode");
	} // End if

	// Only the default minimum token length of 1 is supported
	if(transformer.getMinTokenLength() != 1){
		throw new IllegalArgumentException("Expected 1 as minimum token length, got " + transformer.getMinTokenLength() + " as minimum token length");
	}

	Feature feature = encoder.getOnlyFeature(transformer.getInputCol());

	Field<?> field = feature.getField();

	// When case folding is enabled, derive a lowercased copy of the input field
	if(transformer.getToLowercase()){
		Apply apply = PMMLUtil.createApply(PMMLFunctions.LOWERCASE, feature.ref());

		field = encoder.createDerivedField(FeatureUtil.createName("lowercase", feature), OpType.CATEGORICAL, DataType.STRING, apply);
	}

	return Collections.singletonList(new DocumentFeature(encoder, field, transformer.getPattern()));
}
 
Example #2
Source File: RegexTokenizerModelInfoAdapter.java    From spark-transformers with Apache License 2.0
@Override
public RegexTokenizerModelInfo getModelInfo(final RegexTokenizer from) {
    final RegexTokenizerModelInfo modelInfo = new RegexTokenizerModelInfo();
    modelInfo.setMinTokenLength(from.getMinTokenLength());
    modelInfo.setGaps(from.getGaps());
    modelInfo.setPattern(from.getPattern());
    modelInfo.setToLowercase(from.getToLowercase());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
 
Example #3
Source File: RegexTokenizerModelInfoAdapter.java    From spark-transformers with Apache License 2.0
@Override
public RegexTokenizerModelInfo getModelInfo(final RegexTokenizer from, final DataFrame df) {
    final RegexTokenizerModelInfo modelInfo = new RegexTokenizerModelInfo();
    modelInfo.setMinTokenLength(from.getMinTokenLength());
    modelInfo.setGaps(from.getGaps());
    modelInfo.setPattern(from.getPattern());
    modelInfo.setToLowercase(from.getToLowercase());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
 
Example #4
Source File: JavaTokenizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaTokenizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, "Hi I heard about Spark"),
    RowFactory.create(1, "I wish Java could use case classes"),
    RowFactory.create(2, "Logistic,regression,models,are,neat")
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
  });

  Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema);

  Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");

  RegexTokenizer regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W");  // alternatively .setPattern("\\w+").setGaps(false);

  spark.udf().register("countTokens", new UDF1<WrappedArray, Integer>() {
    @Override
    public Integer call(WrappedArray words) {
      return words.size();
    }
  }, DataTypes.IntegerType);

  Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
  tokenized.select("sentence", "words")
      .withColumn("tokens", callUDF("countTokens", col("words"))).show(false);

  Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
  regexTokenized.select("sentence", "words")
      .withColumn("tokens", callUDF("countTokens", col("words"))).show(false);
  // $example off$

  spark.stop();
}
 
Example #5
Source File: RegexTokenizerConverter.java    From jpmml-sparkml with GNU Affero General Public License v3.0
public RegexTokenizerConverter(RegexTokenizer transformer){
	super(transformer);
}
 
Example #6
Source File: RegexTokenizerModelInfoAdapter.java    From spark-transformers with Apache License 2.0
@Override
public Class getSource() {
    return RegexTokenizer.class;
}
 
Example #7
Source File: PipelineBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testPipeline() {
    // Prepare training documents, which are labeled.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
            createStructField("label", DoubleType, false)
    });
    Dataset<Row> trainingData = spark.createDataFrame(Arrays.asList(
            cr(0L, "a b c d e spark", 1.0),
            cr(1L, "b d", 0.0),
            cr(2L, "spark f g h", 1.0),
            cr(3L, "hadoop mapreduce", 0.0)
    ), schema);

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression.
    RegexTokenizer tokenizer = new RegexTokenizer()
            .setInputCol("text")
            .setOutputCol("words")
            .setPattern("\\s")
            .setGaps(true)
            .setToLowercase(false);

    HashingTF hashingTF = new HashingTF()
            .setNumFeatures(1000)
            .setInputCol(tokenizer.getOutputCol())
            .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
            .setMaxIter(10)
            .setRegParam(0.01);
    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{tokenizer, hashingTF, lr});

    // Fit the pipeline to training documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);


    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipelineModel);
    System.out.println(new String(exportedModel));

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //prepare test data
    StructType testSchema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
    });
    Dataset<Row> testData = spark.createDataFrame(Arrays.asList(
            cr(4L, "spark i j k"),
            cr(5L, "l m n"),
            cr(6L, "mapreduce spark"),
            cr(7L, "apache hadoop")
    ), testSchema);

    //verify that predictions for spark pipeline and exported pipeline are the same
    List<Row> predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collectAsList();
    for (Row r : predictions) {
        System.out.println(r);
        double sparkPipelineOp = r.getDouble(3);
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("text", r.getString(1));
        transformer.transform(data);
        double exportedPipelineOp = (double) data.get("prediction");
        double exportedPipelineProb = (double) data.get("probability");
        assertEquals(sparkPipelineOp, exportedPipelineOp, 0.01);
    }
}
 
Example #8
Source File: RegexTokenizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testRegexTokenizer() {

    //prepare data
    StructType schema = createStructType(new StructField[]{
            createStructField("rawText", StringType, false),
    });
    List<Row> trainingData = Arrays.asList(
            cr("Test of tok."),
            cr("Te,st.  punct")
    );
    Dataset<Row> dataset = spark.createDataFrame(trainingData, schema);

    //train model in spark
    RegexTokenizer sparkModel = new RegexTokenizer()
            .setInputCol("rawText")
            .setOutputCol("tokens")
            .setPattern("\\s")
            .setGaps(true)
            .setToLowercase(false)
            .setMinTokenLength(3);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    List<Row> pairs = sparkModel.transform(dataset).select("rawText", "tokens").collectAsList();
    for (Row row : pairs) {

        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), row.getString(0));
        transformer.transform(data);
        String[] output = (String[]) data.get(sparkModel.getOutputCol());

        Object sparkOp = row.get(1);
        System.out.println(ArrayUtils.toString(output));
        System.out.println(row.get(1));
    }
}
 
Example #9
Source File: RegexTokenizerModelInfoAdapter.java    From spark-transformers with Apache License 2.0
@Override
public Class getSource() {
    return RegexTokenizer.class;
}
 
Example #10
Source File: PipelineBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testPipeline() {
    // Prepare training documents, which are labeled.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
            createStructField("label", DoubleType, false)
    });
    DataFrame trainingData = sqlContext.createDataFrame(Arrays.asList(
            cr(0L, "a b c d e spark", 1.0),
            cr(1L, "b d", 0.0),
            cr(2L, "spark f g h", 1.0),
            cr(3L, "hadoop mapreduce", 0.0)
    ), schema);

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression.
    RegexTokenizer tokenizer = new RegexTokenizer()
            .setInputCol("text")
            .setOutputCol("words")
            .setPattern("\\s")
            .setGaps(true)
            .setToLowercase(false);

    HashingTF hashingTF = new HashingTF()
            .setNumFeatures(1000)
            .setInputCol(tokenizer.getOutputCol())
            .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
            .setMaxIter(10)
            .setRegParam(0.01);
    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{tokenizer, hashingTF, lr});

    // Fit the pipeline to training documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);


    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipelineModel, trainingData);
    System.out.println(new String(exportedModel));

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //prepare test data
    StructType testSchema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
    });
    DataFrame testData = sqlContext.createDataFrame(Arrays.asList(
            cr(4L, "spark i j k"),
            cr(5L, "l m n"),
            cr(6L, "mapreduce spark"),
            cr(7L, "apache hadoop")
    ), testSchema);

    //verify that predictions for spark pipeline and exported pipeline are the same
    Row[] predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collect();
    for (Row r : predictions) {
        System.out.println(r);
        double sparkPipelineOp = r.getDouble(3);
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("text", r.getString(1));
        transformer.transform(data);
        double exportedPipelineOp = (double) data.get("prediction");
        double exportedPipelineProb = (double) data.get("probability");
        assertEquals(sparkPipelineOp, exportedPipelineOp, EPSILON);
    }
}
 
Example #11
Source File: RegexTokenizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testRegexTokenizer() {

    //prepare data
    StructType schema = createStructType(new StructField[]{
            createStructField("rawText", StringType, false),
    });
    List<Row> trainingData = Arrays.asList(
            cr("Test of tok."),
            cr("Te,st.  punct")
    );
    DataFrame dataset = sqlContext.createDataFrame(trainingData, schema);

    //train model in spark
    RegexTokenizer sparkModel = new RegexTokenizer()
            .setInputCol("rawText")
            .setOutputCol("tokens")
            .setPattern("\\s")
            .setGaps(true)
            .setToLowercase(false)
            .setMinTokenLength(3);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel, dataset);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] pairs = sparkModel.transform(dataset).select("rawText", "tokens").collect();
    for (Row row : pairs) {

        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), row.getString(0));
        transformer.transform(data);
        String[] output = (String[]) data.get(sparkModel.getOutputCol());

        Object sparkOp = row.get(1);
        System.out.println(ArrayUtils.toString(output));
        System.out.println(row.get(1));
    }
}