org.dmg.pmml.TextIndex Java Examples

The following examples show how to use org.dmg.pmml.TextIndex. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TextUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Returns the tokens of the term value held by this processor.
 *
 * A token list that is already present in the term token cache is returned as is;
 * otherwise the value is tokenized and the result is stored in the cache.
 *
 * @return The tokens of the term value.
 */
@Override
public List<String> process(){
	TextIndex textIndex = getTextIndex();
	FieldValue value = getValue();

	Cache<FieldValue, List<String>> cache = CacheUtil.getValue(textIndex, TextUtil.termTokenCaches, TextUtil.termTokenCacheLoader);

	List<String> cachedTokens = cache.getIfPresent(value);
	if(cachedTokens != null){
		return cachedTokens;
	}

	// Cache miss: terms are tokenized directly, without a normalization pass
	List<String> freshTokens = TextUtil.tokenize(textIndex, value.asString());

	cache.put(value, freshTokens);

	return freshTokens;
}
 
Example #2
Source File: UnsupportedMarkupInspector.java    From jpmml-evaluator with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Reports the TextIndex settings that this inspector flags as unsupported:
 * a disabled <code>tokenize</code> attribute, and the augmented normalized term frequency weighting.
 *
 * @param textIndex The element being visited.
 *
 * @return The action returned by the superclass visit.
 */
@Override
public VisitorAction visit(TextIndex textIndex){

	if(!textIndex.isTokenize()){
		report(new UnsupportedAttributeException(textIndex, PMMLAttributes.TEXTINDEX_TOKENIZE, false));
	}

	TextIndex.LocalTermWeights weights = textIndex.getLocalTermWeights();
	switch(weights){
		case AUGMENTED_NORMALIZED_TERM_FREQUENCY:
			report(new UnsupportedAttributeException(textIndex, weights));
			break;
		default:
			// Every other weighting scheme passes this check
			break;
	}

	return super.visit(textIndex);
}
 
Example #3
Source File: TextUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Returns the tokens of the text value held by this processor.
 *
 * A token list that is already present in the text token cache is returned as is;
 * otherwise the value is normalized, tokenized, and the result is stored in the cache.
 *
 * @return The tokens of the text value.
 */
@Override
public List<String> process(){
	TextIndex textIndex = getTextIndex();
	FieldValue value = getValue();

	Cache<FieldValue, List<String>> cache = CacheUtil.getValue(textIndex, TextUtil.textTokenCaches, TextUtil.textTokenCacheLoader);

	List<String> cachedTokens = cache.getIfPresent(value);
	if(cachedTokens != null){
		return cachedTokens;
	}

	// Cache miss: the text goes through the normalization steps before being split into tokens
	String rawText = value.asString();
	String normalizedText = TextUtil.normalize(textIndex, rawText);

	List<String> freshTokens = TextUtil.tokenize(textIndex, normalizedText);

	cache.put(value, freshTokens);

	return freshTokens;
}
 
Example #4
Source File: ExpressionUtilTest.java    From jpmml-evaluator with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Evaluates a binary TextIndex for the term "ui_good" over a sample sentence,
 * after applying two chained normalization steps, and expects a single hit.
 */
@Test
public void evaluateTextIndexNormalization(){
	FieldName name = FieldName.create("x");

	// First step: uses the default column names of TextIndexNormalization
	// NOTE(review): rows look like (pattern, replacement, regex flag) - confirm against createInlineTable
	List<List<String>> stepOneCells = Arrays.asList(
		Arrays.asList("interfaces?", "interface", "true"),
		Arrays.asList("is|are|seem(ed|s?)|were", "be", "true"),
		Arrays.asList("user friendl(y|iness)", "user_friendly", "true")
	);

	TextIndexNormalization stepOne = new TextIndexNormalization();
	stepOne.setInlineTable(createInlineTable(stepOneCells, stepOne));

	// Second step: custom input and output column names
	List<List<String>> stepTwoCells = Arrays.asList(
		Arrays.asList("interface be (user_friendly|well designed|excellent)", "ui_good", "true")
	);

	TextIndexNormalization stepTwo = new TextIndexNormalization()
		.setInField("re")
		.setOutField("feature");
	stepTwo.setInlineTable(createInlineTable(stepTwoCells, stepTwo));

	TextIndex textIndex = new TextIndex(name, new Constant("ui_good"))
		.setLocalTermWeights(TextIndex.LocalTermWeights.BINARY)
		.setCaseSensitive(false)
		.addTextIndexNormalizations(stepOne, stepTwo);

	String document = "Testing the app for a few days convinced me the interfaces are excellent!";

	assertEquals(1, evaluate(textIndex, name, document));
}
 
Example #5
Source File: ExpressionUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Evaluates a TextIndex element: counts the occurrences of the term in the text field value,
 * and converts the count to a field value according to the local term weighting scheme.
 *
 * @param textIndex The element to evaluate.
 * @param context The evaluation context that supplies the text field value.
 *
 * @return The weighted term frequency, or {@link FieldValues#MISSING_VALUE} if the text or the term is missing.
 *
 * @throws MissingAttributeException If the text field name is not set.
 * @throws UnsupportedAttributeException If the local term weighting scheme is not handled here.
 */
static
public FieldValue evaluateTextIndex(TextIndex textIndex, EvaluationContext context){
	FieldName textFieldName = textIndex.getTextField();
	if(textFieldName == null){
		throw new MissingAttributeException(textIndex, PMMLAttributes.TEXTINDEX_TEXTFIELD);
	}

	FieldValue text = context.evaluate(textFieldName);
	FieldValue term = ExpressionUtil.evaluateExpressionContainer(textIndex, context);

	// See http://mantis.dmg.org/view.php?id=171
	boolean incomplete = (FieldValueUtil.isMissing(text) || FieldValueUtil.isMissing(term));
	if(incomplete){
		return FieldValues.MISSING_VALUE;
	}

	List<String> textTokens = new TextUtil.TextProcessor(textIndex, text).process();
	List<String> termTokens = new TextUtil.TermProcessor(textIndex, term).process();

	int frequency = TextUtil.termFrequency(textIndex, textTokens, termTokens);

	TextIndex.LocalTermWeights weights = textIndex.getLocalTermWeights();
	switch(weights){
		// Both schemes return the integer count unchanged
		case BINARY:
		case TERM_FREQUENCY:
			return FieldValueUtil.create(TypeInfos.CONTINUOUS_INTEGER, frequency);
		case LOGARITHMIC:
			return FieldValueUtil.create(TypeInfos.CONTINUOUS_DOUBLE, Math.log10(1d + frequency));
		default:
			throw new UnsupportedAttributeException(textIndex, weights);
	}
}
 
Example #6
Source File: TextUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Applies all normalization steps of a TextIndex element to a string, in declaration order.
 *
 * @param textIndex The element whose normalization steps are applied.
 * @param string The string to normalize.
 *
 * @return The normalized string, or the input string if there are no normalization steps.
 */
static
public String normalize(TextIndex textIndex, String string){

	if(!textIndex.hasTextIndexNormalizations()){
		return string;
	}

	String result = string;

	// The output of each step is the input of the next one
	for(TextIndexNormalization step : textIndex.getTextIndexNormalizations()){
		result = TextUtil.normalize(textIndex, step, result);
	}

	return result;
}
 
Example #7
Source File: CountVectorizerModelConverter.java    From jpmml-sparkml with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Encodes a CountVectorizerModel as a term frequency function plus one term feature per vocabulary term.
 *
 * The function wraps a TextIndex element that takes a document and a term as parameters.
 * Every non-empty stop word set of the document feature becomes a recursive,
 * regex-based normalization step that replaces matches with a single space.
 *
 * @param encoder The encoder that supplies the input feature and receives the function definition.
 *
 * @return One term feature per vocabulary term, in vocabulary order.
 *
 * @throws IllegalArgumentException If the word separator pattern is neither "\\s+" nor "\\W+",
 * or if a vocabulary term contains punctuation.
 */
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
	CountVectorizerModel transformer = getTransformer();

	DocumentFeature documentFeature = (DocumentFeature)encoder.getOnlyFeature(transformer.getInputCol());

	ParameterField documentField = new ParameterField(FieldName.create("document"));
	ParameterField termField = new ParameterField(FieldName.create("term"));

	TextIndex textIndex = new TextIndex(documentField.getName(), new FieldRef(termField.getName()));
	textIndex.setTokenize(Boolean.TRUE);
	textIndex.setWordSeparatorCharacterRE(documentFeature.getWordSeparatorRE());
	// A null weighting leaves the attribute unset
	textIndex.setLocalTermWeights(transformer.getBinary() ? TextIndex.LocalTermWeights.BINARY : null);

	for(DocumentFeature.StopWordSet stopWordSet : documentFeature.getStopWordSets()){

		if(stopWordSet.isEmpty()){
			continue;
		}

		String separatorRE = documentFeature.getWordSeparatorRE();

		// The stop word pattern must agree with the way the document is split into words
		String stopWordRE;

		switch(separatorRE){
			case "\\s+":
				stopWordRE = "(^|\\s+)\\p{Punct}*(" + JOINER.join(stopWordSet) + ")\\p{Punct}*(\\s+|$)";
				break;
			case "\\W+":
				stopWordRE = "(\\W+)(" + JOINER.join(stopWordSet) + ")(\\W+)";
				break;
			default:
				throw new IllegalArgumentException("Expected \"\\s+\" or \"\\W+\" as splitter regex pattern, got \"" + separatorRE + "\"");
		}

		// Single-row table: replace every stop word match with a space
		Map<String, List<String>> columns = new LinkedHashMap<>();
		columns.put("string", Collections.singletonList(stopWordRE));
		columns.put("stem", Collections.singletonList(" "));
		columns.put("regex", Collections.singletonList("true"));

		TextIndexNormalization stopWordRemoval = new TextIndexNormalization(null, PMMLUtil.createInlineTable(columns))
			.setCaseSensitive(stopWordSet.isCaseSensitive())
			.setRecursive(Boolean.TRUE); // Handles consecutive matches. See http://stackoverflow.com/a/25085385

		textIndex.addTextIndexNormalizations(stopWordRemoval);
	}

	String functionId = "tf" + "@" + String.valueOf(CountVectorizerModelConverter.SEQUENCE.getAndIncrement());

	DefineFunction defineFunction = new DefineFunction(functionId, OpType.CONTINUOUS, DataType.INTEGER, null, textIndex)
		.addParameterFields(documentField, termField);

	encoder.addDefineFunction(defineFunction);

	List<Feature> features = new ArrayList<>();

	for(String term : transformer.vocabulary()){

		if(TermUtil.hasPunctuation(term)){
			throw new IllegalArgumentException("Punctuated vocabulary terms (" + term + ") are not supported");
		}

		features.add(new TermFeature(encoder, defineFunction, documentFeature, term));
	}

	return features;
}
 
Example #8
Source File: TextUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Counts the occurrences of a tokenized term in a tokenized text,
 * using the matching settings of a TextIndex element.
 *
 * @param textIndex The element that supplies the case sensitivity, maximum Levenshtein distance,
 * hit counting and local term weighting settings.
 * @param textTokens The tokens of the text.
 * @param termTokens The tokens of the term.
 *
 * @return The term frequency; zero if either token list is empty.
 *
 * @throws InvalidAttributeException If the maximum Levenshtein distance is negative.
 * @throws UnsupportedAttributeException If the hit counting or local term weighting setting is not handled here.
 */
static
public int termFrequency(TextIndex textIndex, List<String> textTokens, List<String> termTokens){

	boolean bothNonEmpty = !textTokens.isEmpty() && !termTokens.isEmpty();
	if(!bothNonEmpty){
		return 0;
	}

	boolean matchCase = textIndex.isCaseSensitive();

	int maxDistance = textIndex.getMaxLevenshteinDistance();
	if(maxDistance < 0){
		throw new InvalidAttributeException(textIndex, PMMLAttributes.TEXTINDEX_MAXLEVENSHTEINDISTANCE, maxDistance);
	}

	TextIndex.CountHits hitCounting = textIndex.getCountHits();

	boolean bestHitsOnly;

	switch(hitCounting){
		case BEST_HITS:
			bestHitsOnly = true;
			break;
		case ALL_HITS:
			bestHitsOnly = false;
			break;
		default:
			throw new UnsupportedAttributeException(textIndex, hitCounting);
	}

	TextIndex.LocalTermWeights weights = textIndex.getLocalTermWeights();

	// Upper limit that is handed to the counting routine
	int frequencyLimit;

	switch(weights){
		case BINARY:
			frequencyLimit = 1;
			break;
		case TERM_FREQUENCY:
		case LOGARITHMIC:
			frequencyLimit = Integer.MAX_VALUE;
			break;
		default:
			throw new UnsupportedAttributeException(textIndex, weights);
	}

	try {
		return termFrequency(textTokens, termTokens, matchCase, maxDistance, bestHitsOnly, frequencyLimit);
	} catch(PMMLException pe){
		// Rethrow whatever ensureContext hands back for this element
		throw pe.ensureContext(textIndex);
	}
}
 
Example #9
Source File: TextUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Creates a term processor by delegating both arguments to the superclass constructor.
 *
 * @param textIndex The TextIndex element.
 * @param value The term value.
 */
TermProcessor(TextIndex textIndex, FieldValue value){
	super(textIndex, value);
}
 
Example #10
Source File: TextUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Creates a text processor by delegating both arguments to the superclass constructor.
 *
 * @param textIndex The TextIndex element.
 * @param value The text value.
 */
TextProcessor(TextIndex textIndex, FieldValue value){
	super(textIndex, value);
}
 
Example #11
Source File: TextUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Stores the TextIndex element in the <code>textIndex</code> field.
 *
 * @param textIndex The TextIndex element.
 */
private void setTextIndex(TextIndex textIndex){
	this.textIndex = textIndex;
}
 
Example #12
Source File: TextUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Returns the value of the <code>textIndex</code> field.
 *
 * @return The TextIndex element.
 */
public TextIndex getTextIndex(){
	return this.textIndex;
}
 
Example #13
Source File: TextUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 4 votes vote down vote up
public StringProcessor(TextIndex textIndex, FieldValue value){
	setTextIndex(Objects.requireNonNull(textIndex));
	setValue(Objects.requireNonNull(value));
}
 
Example #14
Source File: TextUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Applies a single normalization step to a string.
 *
 * The tokenization, word separator, case sensitivity and maximum Levenshtein distance settings
 * are read from the normalization step; a setting that is <code>null</code> there is read from the TextIndex element instead.
 *
 * @param textIndex The enclosing TextIndex element, which supplies the fallback settings.
 * @param textIndexNormalization The normalization step to apply.
 * @param string The string to normalize.
 *
 * @return The normalized string, or the input string if the step has no inline table.
 *
 * @throws InvalidAttributeException If the effective maximum Levenshtein distance is negative.
 */
static
public String normalize(TextIndex textIndex, TextIndexNormalization textIndexNormalization, String string){
	Boolean tokenize = textIndexNormalization.isTokenize();
	if(tokenize == null){
		tokenize = textIndex.isTokenize();
	}

	// Stays null when tokenization is disabled
	TextTokenizer tokenizer = null;

	if(tokenize){
		String separatorRE = textIndexNormalization.getWordSeparatorCharacterRE();

		// The element that declared the pattern is handed to the regex compiler alongside it
		PMMLObject separatorOwner = textIndexNormalization;

		if(separatorRE == null){
			separatorRE = textIndex.getWordSeparatorCharacterRE();
			separatorOwner = textIndex;
		}

		tokenizer = new TextTokenizer(RegExUtil.compile(separatorRE, separatorOwner));
	}

	Boolean caseSensitive = textIndexNormalization.isCaseSensitive();
	if(caseSensitive == null){
		caseSensitive = textIndex.isCaseSensitive();
	}

	// A negative distance is blamed on whichever element declared it
	Integer maxLevenshteinDistance = textIndexNormalization.getMaxLevenshteinDistance();
	if(maxLevenshteinDistance != null){

		if(maxLevenshteinDistance < 0){
			throw new InvalidAttributeException(textIndexNormalization, PMMLAttributes.TEXTINDEXNORMALIZATION_MAXLEVENSHTEINDISTANCE, maxLevenshteinDistance);
		}
	} else

	{
		maxLevenshteinDistance = textIndex.getMaxLevenshteinDistance();

		if(maxLevenshteinDistance < 0){
			throw new InvalidAttributeException(textIndex, PMMLAttributes.TEXTINDEX_MAXLEVENSHTEINDISTANCE, maxLevenshteinDistance);
		}
	}

	InlineTable inlineTable = InlineTableUtil.getInlineTable(textIndexNormalization);
	if(inlineTable == null){
		return string;
	}

	String inField = textIndexNormalization.getInField();
	String outField = textIndexNormalization.getOutField();
	String regexField = textIndexNormalization.getRegexField();

	String current = string;

	for(;;){
		String next;

		try {
			next = normalize(inlineTable, inField, outField, regexField, current, tokenizer, caseSensitive, maxLevenshteinDistance);
		} catch(PMMLException pe){
			throw pe.ensureContext(textIndexNormalization);
		}

		// "If the recursive flag is set to true, then the normalization table is reapplied until none of its rows causes a change to the input text."
		if(!textIndexNormalization.isRecursive() || next.equals(current)){
			return next;
		}

		current = next;
	}
}
 
Example #15
Source File: ExpressionUtil.java    From jpmml-evaluator with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Evaluates an expression by dispatching on its runtime type to the matching evaluation method.
 *
 * @param expression The expression to evaluate.
 * @param context The evaluation context; passed on to every evaluation method except the one for constants.
 *
 * @return The value of the expression.
 *
 * @throws UnsupportedElementException If the expression is of none of the handled types.
 */
static
FieldValue evaluateExpression(Expression expression, EvaluationContext context){

	// Every branch returns, so the checks are written as independent guards
	if(expression instanceof Constant){
		return evaluateConstant((Constant)expression);
	}

	if(expression instanceof FieldRef){
		return evaluateFieldRef((FieldRef)expression, context);
	}

	if(expression instanceof NormContinuous){
		return evaluateNormContinuous((NormContinuous)expression, context);
	}

	if(expression instanceof NormDiscrete){
		return evaluateNormDiscrete((NormDiscrete)expression, context);
	}

	if(expression instanceof Discretize){
		return evaluateDiscretize((Discretize)expression, context);
	}

	if(expression instanceof MapValues){
		return evaluateMapValues((MapValues)expression, context);
	}

	if(expression instanceof TextIndex){
		return evaluateTextIndex((TextIndex)expression, context);
	}

	if(expression instanceof Apply){
		return evaluateApply((Apply)expression, context);
	}

	if(expression instanceof Aggregate){
		return evaluateAggregate((Aggregate)expression, context);
	}

	if(expression instanceof JavaExpression){
		return evaluateJavaExpression((JavaExpression)expression, context);
	}

	throw new UnsupportedElementException(expression);
}
 
Example #16
Source File: FieldReferenceFinder.java    From jpmml-model with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Passes the text field of a TextIndex element to <code>process</code>, then continues with the superclass visit.
 *
 * @param textIndex The element being visited.
 *
 * @return The action returned by the superclass visit.
 */
@Override
public VisitorAction visit(TextIndex textIndex){
	// NOTE(review): presumably records the field name as a referenced field - confirm against process
	process(textIndex.getTextField());

	return super.visit(textIndex);
}
 
Example #17
Source File: CountVectorizer.java    From jpmml-sklearn with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Encodes this vectorizer as a function that wraps a TextIndex element
 * taking a document and a term as parameters.
 *
 * Only the "word" analyzer is accepted, and neither a preprocessor nor accent stripping may be set.
 * When there are stop words and the n-gram range is not (1, 1), a recursive,
 * regex-based normalization step is added that replaces stop word matches with a single space.
 *
 * @return The function definition.
 *
 * @throws IllegalArgumentException If the analyzer, preprocessor or accent stripping setting is not accepted.
 */
public DefineFunction encodeDefineFunction(){
	String analyzer = getAnalyzer();
	List<String> stopWords = getStopWords();
	Object[] nGramRange = getNGramRange();
	Boolean binary = getBinary();
	Object preprocessor = getPreprocessor();
	String stripAccents = getStripAccents();
	Splitter tokenizer = getTokenizer();

	switch(analyzer){
		case "word":
			break;
		default:
			throw new IllegalArgumentException(analyzer);
	}

	if(preprocessor != null){
		throw new IllegalArgumentException();
	}

	if(stripAccents != null){
		throw new IllegalArgumentException(stripAccents);
	}

	ParameterField documentField = new ParameterField(FieldName.create("document"));
	ParameterField termField = new ParameterField(FieldName.create("term"));

	TextIndex textIndex = new TextIndex(documentField.getName(), new FieldRef(termField.getName()));
	textIndex.setTokenize(Boolean.TRUE);
	textIndex.setWordSeparatorCharacterRE(tokenizer.getSeparatorRE());
	// A null weighting leaves the attribute unset
	textIndex.setLocalTermWeights(binary ? TextIndex.LocalTermWeights.BINARY : null);

	if(stopWords != null && !stopWords.isEmpty()){

		if(!Arrays.equals(nGramRange, new Integer[]{1, 1})){
			String stopWordRE = "(^|\\s+)\\p{Punct}*(" + JOINER.join(stopWords) + ")\\p{Punct}*(\\s+|$)";

			// Single-row table: replace every stop word match with a space
			Map<String, List<String>> columns = new LinkedHashMap<>();
			columns.put("string", Collections.singletonList(stopWordRE));
			columns.put("stem", Collections.singletonList(" "));
			columns.put("regex", Collections.singletonList("true"));

			TextIndexNormalization stopWordRemoval = new TextIndexNormalization(null, PMMLUtil.createInlineTable(columns))
				.setRecursive(Boolean.TRUE); // Handles consecutive matches. See http://stackoverflow.com/a/25085385

			textIndex.addTextIndexNormalizations(stopWordRemoval);
		}
	}

	String name = functionName() + "@" + String.valueOf(CountVectorizer.SEQUENCE.getAndIncrement());

	return new DefineFunction(name, OpType.CONTINUOUS, DataType.DOUBLE, null, textIndex)
		.addParameterFields(documentField, termField);
}