opennlp.tools.tokenize.Tokenizer Java Examples

The following examples show how to use opennlp.tools.tokenize.Tokenizer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Chapter1.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 6 votes vote down vote up
/**
 * Demonstrates person-name detection: each sample sentence is tokenized with
 * {@link SimpleTokenizer}, passed to a {@link NameFinderME} backed by the
 * {@code en-ner-person.bin} model, and the detected names are printed.
 * An {@link IOException} raised while loading the model is printed, not rethrown.
 */
private static void nameFinderExample() {
    final String[] sentences = {
        "Tim was a good neighbor. Perhaps not as good a Bob "
        + "Haywood, but still pretty good. Of course Mr. Adam "
        + "took the cake!"};
    try {
        final Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
        final File personModelFile = new File("C:\\OpenNLP Models", "en-ner-person.bin");
        final NameFinderME finder = new NameFinderME(new TokenNameFinderModel(personModelFile));

        for (int i = 0; i < sentences.length; i++) {
            // The name finder works on tokens, so split the sentence first.
            final String[] tokens = tokenizer.tokenize(sentences[i]);
            // Each Span covers the token range of one detected name.
            final Span[] nameSpans = finder.find(tokens);
            final String[] names = Span.spansToStrings(nameSpans, tokens);
            System.out.println(Arrays.toString(names));
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example #2
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 6 votes vote down vote up
/**
 * Trains a tokenizer model via {@code createOpenNLPModel()}, then loads
 * {@code ./mymodel.bin} and prints one token of the demonstration paragraph
 * per line. An {@link IOException} is printed, not rethrown.
 */
private static void trainingATokenizer() {
    createOpenNLPModel();
    // Assigned before the stream is opened so the field is updated even when
    // the model file cannot be read, exactly as before.
    paragraph = "A demonstration of how to train a tokenizer.";
    // try-with-resources closes the model stream, which was previously leaked.
    try (InputStream modelInputStream = new FileInputStream(new File(
            ".", "mymodel.bin"))) {
        TokenizerModel model = new TokenizerModel(modelInputStream);
        Tokenizer tokenizer = new TokenizerME(model);
        String[] tokens = tokenizer.tokenize(paragraph);
        for (String token : tokens) {
            System.out.println(token);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example #3
Source File: OpenNLPAnnotator.java    From Stargraph with MIT License 6 votes vote down vote up
/**
 * Tokenizes and part-of-speech tags {@code sentence} with the OpenNLP models
 * for {@code language}, pairing every token with its tag.
 *
 * @param language language used to select the tokenizer model, POS model and POS set
 * @param sentence text to annotate
 * @return one {@link Word} per token, in token order
 */
@Override
public List<Word> doRun(Language language, String sentence) {
    final Tokenizer tokenizer = new TokenizerME(getTokenizerModel(language));
    final POSTaggerME posTagger = new POSTaggerME(getPOSModel(language));

    final String[] tokens = tokenizer.tokenize(sentence);
    final String[] posTags = posTagger.tag(tokens);
    final PartOfSpeechSet tagSet = PartOfSpeechSet.getPOSSet(language);

    final List<Word> annotated = new ArrayList<>();
    int index = 0;
    while (index < tokens.length) {
        // Map the tagger's string label onto the language-specific POS set.
        annotated.add(new Word(tagSet.valueOf(posTags[index]), tokens[index]));
        index++;
    }
    return annotated;
}
 
Example #4
Source File: JM_Scorer.java    From uncc2014watsonsim with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Scores the structural similarity between a question and a passage by
 * parsing both and summing {@code matchChildren} over corresponding parses.
 * The score is 0 when the passage does not contain the candidate answer.
 *
 * @param ca candidate answer; the passage must contain it for a non-zero score
 * @param q question text
 * @param passage passage text
 * @param verbose unused by this method
 * @return the accumulated match score
 * @throws InvalidFormatException if the parser model is malformed
 * @throws IOException if {@code en-parser.bin} cannot be read
 */
public double scoreStructure(String ca, String q, String passage, boolean verbose) throws InvalidFormatException, IOException{
	// Only the parser is needed; the POS tagger and tokenizer models that were
	// previously loaded here were never used. The stream is now closed after loading.
	Parser parser;
	try (FileInputStream parserModelStream = new FileInputStream(new File("en-parser.bin"))) {
		parser = ParserFactory.create(new ParserModel(parserModelStream));
	}

	double score = 0;
	if (!passage.contains(ca)) {
		return score;
	}

	Parse[] questionParse = ParserTool.parseLine(q, parser, 1);
	// Fix: parse the passage itself. The original parsed the question twice,
	// so the question was always compared against itself.
	Parse[] passageParse = ParserTool.parseLine(passage, parser, 1);

	// Bound by the shorter array so a differing parse count cannot overrun.
	int pairs = Math.min(questionParse.length, passageParse.length);
	for (int i = 0; i < pairs; i++) {
		score += matchChildren(questionParse[i], passageParse[i]);
	}

	return score;
}
 
Example #5
Source File: Chapter4.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Runs the person, location and organization NER models over every entry of
 * {@code sentences} and prints each detected span together with the first
 * token of the entity. Any exception is printed, not rethrown.
 */
private static void usingMultipleNERModels() {
    // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin 
    // en-ner-organization.bin en-ner-time.bin
    // try-with-resources closes the tokenizer model stream (previously leaked).
    try (InputStream tokenStream = new FileInputStream(
            new File(getModelDir(), "en-token.bin"))) {

        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);

        String[] modelNames = {"en-ner-person.bin", "en-ner-location.bin",
            "en-ner-organization.bin"};
        ArrayList<String> list = new ArrayList<>();
        for (String name : modelNames) {
            // Each NER model stream is closed once the model has been used.
            try (InputStream entityStream = new FileInputStream(
                    new File(getModelDir(), name))) {
                TokenNameFinderModel entityModel = new TokenNameFinderModel(entityStream);
                NameFinderME nameFinder = new NameFinderME(entityModel);
                for (int index = 0; index < sentences.length; index++) {
                    String[] tokens = tokenizer.tokenize(sentences[index]);
                    Span[] nameSpans = nameFinder.find(tokens);
                    for (Span span : nameSpans) {
                        list.add("Sentence: " + index
                                + " Span: " + span.toString() + " Entity: "
                                + tokens[span.getStart()]);
                    }
                }
            }
        }
        System.out.println("Multiple Entities");
        for (String element : list) {
            System.out.println(element);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
 
Example #6
Source File: Chapter4.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Runs the person, location and organization NER models over every entry of
 * {@code sentences} and prints each detected span together with the first
 * token of the entity. Any exception is printed, not rethrown.
 */
private static void usingMultipleNERModels() {
    // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin 
    // en-ner-organization.bin en-ner-time.bin
    // try-with-resources closes the tokenizer model stream (previously leaked).
    try (InputStream tokenStream = new FileInputStream(
            new File(getModelDir(), "en-token.bin"))) {

        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);

        String[] modelNames = {"en-ner-person.bin", "en-ner-location.bin",
            "en-ner-organization.bin"};
        ArrayList<String> list = new ArrayList<>();
        for (String name : modelNames) {
            // Each NER model stream is closed once the model has been used.
            try (InputStream entityStream = new FileInputStream(
                    new File(getModelDir(), name))) {
                TokenNameFinderModel entityModel = new TokenNameFinderModel(entityStream);
                NameFinderME nameFinder = new NameFinderME(entityModel);
                for (int index = 0; index < sentences.length; index++) {
                    String[] tokens = tokenizer.tokenize(sentences[index]);
                    Span[] nameSpans = nameFinder.find(tokens);
                    for (Span span : nameSpans) {
                        list.add("Sentence: " + index
                                + " Span: " + span.toString() + " Entity: "
                                + tokens[span.getStart()]);
                    }
                }
            }
        }
        System.out.println("Multiple Entities");
        for (String element : list) {
            System.out.println(element);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
 
Example #7
Source File: Chapter1.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Tokenizes a fixed sample sentence with LingPipe's Indo-European tokenizer
 * and prints the tokens on a single line, each followed by a space.
 */
private static void lingpipeExamples() {
    final String text = "A sample sentence processed \nby \tthe "
            + "LingPipe tokenizer.";
    final char[] chars = text.toCharArray();

    final List<String> tokens = new ArrayList<>();
    // Second output list of tokenize(); presumably receives the whitespace
    // between tokens - it is not read afterwards.
    final List<String> whitespaces = new ArrayList<>();

    final com.aliasi.tokenizer.Tokenizer tokenizer =
            IndoEuropeanTokenizerFactory.INSTANCE.tokenizer(chars, 0, chars.length);
    tokenizer.tokenize(tokens, whitespaces);

    for (int i = 0; i < tokens.size(); i++) {
        System.out.print(tokens.get(i) + " ");
    }
    System.out.println();
}
 
Example #8
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Loads {@code en-token.bin} from the model directory and prints each token
 * of {@code paragraph} on its own line using {@link TokenizerME}.
 * An {@link IOException} is printed, not rethrown.
 */
private static void usingTheTokenizerMEClass() {
    // try-with-resources closes the model stream, which was previously leaked.
    try (InputStream modelIn = new FileInputStream(new File(
            getModelDir(), "en-token.bin"))) {
        TokenizerModel model = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(model);
        String[] tokens = tokenizer.tokenize(paragraph);
        for (String token : tokens) {
            System.out.println(token);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example #9
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Tokenizes {@code paragraph} with LingPipe's Indo-European tokenizer factory
 * and prints each token on its own line.
 */
private static void usingLingPipeTokenizers() {
    final char[] chars = paragraph.toCharArray();
    final TokenizerFactory factory = IndoEuropeanTokenizerFactory.INSTANCE;
    final com.aliasi.tokenizer.Tokenizer lingPipeTokenizer =
            factory.tokenizer(chars, 0, chars.length);
    // The tokenizer is iterated directly; each element is one token.
    for (String tokenText : lingPipeTokenizer) {
        System.out.println(tokenText);
    }
}
 
Example #10
Source File: BasicActions.java    From knowledge-extraction with Apache License 2.0 5 votes vote down vote up
/**
 * Tokenizes {@code TEST_PHRASE} with the tokenizer model loaded from the
 * classpath resource {@code Consts.EN_TOKEN_MODEL} and prints the tokens.
 *
 * @return the tokens, or an empty array if the model could not be loaded
 */
public String[] testTokenizer(){
	String[] tokens = {};
	try (InputStream modelIn = BasicActions.class.getClassLoader()
			.getResourceAsStream(Consts.EN_TOKEN_MODEL)) {
		// getResourceAsStream returns null for a missing resource; report that
		// through the same IOException path as any other load failure instead
		// of letting an unchecked exception escape from the model constructor.
		if (modelIn == null) {
			throw new IOException("Tokenizer model resource not found: "
					+ Consts.EN_TOKEN_MODEL);
		}

		TokenizerModel tokenModel = new TokenizerModel(modelIn);
		Tokenizer tokenizer = new TokenizerME(tokenModel);
		tokens = tokenizer.tokenize(TEST_PHRASE);
		System.out.println(Arrays.toString(tokens));
	} catch (IOException e) {
		e.printStackTrace();
	}
	return tokens;
}
 
Example #11
Source File: NERScorer.java    From uncc2014watsonsim with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Splits {@code p} into sentences, prints the names found in each sentence,
 * and returns the top parse of every sentence.
 *
 * @param p passage text
 * @return one parse per detected sentence, in sentence order
 * @throws InvalidFormatException declared for model initialization failures
 */
public Parse[] parsePassageText(String p) throws InvalidFormatException{
	if (!modelsAreInitialized) {
		init();
	}
	SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
	NameFinderME nameFinder = new NameFinderME(this.nerModel);
	Parser parser = ParserFactory.create(
			this.parserModel,
			20, // beam size
			0.95); // advance percentage
	// There are several tokenizers available. SimpleTokenizer works best
	Tokenizer tokenizer = SimpleTokenizer.INSTANCE;

	// find sentences, tokenize each, parse each, return top parse for each
	String[] sentences = sentenceDetector.sentDetect(p);
	Parse[] results = new Parse[sentences.length];
	for (int i = 0; i < sentences.length; i++) {
		String sentence = sentences[i];
		// Fix: handle only the current sentence. The original rescanned every
		// sentence on each outer iteration, printing each name once per sentence.
		Span[] tokenSpans = tokenizer.tokenizePos(sentence);
		String[] tokens = Span.spansToStrings(tokenSpans, sentence);
		for (Span name : nameFinder.find(tokens)) {
			// Name spans index tokens; map them back to character offsets.
			int nameStart = tokenSpans[name.getStart()].getStart();
			int nameEnd = tokenSpans[name.getEnd() - 1].getEnd();
			System.out.println(sentence.substring(nameStart, nameEnd));
		}
		// Fix: join the tokens of this sentence. The original passed the
		// tokenizer object itself to the join call, so the parser never saw
		// the sentence text.
		String sent = String.join(" ", tokens);
		System.out.println("Found sentence " + sent);
		Parse[] sentResults = ParserTool.parseLine(sent, parser, 1);
		results[i] = sentResults[0];
	}
	return results;
}
 
Example #12
Source File: OpenNlpModule.java    From SciGraph with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@link TokenizerME} from the classpath resource
 * {@code /opennlp/en-token.bin}; the resource stream is closed before returning.
 *
 * @return a tokenizer backed by the loaded model
 * @throws IOException if reading the model fails
 */
@CheckedProvides(TokenizerProvider.class)
Tokenizer getTokenizer() throws IOException {
  try (InputStream modelStream = getClass().getResourceAsStream("/opennlp/en-token.bin")) {
    return new TokenizerME(new TokenizerModel(modelStream));
  }
}
 
Example #13
Source File: NERDemo.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 4 votes vote down vote up
/**
 * Demonstrates person-name recognition on one sentence and then on several
 * sentences, printing every detected span and the first token of each entity.
 * Any exception is printed to standard output, not rethrown.
 *
 * @param args unused
 */
public static void main(String args[]){
    String sentences[] = {"Joe was the last person to see Fred. ", 
        "He saw him in Boston at McKenzie's pub at 3:00 where he " 
        + " paid $2.45 for an ale. ", 
        "Joe wanted to go to Vermont for the day to visit a cousin who " 
        + "works at IBM, but Sally and he had to look for Fred"}; 
    String sentence = "He was the last person to see Fred."; 
    // try-with-resources closes both model streams, which were previously leaked.
    try (InputStream tokenStream = new FileInputStream(new File(getResourcePath()+ "en-token.bin"));
         InputStream modelStream = new FileInputStream(new File(getResourcePath() + "en-ner-person.bin")))
    {
        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        TokenNameFinderModel entityModel = new TokenNameFinderModel(modelStream);
        NameFinderME nameFinder = new NameFinderME(entityModel);
        String tokens1[] = tokenizer.tokenize(sentence);
        Span nameSpans1[] = nameFinder.find(tokens1);
        for (Span span : nameSpans1) {
            System.out.println("Span: " + span.toString());
            // Only the first token of the span is printed as the entity.
            System.out.println("Entity: "
                + tokens1[span.getStart()]);
        }

        System.out.println("---------- Multiple Sentences -----------");
        for (String sentence1 : sentences) {
            String tokens[] = tokenizer.tokenize(sentence1);
            Span nameSpans[] = nameFinder.find(tokens);
            for (Span span : nameSpans) {
                System.out.println("Span: " + span.toString());
                System.out.println("Entity: "
                    + tokens[span.getStart()]);
            }
            System.out.println();
        }

    }
    catch(Exception e){
        System.out.println(e);
    }
}
 
Example #14
Source File: OpenNlpModule.java    From SciGraph with Apache License 2.0 4 votes vote down vote up
/**
 * Supplies a {@link Tokenizer}.
 *
 * @return the tokenizer
 * @throws IOException declared so that implementations may report I/O failures
 */
@Override
Tokenizer get() throws IOException;
 
Example #15
Source File: Model.java    From DataDefender with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a model that stores the given tokenizer, name finder and name.
 *
 * @param tokenizer tokenizer stored on this model
 * @param nameFinder name finder stored on this model
 * @param name name stored on this model
 */
public Model(final Tokenizer tokenizer, final NameFinderME nameFinder, final String name) {
    this.tokenizer = tokenizer;
    this.nameFinder = nameFinder;
    this.name = name;
}
 
Example #16
Source File: Model.java    From DataDefender with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the tokenizer held by this model.
 *
 * @return the stored tokenizer
 */
public Tokenizer getTokenizer() {
    return tokenizer;
}