opennlp.tools.tokenize.WhitespaceTokenizer Java Examples

The following examples show how to use opennlp.tools.tokenize.WhitespaceTokenizer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Chapter1.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 6 votes vote down vote up
/**
 * Demonstrates part-of-speech tagging with an OpenNLP maxent POS model.
 * Tokenizes a sample sentence on whitespace, tags each token, then prints
 * the token/tag pairs two ways: via a POSSample and via the raw arrays.
 */
private static void detectingPartsOfSpeechExample() {
    String sentence = "POS processing is useful for enhancing the "
            + "quality of data sent to other elements of a pipeline.";

    // NOTE(review): hard-coded local path — the model file must exist here.
    POSModel model = new POSModelLoader()
            .load(new File("C:/Current Books/NLP and Java/Models/", "en-pos-maxent.bin"));
    POSTaggerME tagger = new POSTaggerME(model);

    String tokens[] = WhitespaceTokenizer.INSTANCE
            .tokenize(sentence);
    String[] tags = tagger.tag(tokens);

    POSSample sample = new POSSample(tokens, tags);
    String posTokens[] = sample.getSentence();
    String posTags[] = sample.getTags();
    for (int i = 0; i < posTokens.length; i++) {
        // Bug fix: trailing space so successive pairs don't run together.
        System.out.print(posTokens[i] + " - " + posTags[i] + " ");
    }
    System.out.println();

    for (int i = 0; i < tokens.length; i++) {
        System.out.print(tokens[i] + "[" + tags[i] + "] ");
    }
    // Bug fix: terminate the second line too, matching the first loop.
    System.out.println();
}
 
Example #2
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 6 votes vote down vote up
/**
 * Looks up WordNet lemmas for each whitespace token of a sample sentence
 * via the JWNL dictionary and prints every token/lemma pair found.
 */
private static void usingTheOpenNLPLemmatizer() {
    // dictionary files downloaded from: https://code.google.com/p/xssm/downloads/detail?name=SimilarityUtils.zip&can=2&q=

    System.out.println("Starting the OpenNLP Lemmatizer");
    try {
        JWNLDictionary dictionary = new JWNLDictionary(
                "C:\\Downloads\\xssm\\SimilarityUtils\\WordNet-2.0\\dict\\");
        paragraph = "Eat, drink, and be merry, for life is but a dream";
        String[] words = WhitespaceTokenizer.INSTANCE.tokenize(paragraph);
        for (String word : words) {
            // "" = no POS hint; a token may map to several lemmas.
            for (String lemma : dictionary.getLemmas(word, "")) {
                System.out.println("Token: " + word + "  Lemma: " + lemma);
            }
        }
    } catch (IOException | JWNLException ex) {
        Logger.getLogger(Chapter2.class.getName()).log(Level.SEVERE, null, ex);
    }
}
 
Example #3
Source File: NGramTest.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Builds an n-gram model (2- to 4-grams) from the whitespace tokens of a
 * sample sentence and prints each n-gram with its occurrence count.
 */
public static void main(String args[]){
    String sampletext = "This is n-gram model";
    System.out.println(sampletext);

    String[] words = WhitespaceTokenizer.INSTANCE.tokenize(sampletext);
    StringList tokens = new StringList(words);
    System.out.println("Tokens " + tokens);

    NGramModel nGramModel = new NGramModel();
    // 2 = minimum n-gram length, 4 = maximum n-gram length
    nGramModel.add(tokens, 2, 4);

    System.out.println("Total ngrams: " + nGramModel.numberOfGrams());
    for (StringList ngram : nGramModel) {
        System.out.println(nGramModel.getCount(ngram) + " - " + ngram);
    }
}
 
Example #4
Source File: StopWords.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Returns {@code words} with every whitespace-delimited token that appears
 * in the {@code stopWords} collection removed. Each surviving token is
 * followed by a single space, so a non-empty result carries a trailing space.
 *
 * @param words text to filter
 * @return the text with stop words removed
 */
public String removeStopWords(String words) {
        String arr[] = WhitespaceTokenizer.INSTANCE.tokenize(words);
        StringBuilder sb = new StringBuilder();
        // Keep only tokens that are not stop words. (Removed the
        // commented-out index-removal debug code: mutating a list while
        // indexing it was abandoned for this filter-into-builder approach.)
        for (int i = 0; i < arr.length; i++) {
            if (!stopWords.contains(arr[i])) {
                sb.append(arr[i]).append(' ');
            }
        }
        return sb.toString();
    }
 
Example #5
Source File: Chapter5.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Tokenizes a sentence, first with {@link String#split} and then with
 * OpenNLP's WhitespaceTokenizer; the tokenizer's result is what is returned.
 *
 * @param sentence text to tokenize
 * @return the whitespace-delimited tokens
 */
private static String[] tokenizeSentence(String sentence) {
    // First technique: split on runs of whitespace.
    // Bug fix: the regex was "S+" (matches literal capital 'S' characters);
    // "\\s+" is the whitespace character class intended here.
    String words[] = sentence.split("\\s+");

    // Second technique (overwrites the first result)
    words = WhitespaceTokenizer.INSTANCE.tokenize(sentence);

    return words;
}
 
Example #6
Source File: SearchText.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Returns {@code words} with stop words removed: tokenizes on whitespace and
 * keeps every token not present in the stop-word list, each followed by a
 * single space.
 *
 * <p>Bug fix: the original condition was {@code words.contains(arr[i])} —
 * the input string always contains its own tokens, so every token was
 * dropped and the method always returned {@code ""}. It now consults the
 * {@code stopWords} collection, matching the parallel
 * {@code StopWords.removeStopWords} implementation. NOTE(review): assumes
 * this class declares a {@code stopWords} field — confirm against the full
 * SearchText source.
 *
 * @param words text to filter
 * @return the text with stop words removed
 */
public String removeStopWords(String words) {
    String arr[] = WhitespaceTokenizer.INSTANCE.tokenize(words);
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < arr.length; i++) {
        if (!stopWords.contains(arr[i])) {
            sb.append(arr[i]).append(' ');
        }
    }
    return sb.toString();
}
 
Example #7
Source File: TaxiInkBotConfiguration.java    From Mutters with Apache License 2.0 5 votes vote down vote up
@Override
public IntentMatcher getIntentMatcher()
{
  // The intent model was trained with OpenNLP's whitespace tokenizer, so
  // the matcher must tokenize the same way at runtime.
  OpenNLPTokenizer tokenizer = new OpenNLPTokenizer(WhitespaceTokenizer.INSTANCE);

  // Slot values are extracted with OpenNLP named-entity recognition.
  OpenNLPSlotMatcher slotMatcher = new OpenNLPSlotMatcher(tokenizer);
  slotMatcher.addSlotModel("Address", "models/en-ner-address.bin");

  OpenNLPIntentMatcher matcher = new OpenNLPIntentMatcher("models/en-cat-taxi-intents.bin", tokenizer, slotMatcher);

  // Register all intents in the original order; the two address-bearing
  // intents additionally get a literal "Address" slot.
  String[] intentNames = { "OrderTaxi", "CancelTaxi", "WhereTaxi", "GaveAddress", "Stop", "Help", "FavColor" };
  for (String name : intentNames)
  {
    Intent intent = new Intent(name);
    if (name.equals("OrderTaxi") || name.equals("GaveAddress"))
    {
      intent.addSlot(new LiteralSlot("Address"));
    }
    matcher.addIntent(intent);
  }

  return matcher;
}
 
Example #8
Source File: TaxiStateMachineBotConfiguration.java    From Mutters with Apache License 2.0 5 votes vote down vote up
@Override
public IntentMatcher getIntentMatcher()
{
  // The intent model was trained with OpenNLP's whitespace tokenizer, so
  // the matcher must tokenize the same way at runtime.
  OpenNLPTokenizer tokenizer = new OpenNLPTokenizer(WhitespaceTokenizer.INSTANCE);

  // Slot values are extracted with OpenNLP named-entity recognition.
  OpenNLPSlotMatcher slotMatcher = new OpenNLPSlotMatcher(tokenizer);
  slotMatcher.addSlotModel("Address", "models/en-ner-address.bin");

  OpenNLPIntentMatcher matcher = new OpenNLPIntentMatcher("models/en-cat-taxi-intents.bin", tokenizer, slotMatcher);

  // Register the intents in the original order; the two address-bearing
  // intents additionally get a literal "Address" slot.
  String[] intentNames = { "OrderTaxi", "CancelTaxi", "WhereTaxi", "GaveAddress" };
  for (String name : intentNames)
  {
    Intent intent = new Intent(name);
    if (name.equals("OrderTaxi") || name.equals("GaveAddress"))
    {
      intent.addSlot(new LiteralSlot("Address"));
    }
    matcher.addIntent(intent);
  }

  return matcher;
}
 
Example #9
Source File: TestCategorization.java    From Mutters with Apache License 2.0 5 votes vote down vote up
@Test
public void testCategorization() throws Exception
{
  URL modelUrl = Thread.currentThread().getContextClassLoader().getResource("models/en-cat-taxi-intents.bin");
  assertThat(modelUrl, is(notNullValue()));

  DoccatModel model = new DoccatModel(modelUrl);
  assertThat(model, is(notNullValue()));

  DocumentCategorizerME myCategorizer = new DocumentCategorizerME(model);
  // model was built with OpenNLP whitespace tokenizer
  OpenNLPTokenizer tokenizer = new OpenNLPTokenizer(WhitespaceTokenizer.INSTANCE);

  // Each row: { utterance, expected best category }.
  String[][] expectations = {
      { "Order me a taxi", "OrderTaxi" },
      { "Send me a taxi", "OrderTaxi" },
      { "Send a taxi to 12 Pleasent Street", "OrderTaxi" },
      { "Cancel my cab", "CancelTaxi" },
      { "Where is my taxi ?", "WhereTaxi" },
      { "The address is 136 River Road", "GaveAddress" } };

  for (String[] expectation : expectations)
  {
    String category = myCategorizer.getBestCategory(myCategorizer.categorize(tokenizer.tokenize(expectation[0])));
    assertThat(category, is(notNullValue()));
    assertThat(category, is(expectation[1]));
  }
}
 
Example #10
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 4 votes vote down vote up
/**
 * Prints each whitespace-delimited token of {@code paragraph} on its own line.
 */
private static void usingTheWhitespaceTokenizer() {
    String[] words = WhitespaceTokenizer.INSTANCE.tokenize(paragraph);
    for (int i = 0; i < words.length; i++) {
        System.out.println(words[i]);
    }
}
 
Example #11
Source File: TokenizerUnitTest.java    From tutorials with MIT License 4 votes vote down vote up
@Test
public void givenWhitespaceTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
    // WhitespaceTokenizer is a stateless singleton, so INSTANCE is used directly.
    String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize("Baeldung is a Spring Resource.");
    // Punctuation stays attached to tokens: "Resource." keeps its period.
    assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource.");
}