Java Code Examples for com.aliasi.tokenizer.IndoEuropeanTokenizerFactory#INSTANCE

The following examples show how to use com.aliasi.tokenizer.IndoEuropeanTokenizerFactory#INSTANCE. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TrainEntities.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 7 votes vote down vote up
/**
 * Sets up an HMM-based chunker estimator over the Indo-European tokenizer and
 * compiles it to {@code outputmodelfile.model}.
 *
 * <p>NOTE(review): the corpus parsing step is commented out below, so as written
 * the estimator is compiled without being fed any data from {@code corpusFile}.
 *
 * @param args unused
 * @throws IOException declared by the signature; presumably raised when the
 *         model cannot be written -- confirm against AbstractExternalizable
 */
public static void main(String[] args) throws IOException {
    final File corpusFile = new File("inputfile.txt"); // annotated input corpus
    final File modelFile = new File("outputmodelfile.model");

    System.out.println("Setting up Chunker Estimator");
    HmmCharLmEstimator languageModelEstimator =
            new HmmCharLmEstimator(MAX_N_GRAM, NUM_CHARS, LM_INTERPOLATION);
    CharLmHmmChunker chunkerEstimator =
            new CharLmHmmChunker(IndoEuropeanTokenizerFactory.INSTANCE, languageModelEstimator);

    System.out.println("Setting up Data Parser");
    // Parser wiring is disabled in this example:
    //     Muc6ChunkParser parser = new Muc6ChunkParser();
    //     parser.setHandler(chunkerEstimator);

    System.out.println("Training with Data from File=" + corpusFile);
    // Disabled together with the parser above:
    //     parser.parse(corpusFile);

    System.out.println("Compiling and Writing Model to File=" + modelFile);
    AbstractExternalizable.compileTo(chunkerEstimator, modelFile);
}
 
Example 2
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 6 votes vote down vote up
/**
 * Prints each sample word followed by the token(s) produced for it by a
 * Porter-stemming tokenizer factory layered over the Indo-European tokenizer.
 *
 * <p>Output format per word: {@code "Word: <word>"} (no newline), then one
 * {@code "  Stem: <stem>"} line for every token returned for that word.
 */
private static void usingTheLingPipeStemmer() {
    String[] words = {"bank", "banking", "banks", "banker",
        "banked", "bankart"};
    TokenizerFactory porterFactory
            = new PorterStemmerTokenizerFactory(IndoEuropeanTokenizerFactory.INSTANCE);
    for (String word : words) {
        com.aliasi.tokenizer.Tokenization tokenization
                = new com.aliasi.tokenizer.Tokenization(word, porterFactory);
        // Take the token array straight from the tokenization; the original
        // pre-allocated an array here that was overwritten before ever being read.
        String[] stems = tokenization.tokens();
        System.out.print("Word: " + word);
        for (String stem : stems) {
            System.out.println("  Stem: " + stem);
        }
    }
}
 
Example 3
Source File: Chapter4.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Initializes the shared dictionary, prints it, and then runs an exact
 * dictionary chunker over every entry of {@code sentences}, delegating the
 * per-sentence output to {@code displayChunkSet}.
 */
private static void usingExactDictionaryChunker() {
    initializeDictionary();
    System.out.println("\nDICTIONARY\n" + dictionary);

    // Flag names follow the LingPipe constructor's parameter order
    // (returnAllMatches, caseSensitive) -- confirm against the library version in use.
    final boolean returnAllMatches = true;
    final boolean caseSensitive = false;
    ExactDictionaryChunker chunker = new ExactDictionaryChunker(
            dictionary,
            IndoEuropeanTokenizerFactory.INSTANCE,
            returnAllMatches,
            caseSensitive);

    for (String text : sentences) {
        System.out.println("\nTEXT=" + text);
        displayChunkSet(chunker, text);
    }
}
 
Example 4
Source File: Chapter4.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Demonstrates dictionary-based chunking: populates the dictionary via
 * {@code initializeDictionary()}, prints it, then prints each sentence and
 * hands it to {@code displayChunkSet} together with the chunker.
 */
private static void usingExactDictionaryChunker() {
    initializeDictionary();
    System.out.println("\nDICTIONARY\n" + dictionary);

    // The chunker is built once and reused for every sentence.
    ExactDictionaryChunker exactChunker =
            new ExactDictionaryChunker(
                    dictionary, IndoEuropeanTokenizerFactory.INSTANCE, true, false);

    for (String current : sentences) {
        System.out.println("\nTEXT=" + current);
        displayChunkSet(exactChunker, current);
    }
}
 
Example 5
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Tokenizes {@code paragraph} with the Indo-European tokenizer factory and
 * prints one token per line.
 *
 * <p>{@code paragraph} is not declared in this method (a local sample
 * declaration, {@code String paragraph = "sample text string";}, is commented
 * out in the original); presumably it is a field of the enclosing class.
 */
private static void usingLingPipeTokenizers() {
    final char[] characters = paragraph.toCharArray();
    TokenizerFactory factory = IndoEuropeanTokenizerFactory.INSTANCE;
    // Tokenize the whole buffer: start offset 0, length = full array.
    com.aliasi.tokenizer.Tokenizer tokenStream =
            factory.tokenizer(characters, 0, characters.length);
    for (String current : tokenStream) {
        System.out.println(current);
    }
}
 
Example 6
Source File: TweetHandler.java    From Java-for-Data-Science with MIT License 5 votes vote down vote up
/**
 * Rebuilds {@code this.text} from the tokens that pass through an
 * English stop-word filtering tokenizer factory.
 *
 * <p>Each surviving token is followed by a single space, so a non-empty
 * result ends with a trailing space.
 *
 * @return this handler, to allow call chaining
 */
public TweetHandler removeStopWords() {
    TokenizerFactory stopFilteringFactory =
            new EnglishStopTokenizerFactory(IndoEuropeanTokenizerFactory.INSTANCE);
    final char[] characters = this.text.toCharArray();
    Tokenizer keptWords = stopFilteringFactory.tokenizer(characters, 0, characters.length);

    StringBuilder rebuilt = new StringBuilder();
    for (String keptWord : keptWords) {
        rebuilt.append(keptWord).append(" ");
    }
    this.text = rebuilt.toString();
    return this;
}
 
Example 7
Source File: SimpleStringCleaning.java    From Java-for-Data-Science with MIT License 5 votes vote down vote up
/**
 * Prints {@code text} as given, then prints the tokens of its lower-cased,
 * trimmed form that pass through LingPipe's English stop-word filtering
 * tokenizer factory, each followed by a space.
 *
 * @param text the input text; must not be {@code null}
 */
public static void removeStopWithLing(String text){
	// Example using LingPipe.
	out.println(text);
	// Lower-case with a fixed locale: the default-locale overload depends on the
	// JVM locale (e.g. under a Turkish locale "I" maps to dotless "ı"), which
	// would change the tokens handed to the English stop-word filter.
	text = text.toLowerCase(java.util.Locale.ROOT).trim();
	TokenizerFactory fact = IndoEuropeanTokenizerFactory.INSTANCE;
	fact = new EnglishStopTokenizerFactory(fact);
	Tokenizer tok = fact.tokenizer(text.toCharArray(), 0, text.length());
	for(String word : tok){
		out.print(word + " ");
	}
}
 
Example 8
Source File: DictionaryChunker.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 4 votes vote down vote up
/**
 * Builds a small in-memory dictionary of names, places and organizations,
 * creates four exact dictionary chunkers covering every combination of the
 * constructor's two boolean flags, prints the dictionary, and runs all four
 * chunkers over each entry of {@code sentences}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // {phrase, category} pairs, added in this order.
    final String[][] phraseCategoryPairs = {
        {"Joe", "PERSON"},
        {"Fred", "PERSON"},
        {"Boston", "PLACE"},
        {"pub", "PLACE"},
        {"Vermont", "PLACE"},
        {"IBM", "ORGANIZATION"},
        {"Sally", "PERSON"},
    };

    MapDictionary<String> dictionary = new MapDictionary<String>();
    for (String[] pair : phraseCategoryPairs) {
        dictionary.addEntry(new DictionaryEntry<String>(pair[0], pair[1], CHUNK_SCORE));
    }

    // Chunkers are created in the order (true,true), (true,false),
    // (false,true), (false,false) for the two boolean constructor flags.
    final boolean[] flagValues = {true, false};
    ExactDictionaryChunker[] chunkers = new ExactDictionaryChunker[4];
    int slot = 0;
    for (boolean firstFlag : flagValues) {
        for (boolean secondFlag : flagValues) {
            chunkers[slot++] = new ExactDictionaryChunker(
                    dictionary,
                    IndoEuropeanTokenizerFactory.INSTANCE,
                    firstFlag,
                    secondFlag);
        }
    }

    System.out.println("\nDICTIONARY\n" + dictionary);

    for (String text : sentences) {
        System.out.println("\n\nTEXT=" + text);
        for (ExactDictionaryChunker chunker : chunkers) {
            chunk(chunker, text);
        }
    }
}