Java Code Examples for edu.stanford.nlp.pipeline.Annotation#get()

The following examples show how to use edu.stanford.nlp.pipeline.Annotation#get(). They are drawn from open-source projects; the original source file and license are noted above each example.
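
As a quick orientation before the project examples, here is a minimal, self-contained sketch of the usual get() pattern: build a pipeline, annotate a document, then pull typed values out of the Annotation by key class. The pipeline configuration, class name, and input text below are illustrative only and are not taken from any of the listed projects.

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class AnnotationGetSketch {
    public static void main(String[] args) {
        // a small pipeline: tokenizer, sentence splitter, POS tagger
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // an Annotation is a typesafe map keyed by annotation classes
        Annotation document = new Annotation("Stanford is in California.");
        pipeline.annotate(document);

        // get() returns the value stored under the key class,
        // or null if no annotator produced it
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                System.out.println(token.word() + "/" + token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
            }
        }
    }
}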
Example 1
Source File: StanfordCoref.java    From Graphene with GNU General Public License v3.0
@Override
public CoreferenceContent doCoreferenceResolution(String text) {
	Annotation document = new Annotation(text);
	PIPELINE.annotate(document);

	// extract sentences
	List<Sentence> sentences = new ArrayList<>();
	for (CoreMap coreMap : document.get(CoreAnnotations.SentencesAnnotation.class)) {
		Sentence sentence = new Sentence();
		for (CoreLabel coreLabel : coreMap.get(CoreAnnotations.TokensAnnotation.class)) {
			sentence.addWord(coreLabel.word());
		}
		sentences.add(sentence);
	}

	// replace coreferences
	for (CorefChain cc : document.get(CorefCoreAnnotations.CorefChainAnnotation.class).values()) {
		String coreMention = cc.getRepresentativeMention().mentionSpan;
		for (CorefChain.CorefMention corefMention : cc.getMentionsInTextualOrder()) {
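			// CorefMention sentence and token indices are 1-based, hence the -1 offsets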
			sentences.get(corefMention.sentNum-1).replaceWords(corefMention.startIndex-1, corefMention.endIndex-1, getReplacement(corefMention.mentionSpan, coreMention));
		}
	}

	return new CoreferenceContent(text, sentences.stream().map(s -> s.toString()).collect(Collectors.joining(" ")));
}
 
Example 2
Source File: StanfordTokenizer.java    From ambiverse-nlu with Apache License 2.0
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
  String text = aJCas.getDocumentText();
  Annotation document = new Annotation(text);
  StanfordCoreNLP stanfordCoreNLP;

  if(!languageMap.containsKey(aJCas.getDocumentLanguage())) {
    throw new AnalysisEngineProcessException(new LanguageNotSupportedException("Language Not Supported"));
  }

  stanfordCoreNLP = stanfordCoreNLPs[languageMap.get(aJCas.getDocumentLanguage())];

  stanfordCoreNLP.annotate(document);
  List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
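    // character offsets index into the original document text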
    int sstart = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int ssend = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    Sentence jsentence = new Sentence(aJCas, sstart, ssend);
    jsentence.addToIndexes();

    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      Token casToken = new Token(aJCas, token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
      casToken.addToIndexes();
    }
  }
}
 
Example 3
Source File: CoreNLP.java    From gAnswer with BSD 3-Clause "New" or "Revised" License
/**
 * How to use:
 * for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
 * 		// this is the text of the token
 * 		String word = token.get(TextAnnotation.class);
 *		// this is the POS tag of the token
 *		String pos = token.get(PartOfSpeechAnnotation.class);
 *	}
 * @param s the text to annotate
 * @return the first sentence of the annotated text (with POS tags), or null if there is none
 */
public CoreMap getPOS(String s) {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(s);
    
    // run all Annotators on this text
    pipeline_lemma.annotate(document);
    
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    
    for(CoreMap sentence: sentences) {
      // this is the sentence with POS Tags
      return sentence;
    }
    
    return null;
}
 
Example 4
Source File: CoreNLP.java    From gAnswer with BSD 3-Clause "New" or "Revised" License
public Tree getParseTree (String text) {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    
    // run all Annotators on this text
    pipeline_lemma.annotate(document);
    
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    
    for(CoreMap sentence: sentences) {
    	// this is the parse tree of the current sentence
    	return sentence.get(TreeAnnotation.class);
    }	    
    
    return null;
}
 
Example 5
Source File: Chapter5.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordPOSTagger() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos");
    props.put("pos.model", "C:\\Current Books in Progress\\NLP and Java\\Models\\english-caseless-left3words-distsim.tagger");
    props.put("pos.maxlen", 10);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(theSentence);
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.print(word + "/" + pos + " ");
        }
        System.out.println();
    }

    // print the annotated document once, after the sentence loop
    try {
        pipeline.xmlPrint(document, System.out);
        pipeline.prettyPrint(document, System.out);
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example 6
Source File: StopwordAnnotatorTest.java    From coreNlp with Apache License 2.0
/**
 * Test to validate that stopwords are properly annotated in the token list
 * @throws Exception
 */
@org.junit.Test
public void testLuceneStopwordList() throws Exception {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, stopword");
    props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(example);
    pipeline.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    //get the standard lucene stopword set
    Set<?> stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    for (CoreLabel token : tokens) {

        //get the stopword annotation
        Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

        String word = token.word().toLowerCase();
        if (stopWords.contains(word)) {
            assertTrue(stopword.first());
        }
        else {
            assertFalse(stopword.first());
        }

        //not checking lemma, so always false
        assertFalse(stopword.second());
    }
}
 
Example 7
Source File: ItalianTokenizerAnnotator.java    From tint with GNU General Public License v3.0
/**
 * Given an Annotation, perform a task on this Annotation.
 *
 * @param annotation the Annotation to process (both input and output)
 */
@Override public void annotate(Annotation annotation) {
    String text = annotation.get(CoreAnnotations.TextAnnotation.class);
    List<List<CoreLabel>> sTokens = tokenizer
            .parse(text, newlineIsSentenceBreak, tokenizeOnlyOnSpace, ssplitOnlyOnNewLine);
    Utils.addBasicAnnotations(annotation, sTokens, text);
}
 
Example 8
Source File: CoreNLPLemmatizer.java    From Heracles with GNU General Public License v3.0
/**
 * Process the Dataset in chunks, as defined by the <code>spanTypeOfSentenceUnit</code> parameter.
 * The Spans denoted by this span type must each contain Words belonging to a single sentence.
 */
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){
	Properties prop1 = new Properties();
	prop1.setProperty("annotators", "lemma");
	StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

	for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){
		HashMap<Integer, Word> wordIndex = new HashMap<>();
		Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
		if (a == null){
			// reconstruction failed; skip this span instead of annotating null
			continue;
		}
		pipeline.annotate(a);
		List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
		for (CoreMap sentence : sentenceAnnotations){
			for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
				// look the Word up by its character offset
				Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
				String tempLemma = token.get(LemmaAnnotation.class);
				w.putAnnotation("lemma", tempLemma.toLowerCase());
			}
		}
	}
}
 
Example 9
Source File: CoreNlpExample.java    From core-nlp-example with MIT License
public static void main(String[] args) {
    // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // read some text in the text variable
    String text = "What is the Weather in Bangalore right now?";

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    // run all Annotators on this text
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            // this is the text of the token
            String word = token.get(CoreAnnotations.TextAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

            System.out.println(String.format("Print: word: [%s] pos: [%s] ne: [%s]", word, pos, ne));
        }
    }
}
 
Example 10
Source File: StopwordAnnotatorTest.java    From coreNlp with Apache License 2.0
/**
 * Test to validate that the custom stopword list works
 * @throws Exception
 */
@org.junit.Test
public void testCustomStopwordList() throws Exception {

    //setup coreNlp properties for stopwords. Note the custom stopword list property
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, stopword");
    props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");
    props.setProperty(StopwordAnnotator.STOPWORDS_LIST, customStopWordList);

    //get the custom stopword set
    Set<?> stopWords = StopwordAnnotator.getStopWordList(Version.LUCENE_36, customStopWordList, true);

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(example);
    pipeline.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    for (CoreLabel token : tokens) {

        //get the stopword annotation
        Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

        String word = token.word().toLowerCase();
        if (stopWords.contains(word)) {
            assertTrue(stopword.first());
        }
        else {
            assertFalse(stopword.first());
        }

        //not checking lemma, so always false
        assertFalse(stopword.second());
    }
}
 
Example 11
Source File: SentimentAnalyzer.java    From computoser with GNU Affero General Public License v3.0
/**
 * Synchronized method to obtain the sentiment of a set of documents.
 * Synchronization is fine, because the method is invoked via a scheduled job
 * and only one execution at a time is permitted.
 * That also allows the loading of the model to be optimized.
 * @param documents the documents to analyze
 * @return the overall sentiment across the documents
 */
public synchronized SentimentResult getSentiment(Set<String> documents, TimelineMusic meta) {

    double sentimentSum = 0;
    for (String document: documents) {
        int mainSentiment = 0;
        if (document != null && document.length() > 0) {
            int longest = 0;
            try {
                Annotation annotation = pipeline.process(document);
                // mainSentiment is the sentiment of the whole document. We find
                // the whole document by comparing the length of individual
                // annotated "fragments"
                for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                    Tree tree = sentence.get(SentimentCoreAnnotations.AnnotatedTree.class);
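                    // the predicted sentiment class ranges from 0 (very negative) to 4 (very positive)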
                    int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
                    String partText = sentence.toString();
                    if (partText.length() > longest) {
                        mainSentiment = sentiment;
                        longest = partText.length();
                    }
                }
            } catch (Exception ex) {
                logger.error("Problem analyzing document sentiment. " + document, ex);
                continue;
            }
        }
        sentimentSum += mainSentiment;
    }

    double average = sentimentSum / documents.size();
    meta.setAverageSentiment(average);

    if (average >= 2.25) {
        return SentimentResult.POSITIVE;
    } else if (average <= 1.75) {
        return SentimentResult.NEGATIVE;
    }
    return SentimentResult.NEUTRAL;
}
 
Example 12
Source File: SplitSentences.java    From tint with GNU General Public License v3.0
public static void main(String[] args) {
    try {
        final CommandLine cmd = CommandLine
                .parser()
                .withName("./annotate-sentences")
                .withHeader("Annotate sentences")
                .withOption("i", "input", "Input file", "FILE",
                        CommandLine.Type.FILE_EXISTING, true, false, true)
                .withOption("o", "output", "Output file", "FILE",
                        CommandLine.Type.FILE_EXISTING, true, false, true)
                .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);

        File input = cmd.getOptionValue("input", File.class);
        File output = cmd.getOptionValue("output", File.class);

        String text = new String(Files.readAllBytes(input.toPath()), Charsets.UTF_8);
        BufferedWriter writer = new BufferedWriter(new FileWriter(output));

        Properties props = new Properties();
        props.setProperty("annotators", "ita_toksent");
        props.setProperty("customAnnotatorClass.ita_toksent",
                "eu.fbk.dh.tint.tokenizer.annotators.ItalianTokenizerAnnotator");

        StanfordCoreNLP ITApipeline = new StanfordCoreNLP(props);
        Annotation annotation = new Annotation(text);
        ITApipeline.annotate(annotation);

        List<CoreMap> sents = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap thisSent : sents) {
            writer.append(thisSent.get(CoreAnnotations.TextAnnotation.class)).append("\n");
        }

        writer.close();

    } catch (Exception e) {
        CommandLine.fail(e);
    }
}
 
Example 13
Source File: UPosAnnotator.java    From tint with GNU General Public License v3.0
@Override
public void annotate(Annotation annotation) {
    for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
        String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);

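        // composite tags joined with "+" are split and each part mapped to its universal POS tag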
        String[] parts = pos.split("\\+");
        StringBuffer upos = new StringBuffer();
        for (String part : parts) {
            String thisPos = uposMap.getOrDefault(part, DEFAULT_UPOS);
            upos.append("+").append(thisPos);
        }
        token.set(CustomAnnotations.UPosAnnotation.class, upos.substring(1));
    }

}
 
Example 14
Source File: CoreNlpTokenizer.java    From jstarcraft-nlp with Apache License 2.0
@Override
public Iterable<CoreNlpToken> tokenize(CharSequence text) {
    Iterable<CoreLabel> iterator;
    if (StringUtility.isBlank(text)) {
        // blank input: nothing to tokenize
        iterator = Collections.emptyList();
    } else {
        Annotation annotation = new Annotation(text.toString());
        annotator.annotate(annotation);
        iterator = annotation.get(CoreAnnotations.TokensAnnotation.class);
    }
    CoreNlpToken iterable = new CoreNlpToken(iterator.iterator());
    return iterable;
}
 
Example 15
Source File: WiseOwlStanfordFilter.java    From wiseowl with MIT License
public Iterator<TokenData> findTokens() throws IOException
{
	if (!input.incrementToken()) return null;
	String text = input.getAttribute(CharTermAttribute.class).toString();
	Annotation document = new Annotation(text);
	pipeline.annotate(document);

	// collect the temporal expressions (SUTime) with their character offsets
	List<CoreMap> timexAnnsAll = document.get(TimeAnnotations.TimexAnnotations.class);
	for (CoreMap cm : timexAnnsAll) {
		List<CoreLabel> tokens = cm.get(CoreAnnotations.TokensAnnotation.class);
		TimeData td = new TimeData();
		td.setTime(cm.get(TimeExpression.Annotation.class).getTemporal().toString());
		td.setStart(tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
		td.setEnd(tokens.get(tokens.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
		timeQueue.add(td);
	}

	// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
	List<CoreMap> sentences = document.get(SentencesAnnotation.class);
	for (CoreMap sentence : sentences) {
		// a CoreLabel is a CoreMap with additional token-specific methods
		for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
			String word = token.get(TextAnnotation.class);         // the text of the token
			String pos = token.get(PartOfSpeechAnnotation.class);  // the POS tag of the token
			String ne = token.get(NamedEntityTagAnnotation.class); // the NER label of the token
			TokenData tok = new TokenData();
			tok.setNER(ne);
			tok.setToken(word);
			tok.setPOS(pos);
			tokenQueue.add(tok);
		}
	}

	Iterator<TokenData> it = tokenQueue.iterator();
	itr_cpy = tokenQueue.iterator();
	tokenOffset = 0;
	start = 0;
	end = 0;
	return it;
}
 
Example 16
Source File: QueryAnswerTypeAnalyzer.java    From NLIWOD with GNU Affero General Public License v3.0
@Override
public Object analyze(String q) {
	log.debug("String question: " + q);

	//some cases are resolved through the first word of the question
	if(q.startsWith("Where ") || q.startsWith("In ")) return "DBpedia:Place";
	if(q.startsWith("How ")) return "Number";
	if(q.startsWith("When ")) return "Schema:Date";
	if(q.startsWith("Who ")) return "DBpedia:Person";
	if(QuestionTypeAnalyzer.isASKQuestion(q)) return "Boolean";

	Annotation annotation = new Annotation(q);
	PIPELINE.annotate(annotation);

	List<CoreMap> question = annotation.get(CoreAnnotations.SentencesAnnotation.class);

	//get all nouns, verbs, adjectives
	List<String> verbs = getWords(question, "V");
	List<String> nouns = getWords(question, "N");
	List<String> adjectives = getWords(question, "JJ");

	//get all properties for the nouns, verbs, adjectives
	Map<String, List<String>> properties = new LinkedHashMap<>();
	getProperties(properties, verbs);
	getProperties(properties, nouns);
	getProperties(properties, adjectives);

	//query all ranges for the properties and put them in a list
	ArrayList<String> ranges = new ArrayList<>();
	for(String key: properties.keySet()) {
		for(String r: properties.get(key)) {
			String answer = queryRange(r);
			ranges.add(answer);
		}
	}

	//find the most common range
	String range = mostCommon(ranges);

	//set the answertype depending on the uri (xml schema, ontology etc.)
	if(range.contains("http://dbpedia.org/ontology/")) {
		return range.replace("http://dbpedia.org/ontology/", "DBpedia:");
	} else if(range.contains("http://www.w3.org/2001/XMLSchema#")) {
		if(range.toLowerCase().contains("double") || range.toLowerCase().contains("integer")) {
			return "Number";
		}
		range = range.replace("http://www.w3.org/2001/XMLSchema#", "");
		range = range.substring(0,1).toUpperCase() + range.substring(1);
		return "Schema:" + range;
	} else if(range.contains("http://www.w3.org/1999/02/22-rdf-syntax-ns#langString")) {
		return "Schema:String";
	}
	return "Misc";
}
 
Example 17
Source File: CoreNLPDependencyParser.java    From Heracles with GNU General Public License v3.0
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit) {
	Properties prop1 = new Properties();
	prop1.setProperty("annotators", "depparse");
	StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

	for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){
		HashMap<Integer, Word> wordIndex = new HashMap<>();
		Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
		pipeline.annotate(a);

		for (CoreMap sentence : a.get(SentencesAnnotation.class)){
			//per sentence, get the dependencies
			SemanticGraph dependencies = sentence.get(EnhancedPlusPlusDependenciesAnnotation.class);

			for (TypedDependency td : dependencies.typedDependencies()){
				Word dep = wordIndex.get(td.dep().beginPosition());
				DataEntity gov = wordIndex.get(td.gov().beginPosition());
				if (gov == null){
					//this is the root, link to sentence
					gov = span;
				}
				if (dep == null || gov == null){
					Framework.debug(td.toString());
					Framework.debug(td.dep().beginPosition() + "\t" + td.gov().beginPosition());
					Framework.debug(wordIndex.toString());
				}
				Relation rel = new Relation("deps", gov, dep);
				rel.putAnnotation("relationLongName", td.reln().getLongName());
				if (td.reln().getParent() != null)
					rel.putAnnotation("relationParentShortName", td.reln().getParent().getShortName());
				rel.putAnnotation("relationShortName", td.reln().getShortName());
				dep.getRelations().addRelationToParent(rel);
				gov.getRelations().addRelationToChild(rel);
			}
		}
	}
}
 
Example 18
Source File: Chapter6.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordSentimentAnalysis() {
    String review = "An overly sentimental film with a somewhat "
            + "problematic message, but its sweetness and charm "
            + "are occasionally enough to approximate true depth "
            + "and grace. ";

    String sam = "Sam was an odd sort of fellow. Not prone to angry and "
            + "not prone to merriment. Overall, an odd fellow.";
    String mary = "Mary thought that custard pie was the best pie in the "
            + "world. However, she loathed chocolate pie.";
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, parse, sentiment");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation annotation = new Annotation(review);
    pipeline.annotate(annotation);

    System.out.println("---sentimentText");
    String[] sentimentText = {"Very Negative", "Negative", "Neutral",
        "Positive", "Very Positive"};
    for (CoreMap sentence : annotation.get(
            CoreAnnotations.SentencesAnnotation.class)) {
        Tree tree = sentence.get(
                SentimentCoreAnnotations.AnnotatedTree.class);
        System.out.println("---Number of children: " + tree.numChildren());
        System.out.println("[" + tree.getChild(0) + "][" + tree.getChild(1) + "]");
        tree.printLocalTree();
        int score = RNNCoreAnnotations.getPredictedClass(tree);
        System.out.println(sentimentText[score]);
    }

    // Classifier
    CRFClassifier crf
            = CRFClassifier.getClassifierNoExceptions(
                    "C:/Current Books in Progress/NLP and Java/Models"
                    + "/english.all.3class.distsim.crf.ser.gz");
    String S1 = "Good afternoon Rajat Raina, how are you today?";
    String S2 = "I go to school at Stanford University, which is located in California.";
    System.out.println(crf.classifyToString(S1));
    System.out.println(crf.classifyWithInlineXML(S2));
    System.out.println(crf.classifyToString(S2, "xml", true));

    Object[] classification = crf.classify(S2).toArray();
    for (int i = 0; i < classification.length; i++) {
        System.out.println(classification[i]);
    }
}
 
Example 19
Source File: AnnotateLemma.java    From tint with GNU General Public License v3.0
public static void main(String[] args) {
    try {
        final CommandLine cmd = CommandLine
                .parser()
                .withName("./annotate-lemmas")
                .withHeader("Annotate lemmas")
                .withOption("i", "input", "Input file", "FILE",
                        CommandLine.Type.FILE_EXISTING, true, false, true)
                .withOption("o", "output", "Output file", "FILE",
                        CommandLine.Type.FILE_EXISTING, true, false, true)
                .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);

        File input = cmd.getOptionValue("input", File.class);
        File output = cmd.getOptionValue("output", File.class);

        String text = new String(Files.readAllBytes(input.toPath()), Charsets.UTF_8);
        BufferedWriter writer = new BufferedWriter(new FileWriter(output));

        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, ita_morpho, ita_lemma");
        props.setProperty("tokenize.whitespace", "true");
        props.setProperty("ssplit.eolonly", "true");

//        props.setProperty("ita_toksent.newlineIsSentenceBreak", "1");

        props.setProperty("pos.model", "/Users/alessio/Documents/Resources/ita-models/italian5.tagger");

        props.setProperty("customAnnotatorClass.ita_toksent",
                "eu.fbk.dkm.pikes.tintop.ita.annotators.ItalianTokenizerAnnotator");
        props.setProperty("customAnnotatorClass.ita_lemma", "eu.fbk.dh.digimorph.annotator.DigiLemmaAnnotator");
        props.setProperty("customAnnotatorClass.ita_morpho", "eu.fbk.dh.digimorph.annotator.DigiMorphAnnotator");
        props.setProperty("ita_morpho.model", "/Users/alessio/Documents/Resources/ita-models/italian.db");

        StanfordCoreNLP ITApipeline = new StanfordCoreNLP(props);
        Annotation annotation = new Annotation(text);
        ITApipeline.annotate(annotation);

        System.out.println(ITApipeline.timingInformation());

        List<CoreMap> sents = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap thisSent : sents) {
            List<CoreLabel> tokens = thisSent.get(CoreAnnotations.TokensAnnotation.class);
            for (CoreLabel token : tokens) {
                writer.append(token.originalText().replaceAll("\\s+", ""))
                        .append("\t")
                        .append(token.get(CoreAnnotations.PartOfSpeechAnnotation.class))
                        .append("\t")
                        .append(token.get(CoreAnnotations.LemmaAnnotation.class))
                        .append("\n");
            }
            writer.append("\n");
        }

        writer.close();

    } catch (Exception e) {
        CommandLine.fail(e);
    }
}
 
Example 20
Source File: CoreNLPToJSON.java    From phrasal with GNU General Public License v3.0
/**
 * Process an English text file.
 * 
 * @param args
 * @throws IOException 
 */
public static void main(String[] args) throws IOException {
  if (args.length < 1) {
    System.err.printf("Usage: java %s file [inputproperties_str] > json_output%n", CoreNLPToJSON.class.getName());
    System.exit(-1);
  }
  String textFile = args[0];
  InputProperties inputProperties = args.length > 1 ? InputProperties.fromString(args[1]) : new InputProperties();

  StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);
  
  // Configure tokenizer
  EnglishPreprocessor preprocessor = new EnglishPreprocessor(true);
  
  // Use a map with ordered keys so that the output is ordered by segmentId.
  Map<Integer,SourceSegment> annotations = new TreeMap<Integer,SourceSegment>();
  LineNumberReader reader = IOTools.getReaderFromFile(textFile);
  for (String line; (line = reader.readLine()) != null;) {
    Annotation annotation = coreNLP.process(line);
    List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
    if (sentences.size() != 1) {
      throw new RuntimeException("Sentence splitting on line: " + String.valueOf(reader.getLineNumber()));
    }
    CoreMap sentence = sentences.get(0);
    Tree tree = sentence.get(TreeAnnotation.class);
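    // index the parse-tree leaves so they can be aligned with token positions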
    tree.indexLeaves();
    int[] chunkVector = getChunkVector(tree);
    List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
    int numTokens = tokens.size();
    SymmetricalWordAlignment alignment = preprocessor.processAndAlign(line);
    if (alignment.e().size() != numTokens) {
      throw new RuntimeException(String.format("Tokenizer configurations differ: %d/%d", alignment.e().size(), numTokens));
    }
    SourceSegment segment = new SourceSegment(numTokens);
    segment.layoutSpec.addAll(makeLayoutSpec(alignment));
    segment.inputProperties = inputProperties.toString();
    for (int j = 0; j < numTokens; ++j) {
      CoreLabel token = tokens.get(j);
      String word = token.get(TextAnnotation.class);
      segment.tokens.add(unescape(word));
      String pos = mapPOS(token.get(PartOfSpeechAnnotation.class));
      segment.pos.add(pos);
      String ne = token.get(NamedEntityTagAnnotation.class);
      segment.ner.add(ne);
      segment.chunkVector[j] = chunkVector[j];
    }
    annotations.put(reader.getLineNumber()-1, segment);
  }
  reader.close();
  System.err.printf("Processed %d sentences%n", reader.getLineNumber());
  
  final SourceDocument jsonDocument = new SourceDocument(textFile, annotations);
  
  // Convert to json
  Gson gson = new Gson();
  String json = gson.toJson(jsonDocument);
  System.out.println(json);
}