edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation Java Examples

The following examples show how to use edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage in the sidebar.
Example #1
Source File: TestCustomLemmaAnnotator.java    From blog-codes with Apache License 2.0 6 votes vote down vote up
@Test
public void test() {
	// Configure a pipeline whose lemma step is supplied by a custom annotator
	// registered under the name "custom.lemma".
	Properties settings = new Properties();
	settings.setProperty("annotators", "tokenize,ssplit,pos,custom.lemma");
	settings.setProperty("customAnnotatorClass.custom.lemma", "com.fancyerii.blog.stanfordnlp.CustomLemmaAnnotator");
	settings.setProperty("custom.lemma.lemmaFile", "custom-lemmas.txt");
	StanfordCoreNLP nlp = new StanfordCoreNLP(settings);

	CoreDocument doc = new CoreDocument("Some many goods there.");
	nlp.annotate(doc);
	// Print each token of the first sentence with its lemma and character offsets.
	for (CoreLabel tok : doc.sentences().get(0).tokens()) {
		System.out.println(tok.word()+"/"+tok.getString(LemmaAnnotation.class) + "\t" + tok.beginPosition() + "\t" + tok.endPosition());
	}
}
 
Example #2
Source File: CoreNlpTokenizer.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
@Override
public boolean incrementToken() {
    clearAttributes();
    // Keep pulling sentences until one yields tokens; no more sentences -> stream done.
    while (tokens == null || !tokens.hasNext()) {
        if (!getNextSentence()) {
            return false;
        }
    }
    CoreLabel current = tokens.next();
    // Prefer the lemma; fall back to the surface text when lemmatization did not run.
    String lemma = current.get(LemmaAnnotation.class);
    String term = (lemma != null) ? lemma : current.get(TextAnnotation.class);
    termAttribute.setLength(0);
    termAttribute.append(term);
    // Token type: the NER tag when informative, otherwise the POS tag.
    String ner = current.get(NamedEntityTagAnnotation.class);
    String type = (ner == null || "O".equals(ner)) ? current.get(PartOfSpeechAnnotation.class) : ner;
    typeAttribute.setType(type != null ? type : TypeAttribute.DEFAULT_TYPE);
    // Character offsets of the token within the source document.
    int begin = current.get(CharacterOffsetBeginAnnotation.class).intValue();
    int end = current.get(CharacterOffsetEndAnnotation.class).intValue();
    offsetAttribute.setOffset(begin, end);
    // Account for any tokens skipped since the last emitted token.
    positionAttribute.setPositionIncrement(1 + skippedTokens);
    skippedTokens = 0;
    return true;
}
 
Example #3
Source File: CoreNLP.java    From gAnswer with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Returns the space-separated lemmas (base forms) of all tokens in {@code text}.
 * Prints a running sentence count to stdout every 100 sentences.
 *
 * @param text the input text to lemmatize; may contain multiple sentences
 * @return the lemmas joined by single spaces, or "" when no tokens were found
 */
public String getBaseFormOfPattern (String text) {
    // StringBuilder instead of repeated String concatenation: O(n) instead of O(n^2).
    StringBuilder ret = new StringBuilder();

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline_lemma.annotate(document);

    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

    int count = 0;
    for (CoreMap sentence : sentences) {
      // traversing the words in the current sentence
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        // this is the base form (lemma) of the token
        ret.append(token.getString(LemmaAnnotation.class)).append(' ');
      }
      count++;
      if (count % 100 == 0) {
          System.out.println(count);
      }
    }

    // Drop the trailing space. The old code called substring(0, length-1)
    // unconditionally, which threw StringIndexOutOfBoundsException for input
    // that produced no tokens (e.g. the empty string).
    if (ret.length() > 0) {
        ret.setLength(ret.length() - 1);
    }
    return ret.toString();
}
 
Example #4
Source File: CoreNLPLemmatizer.java    From Heracles with GNU General Public License v3.0 5 votes vote down vote up
/**
	 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
	 * The Spans denoted by spanType must each contain Words belonging to a single sentence.
	 *
	 * @param dataset the dataset whose spans are lemmatized in place
	 * @param spanTypeOfSentenceUnit span type denoting sentence-sized units
	 */
	@Override
	public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){
		// Only the lemma annotator is needed; tokens/POS come from the
		// reconstructed annotations below ("false" skips annotator verification).
		Properties prop1 = new Properties();
		prop1.setProperty("annotators", "lemma");
		StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

		for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){
			HashMap<Integer, Word> wordIndex = new HashMap<>();
			Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
			if (a == null){
				// BUG FIX: the old code only printed the null annotation and then
				// fell through to pipeline.annotate(a), a guaranteed NPE.
				// Skip spans that could not be reconstructed instead.
				System.out.println(a);
				continue;
			}
			pipeline.annotate(a);
			List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
			for (CoreMap sentence : sentenceAnnotations){
				for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
					// Map the CoreNLP token back to its originating Word by
					// character offset (wordIndex is keyed on start offsets).
					Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
					String tempLemma = token.get(LemmaAnnotation.class);
					// Guard against unmatched tokens or missing lemmas, which
					// previously caused NullPointerExceptions.
					if (w != null && tempLemma != null) {
						w.putAnnotation("lemma", tempLemma.toLowerCase());
					}
				}
			}
		}
	}
 
Example #5
Source File: Phrase.java    From uncc2014watsonsim with GNU General Public License v2.0 5 votes vote down vote up
/** Collects the lemma of every token across all sentences of the phrase. */
private static List<String> _lemmas(Phrase p) {
	return p.memo(Phrase.sentences).stream()
			.map(sentence -> sentence.get(TokensAnnotation.class))
			.flatMap(sentenceTokens -> sentenceTokens.stream()
					.map(token -> token.get(LemmaAnnotation.class)))
			.collect(toList());
}
 
Example #6
Source File: Stemming.java    From AGDISTIS with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Lemmatizes the document, stems each lemma with the Lancaster stemmer, and
 * returns the stems rendered as a list string with punctuation characters removed.
 */
public String stemming(String documentText) {
	LancasterStemmer stem = new LancasterStemmer();
	List<String> stems = new LinkedList<String>();

	// Annotate the raw text with the configured pipeline.
	Annotation annotated = new Annotation(documentText);
	this.pipeline.annotate(annotated);

	// Stem the lemma (not the surface form) of every token in every sentence.
	for (CoreMap sentence : annotated.get(SentencesAnnotation.class)) {
		for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
			stems.add(stem.stem(token.get(LemmaAnnotation.class)));
		}
	}

	// Render the list (e.g. "[a, b]") and strip punctuation/bracket characters.
	String label = stems.toString();
	label = Pattern.compile("[,.;!?(){}\\[\\]<>%]").matcher(label).replaceAll("");

	return label;
}
 
Example #7
Source File: CoreNLPHelper.java    From Heracles with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Rebuilds a Stanford CoreNLP {@code Annotation} (document, one sentence, and its
 * tokens) from a Heracles {@code Span}, so CoreNLP annotators can run on text that
 * was tokenized outside CoreNLP. Also fills {@code wordIndex} so callers can map
 * CoreNLP tokens back to the originating {@code Word}s.
 *
 * @param sentenceSpan the span holding the sentence's Words; must carry a "text" annotation
 * @param wordIndex out-parameter: filled with word order or start offset -> Word
 * @param useWordOrderInsteadOfOffset when true, key wordIndex by word order; otherwise by start offset
 * @return the reconstructed single-sentence Annotation
 */
public static Annotation reconstructStanfordAnnotations(Span sentenceSpan, HashMap<Integer, Word> wordIndex, boolean useWordOrderInsteadOfOffset){
		String originalText = sentenceSpan.getAnnotation("text", String.class); 
		Annotation a = new Annotation(originalText);
		a.set(TextAnnotation.class, originalText);
		
		//a.set(DocIDAnnotation.class, "document");
		
		// Document-level containers; the same token list is reused below as the
		// sentence's token list (single-sentence document).
		List<CoreMap> sentenceAnnotations = new ArrayList<CoreMap>();
		a.set(SentencesAnnotation.class, sentenceAnnotations);
		List<CoreLabel> tokenAnnotations = new ArrayList<CoreLabel>();
		a.set(TokensAnnotation.class, tokenAnnotations);
		
		ArrayCoreMap sentenceAnnotation = new ArrayCoreMap();
		sentenceAnnotations.add(sentenceAnnotation);
		
//		int startOffset = sentenceSpan.first().getStartOffset();
		
		// Build one CoreLabel per Word, copying over whatever annotations exist.
		for (Word w : sentenceSpan){
			CoreLabel c = new CoreLabel();
			// Text, original text, and value are all set to the surface form.
			c.set(TextAnnotation.class, w.getWord());
			c.set(OriginalTextAnnotation.class, w.getWord());
			c.set(ValueAnnotation.class, w.getWord());
			c.set(CharacterOffsetBeginAnnotation.class, w.getStartOffset());
			c.set(CharacterOffsetEndAnnotation.class, w.getEndOffset());
			
			
			// IndexAnnotation is 1-based in CoreNLP, hence the +1.
			c.set(IndexAnnotation.class, w.getOrder()+1);
//			c.setIndex(w.getOrder());
			
			// Everything lives in sentence 0 of a synthetic single-sentence document.
			c.set(SentenceIndexAnnotation.class, 0);
//			c.setSentIndex(0);
			
			c.set(DocIDAnnotation.class, "document");
			c.setDocID("document");
			
			// Optional per-word annotations: copy only when present.
			if (w.hasAnnotation("pos"))
				c.set(PartOfSpeechAnnotation.class, w.getAnnotation("pos",String.class));
			
			if (w.hasAnnotation("lemma"))
				c.set(LemmaAnnotation.class, w.getAnnotation("lemma", String.class));
			
			if (w.hasAnnotation("nerLabel"))
				c.set(NamedEntityTagAnnotation.class, w.getAnnotation("nerLabel", String.class));
			
			if (w.hasAnnotation("nerValue"))
				c.set(NormalizedNamedEntityTagAnnotation.class, w.getAnnotation("nerValue", String.class));
			
			tokenAnnotations.add(c);
			// Key choice decides how callers look words up later (see javadoc).
			if (useWordOrderInsteadOfOffset){
				wordIndex.put(w.getOrder(), w);
			} else {
				wordIndex.put(w.getStartOffset(), w);
			}
		}
		//essential sentence annotation: TokensAnnotation
		sentenceAnnotation.set(TokensAnnotation.class, tokenAnnotations);
		//essential sentence annotation: TextAnnotation
		sentenceAnnotation.set(TextAnnotation.class, originalText);
		//essential sentence annotation: SentenceIndexAnnotation
		sentenceAnnotation.set(SentenceIndexAnnotation.class, 0);
		
		// Sentence character span: starts at 0 because the text is only this sentence.
		sentenceAnnotation.set(CharacterOffsetBeginAnnotation.class, 0);
		sentenceAnnotation.set(CharacterOffsetEndAnnotation.class, sentenceSpan.last().getEndOffset());
		sentenceAnnotation.set(TokenBeginAnnotation.class, 0);
		// NOTE(review): TokenEndAnnotation is conventionally an exclusive index;
		// last().getOrder() looks 0-based here — confirm this is not off by one.
		sentenceAnnotation.set(TokenEndAnnotation.class, sentenceSpan.last().getOrder());
		
		return a;
	}
 
Example #8
Source File: JsonPipeline.java    From tac2015-event-detection with GNU General Public License v3.0 4 votes vote down vote up
/** annotator is a stanford corenlp notion.  */
void addAnnoToSentenceObject(Map<String,Object> sent_info, CoreMap sentence, String annotator) {
	// Sentence-splitting / tokenization annotators leave nothing to copy.
	if (annotator.equals("tokenize") || annotator.equals("cleanxml") || annotator.equals("ssplit")) {
		return;
	}
	if (annotator.equals("pos")) {
		addTokenAnno(sent_info,sentence, "pos", PartOfSpeechAnnotation.class);
	} else if (annotator.equals("lemma")) {
		addTokenAnno(sent_info,sentence, "lemmas", LemmaAnnotation.class);
	} else if (annotator.equals("ner")) {
		// NER contributes both the raw tag and the normalized value.
		addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
		addTokenAnno(sent_info, sentence, "normner", NormalizedNamedEntityTagAnnotation.class);
	} else if (annotator.equals("regexner")) {
		addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
	} else if (annotator.equals("parse")) {
		// Constituency parse plus both dependency representations.
		addParseTree(sent_info,sentence);
		addDepsCC(sent_info,sentence);
		addDepsBasic(sent_info,sentence);
	} else if (annotator.equals("depparse")) {
		addDepsCC(sent_info,sentence);
		addDepsBasic(sent_info,sentence);
	} else if (annotator.equals("dcoref")) {
		// Coreference is document-level; nothing to add per sentence.
	} else if (annotator.equals("entitymentions")) {
		addEntityMentions(sent_info, sentence);
	} else if (annotator.equals("sentiment") || annotator.equals("truecase")
			|| annotator.equals("relation") || annotator.equals("natlog")
			|| annotator.equals("quote")) {
		// Recognized annotators that are not implemented yet.
		throw new RuntimeException("TODO");
	} else {
		throw new RuntimeException("don't know how to handle annotator " + annotator);
	}
}