edu.stanford.nlp.ling.CoreLabel Java Examples

The following examples show how to use edu.stanford.nlp.ling.CoreLabel. They are taken from open-source projects; each example notes its source file and license.
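Before the project-specific examples, here is a minimal sketch of the typical CoreLabel workflow: build a StanfordCoreNLP pipeline, annotate some text, and read per-token annotations off each CoreLabel. It assumes the standard CoreNLP distribution and its default English models on the classpath.

import java.util.Properties;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class CoreLabelBasics {
    public static void main(String[] args) {
        // a small pipeline: tokenize, sentence-split, POS-tag, lemmatize, NER
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        CoreDocument doc = new CoreDocument("Stanford University is in California.");
        pipeline.annotate(doc);

        // every token is a CoreLabel; word(), tag(), lemma() and ner() are convenience accessors
        for (CoreLabel token : doc.tokens()) {
            System.out.println(token.word() + "\t" + token.tag() + "\t" + token.lemma() + "\t" + token.ner());
        }
    }
}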
Example #1
Source File: JavaClient.java    From blog-codes with Apache License 2.0
public static void main(String[] args) {
	// creates a CoreNLP client with tokenization, sentence splitting, POS tagging, NER, dependency parsing, and Open IE
	Properties props = new Properties();
	
	props.setProperty("annotators", "tokenize,ssplit,pos,ner,depparse,openie"); 
	MultiLangsStanfordCoreNLPClient pipeline = new MultiLangsStanfordCoreNLPClient(props, "http://localhost", 9000, 2, null, null, "zh");

	// read some text in the text variable
	String text = "今天天气很好。"; // "The weather is very nice today."
	// create an empty Annotation just with the given text
	Annotation document = new Annotation(text);
	// run all Annotators on this text
	pipeline.annotate(document);
	
	CoreMap firstSentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
	// this for loop will print out all of the tokens and the character offset info
	for (CoreLabel token : firstSentence.get(CoreAnnotations.TokensAnnotation.class)) {
		System.out.println(token.word() + "\t" + token.beginPosition() + "\t" + token.endPosition());
	}
}
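Note that MultiLangsStanfordCoreNLPClient is a wrapper defined in the blog-codes project itself. The stock client shipped with CoreNLP is StanfordCoreNLPClient, which talks to a CoreNLP server over HTTP in the same way; a minimal sketch, assuming a server is already listening on localhost:9000 (e.g. started with java edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000):

import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLPClient;

public class ClientSketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos");
        // arguments: properties, host, port, number of threads
        StanfordCoreNLPClient pipeline = new StanfordCoreNLPClient(props, "http://localhost", 9000, 2);
        Annotation document = new Annotation("The quick brown fox jumped over the lazy dog.");
        pipeline.annotate(document); // annotation happens server-side
    }
}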
 
Example #2
Source File: NERSearcher.java    From Stargraph with MIT License
private List<LinkedNamedEntity> postProcessFoundNamedEntities(List<List<CoreLabel>> sentences) {
    final List<List<LinkedNamedEntity>> sentenceList = mergeConsecutiveNamedEntities(sentences);

    if (this.reverseNameOrder) {
        sentenceList.forEach(sentence -> {
            sentence.forEach(LinkedNamedEntity::reverseValue);
        });
    }

    if (sentenceList.isEmpty() || (sentenceList.size() == 1 && sentenceList.get(0).isEmpty())) {
        logger.trace(marker, "No Entities detected.");
        return Collections.emptyList();
    }

    return linkNamedEntities(sentenceList);
}
 
Example #3
Source File: IntelKBPAnnotator.java    From InformationExtraction with GNU General Public License v3.0
/**
 * Annotate all the pronominal mentions in the document.
 *
 * @param ann The document.
 * @return The list of pronominal mentions in the document.
 */
private static List<CoreMap> annotatePronominalMentions(Annotation ann) {
    List<CoreMap> pronouns = new ArrayList<>();
    List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
    for (int sentenceIndex = 0; sentenceIndex < sentences.size(); sentenceIndex++) {
        CoreMap sentence = sentences.get(sentenceIndex);
        Integer annoTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
        if (annoTokenBegin == null) {
            annoTokenBegin = 0;
        }

        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        for (int tokenIndex = 0; tokenIndex < tokens.size(); tokenIndex++) {
            CoreLabel token = tokens.get(tokenIndex);
            if (kbpIsPronominalMention(token)) {
                CoreMap pronoun = ChunkAnnotationUtils.getAnnotatedChunk(tokens, tokenIndex, tokenIndex + 1,
                        annoTokenBegin, null, CoreAnnotations.TextAnnotation.class, null);
                pronoun.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
                sentence.get(CoreAnnotations.MentionsAnnotation.class).add(pronoun);
                pronouns.add(pronoun);
            }
        }
    }

    return pronouns;
}
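The helper kbpIsPronominalMention is not shown in this excerpt. Judging from its use, it is a per-token predicate deciding whether a CoreLabel is a pronoun worth treating as a mention; a hypothetical sketch (the pronoun set below is illustrative, not the exact list CoreNLP uses):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import edu.stanford.nlp.ling.CoreLabel;

class PronominalMentionCheck {
    // illustrative pronoun list; the real implementation may differ
    private static final Set<String> PRONOUNS = new HashSet<>(Arrays.asList(
            "he", "him", "his", "she", "her", "hers"));

    static boolean kbpIsPronominalMention(CoreLabel word) {
        return PRONOUNS.contains(word.word().toLowerCase());
    }
}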
 
Example #4
Source File: MainTest.java    From dependensee with GNU General Public License v2.0
/**
 * Test of writeImage method, of class Main.
 */

@Test
public void testWriteImage() throws Exception {
    String text = "A quick brown fox jumped over the lazy dog.";
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    LexicalizedParser lp = LexicalizedParser.loadModel();
    lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
    Tree tree = lp.apply(wordList);
    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
    Main.writeImage(tdl, "image.png", 3);
    assert (new File("image.png").exists());
}
 
Example #5
Source File: RegexNerTest.java    From InformationExtraction with GNU General Public License v3.0
public static List<String> extractNER(String doc){
    Annotation document = new Annotation(doc);

    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    List<String> result = new ArrayList<String>();
    for(CoreMap sentence: sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token: sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            // this is the text of the token
            String word = token.get(CoreAnnotations.TextAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
            result.add(ne);
        }
    }
    return result;
}
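The static pipeline field is not shown in this excerpt. For a RegexNER test it would plausibly layer the regexner annotator on top of the standard NER stack; a sketch, where the mapping file name is hypothetical:

private static final StanfordCoreNLP pipeline;
static {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,regexner");
    // regexner.mapping points to a tab-separated rules file (one "pattern<TAB>TYPE" rule per line);
    // the file name here is illustrative
    props.setProperty("regexner.mapping", "my_rules.txt");
    pipeline = new StanfordCoreNLP(props);
}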
 
Example #6
Source File: Chapter5.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordPOSTagger() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos");
    props.put("pos.model", "C:\\Current Books in Progress\\NLP and Java\\Models\\english-caseless-left3words-distsim.tagger");
    props.put("pos.maxlen", 10);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(theSentence);
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.print(word + "/" + pos + " ");
        }
        System.out.println();
    }

    // print the full annotated document once, after the per-sentence token/POS output
    try {
        pipeline.xmlPrint(document, System.out);
        pipeline.prettyPrint(document, System.out);
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example #7
Source File: Chapter4.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordNER() {
    String model = getModelDir() + "\\english.conll.4class.distsim.crf.ser.gz";
    CRFClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(model);

    // join the sentences into one string, keeping a space between them so words don't merge
    StringBuilder sb = new StringBuilder();
    for (String element : sentences) {
        sb.append(element).append(' ');
    }
    String sentence = sb.toString();

    List<List<CoreLabel>> entityList = classifier.classify(sentence);

    for (List<CoreLabel> internalList : entityList) {
        for (CoreLabel coreLabel : internalList) {
            String word = coreLabel.word();
            String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);
            // only print tokens that received a named-entity label (not "O")
            if (!"O".equals(category)) {
                System.out.println(word + ":" + category);
            }
        }
    }
}
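Instead of walking the classified tokens and filtering out "O" labels, CRFClassifier also inherits classifyToCharacterOffsets from AbstractSequenceClassifier, which returns entity spans directly; a minimal sketch reusing the classifier and sentence above (Triple is edu.stanford.nlp.util.Triple):

List<Triple<String, Integer, Integer>> spans = classifier.classifyToCharacterOffsets(sentence);
for (Triple<String, Integer, Integer> span : spans) {
    // first() is the entity type; second()/third() are begin/end character offsets
    System.out.println(span.first() + ": " + sentence.substring(span.second(), span.third()));
}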
 
Example #8
Source File: GalicianReadability.java    From tint with GNU General Public License v3.0
@Override public void addingContentWord(CoreLabel token) {
    super.addingContentWord(token);

    token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 4);
    String lemma = token.lemma();
    if (model.getLevel3Lemmas().contains(lemma)) {
        level3WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 3);
    }
    if (model.getLevel2Lemmas().contains(lemma)) {
        level2WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 2);
    }
    if (model.getLevel1Lemmas().contains(lemma)) {
        level1WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 1);
    }
}
 
Example #9
Source File: Postprocess.java    From phrases with Apache License 2.0
public List<Pattern> run(List<Pattern> patterns) {

    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse, sentiment");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    for (Pattern pattern : patterns) {
        Annotation annotation = pipeline.process(pattern.toSentences());
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            Tree tree = sentence.get(SentimentCoreAnnotations.AnnotatedTree.class);
            int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
            }
        }
    }
    return null;
}
 
Example #10
Source File: Main.java    From dependensee with GNU General Public License v2.0
public static void writeImage(String sentence, String outFile, int scale) throws Exception {
    
    LexicalizedParser lp = null;
    try {
        lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    } catch (Exception e) {
        System.err.println("Could not load file englishPCFG.ser.gz. Try placing this file in the same directory as Dependencee.jar");
        return;
    }
    
    lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree tree = lp.apply(wordList);
    writeImage(tree, outFile, scale);
    
}
 
Example #11
Source File: Tokens.java    From ambiverse-nlu with Apache License 2.0
public static Tokens getTokensFromJCas(JCas jCas) {
  Tokens tokens = new Tokens();
  int s_number = 0; // DKPro does not provide a sentence index
  int t_number = 0;
  for (Sentence sentence : select(jCas, Sentence.class)) {
    List<de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token> dktokens = selectCovered(jCas,
        de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token.class, sentence);
    for (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token t : dktokens) {
      CoreLabel taggedWord = CoreNlpUtils.tokenToWord(t);
      // This step should ideally be avoided by transforming directly from the DKPro token to an
      // AIDA Token; the obstacle is POS mappings, since AIDA works with Stanford tags.
      Token aidaToken = new Token(t_number, t.getCoveredText(), t.getBegin(), t.getEnd(), 0);
      aidaToken.setPOS(taggedWord.get(CoreAnnotations.PartOfSpeechAnnotation.class));
      aidaToken.setSentence(s_number);
      tokens.addToken(aidaToken);
      t_number++;
    }
    s_number++;
  }
  return tokens;
}
 
Example #12
Source File: ProcessorTools.java    From phrasal with GNU General Public License v3.0
/**
 * Convert a raw/preprocessed String pair to a labeled sequence appropriate for training
 * the CRF-based post-processor.
 * 
 * The SymmetricalWordAlignment is created by a Preprocessor. Source is the raw input, target is
 * the tokenized/pre-processed output.
 * 
 * @return a labeled character sequence suitable for training the CRF post-processor
 */
public static List<CoreLabel> alignedPairToLabeledSequence(SymmetricalWordAlignment alignment) {
  List<CoreLabel> sequence = new ArrayList<>(alignment.eSize() * 7);
  
  for (int i = 0; i < alignment.fSize(); ++i) {
    if (sequence.size() > 0) sequence.add(createDatum(WHITESPACE, Operation.Whitespace.toString(), sequence.size(), WHITESPACE, 0));
    String token = alignment.f().get(i).toString();
    Set<Integer> eAlignments = alignment.f2e(i);
    if (eAlignments.size() == 0) {
      System.err.printf("%s: WARNING: discarding unaligned token (%s)%n", ProcessorTools.class.getName(), token);
      
    } else {
      List<String> eTokens = new ArrayList<>(eAlignments.size());
      for (int j : eAlignments) {
        eTokens.add(alignment.e().get(j).toString());
      }
      List<CoreLabel> charSequence = toSequence(token, eTokens, sequence.size());
      sequence.addAll(charSequence);
    }
  }
  return sequence;
}
 
Example #13
Source File: TestCustomLemmaAnnotator.java    From blog-codes with Apache License 2.0
@Test
public void test() {
	Properties props = new Properties();
	props.setProperty("annotators", "tokenize,ssplit,pos,custom.lemma");
	props.setProperty("customAnnotatorClass.custom.lemma", "com.fancyerii.blog.stanfordnlp.CustomLemmaAnnotator");
	props.setProperty("custom.lemma.lemmaFile", "custom-lemmas.txt");
	// set up pipeline
	StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
	
	CoreDocument exampleDocument = new CoreDocument("Some many goods there.");
	// annotate document
	pipeline.annotate(exampleDocument);
	// access tokens from a CoreDocument
	// a token is represented by a CoreLabel
	List<CoreLabel> firstSentenceTokens = exampleDocument.sentences().get(0).tokens();
	// this for loop will print out all of the tokens and the character offset info
	for (CoreLabel token : firstSentenceTokens) {
		System.out.println(token.word()+"/"+token.getString(LemmaAnnotation.class) + "\t" + token.beginPosition() + "\t" + token.endPosition());
	}
}
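The CustomLemmaAnnotator registered via customAnnotatorClass.custom.lemma must implement edu.stanford.nlp.pipeline.Annotator and expose a (String, Properties) constructor so the pipeline can instantiate it reflectively. A minimal skeleton sketch, assuming a recent CoreNLP where requires()/requirementsSatisfied() return annotation classes (the lemma lookup itself is left hypothetical):

import java.util.Collections;
import java.util.Properties;
import java.util.Set;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;

public class CustomLemmaAnnotator implements Annotator {

    public CustomLemmaAnnotator(String name, Properties props) {
        // e.g. read props.getProperty("custom.lemma.lemmaFile") and load the lemma dictionary
    }

    @Override
    public void annotate(Annotation annotation) {
        for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
            // look the word up in the dictionary here; fall back to the surface form
            token.set(CoreAnnotations.LemmaAnnotation.class, token.word());
        }
    }

    @Override
    public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
        return Collections.singleton(CoreAnnotations.LemmaAnnotation.class);
    }

    @Override
    public Set<Class<? extends CoreAnnotation>> requires() {
        return Collections.singleton(CoreAnnotations.TokensAnnotation.class);
    }
}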
 
Example #14
Source File: TokenizerDemo.java    From blog-codes with Apache License 2.0
public static void main(String[] args) throws IOException {
	for (String arg : args) {
		// option #1: By sentence.
		DocumentPreprocessor dp = new DocumentPreprocessor(arg);
		for (List<HasWord> sentence : dp) {
			System.out.println(sentence);
		}
		// option #2: By token
		PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new FileReader(arg), new CoreLabelTokenFactory(), "");
		while (ptbt.hasNext()) {
			CoreLabel label = ptbt.next();
			System.out.println(label);
		}
	}
}
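DocumentPreprocessor reads from any Reader, not just a file path, so the same sentence loop works over an in-memory String; a small sketch:

import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;

public class InMemoryTokenizer {
    public static void main(String[] args) {
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader("First sentence. Second one."));
        for (List<HasWord> sentence : dp) {
            System.out.println(sentence); // one tokenized sentence per line
        }
    }
}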
 
Example #15
Source File: Summarizer.java    From wiseowl with MIT License
private static Counter<String> getTermFrequencies(List<CoreMap> sentences) {
  Counter<String> ret = new ClassicCounter<String>();

  for (CoreMap sentence : sentences)
    for (CoreLabel cl : sentence.get(CoreAnnotations.TokensAnnotation.class))
      ret.incrementCount(cl.get(CoreAnnotations.TextAnnotation.class));

  return ret;
}
 
Example #16
Source File: Main.java    From dependensee with GNU General Public License v2.0
public static void writeImage(String sentence, String outFile, LexicalizedParser lp) throws Exception {

    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree parse = lp.apply(wordList);
    writeImage(parse, outFile);

}
 
Example #17
Source File: NERTool.java    From Criteria2Query with Apache License 2.0
public static void train(String traindatapath, String targetpath) {
    /* Learn the classifier from the training data */
    String trainFile = traindatapath;
    String serializeFileLoc = targetpath;
    // properties: https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/NERFeatureFactory.html
    Properties props = new Properties();
    props.put("trainFile", trainFile); // to train with multiple files, use a comma-separated list
    props.put("map", "word=0,answer=1");
    props.put("useClassFeature", "true");
    props.put("useNGrams", "true");
    props.put("noMidNGrams", "true");
    props.put("maxNGramLeng", "6");
    props.put("useDisjunctive", "true");
    props.put("usePrev", "true");
    props.put("useNext", "true");
    props.put("useSequences", "true");
    props.put("usePrevSequences", "true");
    props.put("maxLeft", "1");
    props.put("useTypeSeqs", "true");
    props.put("useTypeSeqs2", "true");
    props.put("useTypeySequences", "true");
    props.put("wordShape", "chris2useLC");
    // props.put("printFeatures", "true");
    // This feature can be turned off in recent versions with the flag -useKnownLCWords false
    // https://nlp.stanford.edu/software/crf-faq.html, question 13

    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(flags);
    crf.train();
    crf.serializeClassifier(serializeFileLoc);
}
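Once serialized, the model can be loaded back and applied with the same CRFClassifier API used in Example #7; a minimal sketch:

// load the model trained above and tag raw text with inline slash-tags
CRFClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(targetpath);
String tagged = classifier.classifyToString("Barack Obama was born in Hawaii.");
System.out.println(tagged);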
 
Example #18
Source File: CoreNLPUtils.java    From minie with GNU General Public License v3.0
/**
 * Given a semantic graph of a whole sentence (sg) and a "local root" node, get the subgraph from 'sg' which has 
 * 'localRoot' as a root. 
 * @param sg: semantic graph of the whole sentence
 * @param localRoot: the root of the subgraph
 * @return semantic graph object which is the subgraph from 'sg'
 */
public static SemanticGraph getSubgraph(SemanticGraph sg, IndexedWord localRoot){
    ObjectArrayList<TypedDependency> subGraphDependencies = getSubgraphTypedDependencies(sg, localRoot, 
                                                                        new ObjectArrayList<TypedDependency>());
    TreeGraphNode rootTGN = new TreeGraphNode(new CoreLabel(localRoot));
    EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(subGraphDependencies, rootTGN);
    return SemanticGraphFactory.generateUncollapsedDependencies(gs);
}
 
Example #19
Source File: CRFPostprocessor.java    From phrasal with GNU General Public License v3.0
/**
 * Train a model given a preprocessor.
 * 
 * @param preProcessor
 */
protected void train(Preprocessor preProcessor) {
  DocumentReaderAndWriter<CoreLabel> docReader = 
      new ProcessorTools.PostprocessorDocumentReaderAndWriter(preProcessor);
  ObjectBank<List<CoreLabel>> lines =
    classifier.makeObjectBankFromFile(flags.trainFile, docReader);

  classifier.train(lines, docReader);
  System.err.println("Finished training.");
}
 
Example #20
Source File: Phrase.java    From minie with GNU General Public License v3.0
/** Remove a set of words represented as core labels from the list of indexed words **/
public void removeCoreLabelWordsFromList(List<CoreMap> cmWords){
    ObjectArrayList<IndexedWord> rWords = new ObjectArrayList<>();
    for (CoreMap cm: cmWords){
        rWords.add(new IndexedWord(new CoreLabel(cm)));
    }
    this.removeWordsFromList(rWords);
}
 
Example #21
Source File: Summarizer.java    From wiseowl with MIT License
private double tfIDFWeights(CoreMap sentence) {
  double total = 0;
  for (CoreLabel cl : sentence.get(CoreAnnotations.TokensAnnotation.class))
    // Penn Treebank noun tags are uppercase (NN, NNS, NNP, NNPS)
    if (cl.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("N"))
      total += tfIDFWeight(cl.get(CoreAnnotations.TextAnnotation.class));

  return total;
}
 
Example #22
Source File: CoreNLPUtils.java    From minie with GNU General Public License v3.0
/**
 * Given a list of words (as core maps), return the phrase of words as a list of indexed word objects
 * @param words: list of words (e.g. [She, is, pretty])
 * @return list of words (as IndexedWord)
 */
public static ObjectArrayList<IndexedWord> listOfCoreMapWordsToIndexedWordList(List<CoreMap> cmList){
    ObjectArrayList<IndexedWord> wordList = new ObjectArrayList<>();
    for (CoreMap cm: cmList){
        wordList.add(new IndexedWord(new CoreLabel(cm)));
    }
    return wordList;
}
 
Example #23
Source File: CoreNLPUtils.java    From minie with GNU General Public License v3.0
public static ObjectArrayList<CoreLabel> getCoreLabelListFromIndexedWordList(ObjectArrayList<IndexedWord> words) {
    ObjectArrayList<CoreLabel> coreLabelList = new ObjectArrayList<>();
    for (IndexedWord w: words) {
        coreLabelList.add(new CoreLabel(w));
    }
    return coreLabelList;
}
 
Example #24
Source File: CoreNLPUtils.java    From minie with GNU General Public License v3.0
public static ObjectArrayList<IndexedWord> getWordListFromCoreMapList(List<CoreMap> coreMapList){
    ObjectArrayList<IndexedWord> wordList = new ObjectArrayList<>();
    for (CoreMap cm: coreMapList){
        wordList.add(new IndexedWord(new CoreLabel(cm)));
    }
    return wordList;
}
 
Example #25
Source File: UPosAnnotator.java    From tint with GNU General Public License v3.0
@Override
public void annotate(Annotation annotation) {
    for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
        String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);

        String[] parts = pos.split("\\+");
        StringBuffer upos = new StringBuffer();
        for (String part : parts) {
            String thisPos = uposMap.getOrDefault(part, DEFAULT_UPOS);
            upos.append("+").append(thisPos);
        }
        token.set(CustomAnnotations.UPosAnnotation.class, upos.substring(1));
    }

}
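The uposMap field is not shown in this excerpt; it maps language-specific POS tags to Universal POS tags, with DEFAULT_UPOS as the fallback. A hypothetical fragment of what such a map could look like (the entries are illustrative, not tint's actual table):

private static final Map<String, String> uposMap = new HashMap<>();
private static final String DEFAULT_UPOS = "X";
static {
    // a few illustrative entries; a real map covers the whole tagset
    uposMap.put("NN", "NOUN");
    uposMap.put("VB", "VERB");
    uposMap.put("JJ", "ADJ");
}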
 
Example #26
Source File: CoreNLPUtils.java    From minie with GNU General Public License v3.0
public static ObjectOpenHashSet<IndexedWord> getWordSetFromCoreMapList(List<CoreMap> coreMapList){
    ObjectOpenHashSet<IndexedWord> wordSet = new ObjectOpenHashSet<>();
    for (CoreMap cm: coreMapList){
        wordSet.add(new IndexedWord(new CoreLabel(cm)));
    }
    return wordSet;
}
 
Example #27
Source File: Minimization.java    From minie with GNU General Public License v3.0
/** If the given phrase contains a verb phrase, perform a verb-phrase-safe minimization **/
public void verbPhraseSafeMinimization(List<CoreMap> remWords, List<CoreMap> matchWords){
    // Flags for checking certain conditions
    boolean isAdverb;
    boolean isNotNER;
    boolean containsNEG;
    
    // If the relation starts with a RB+ VB+, drop RB+
    this.tPattern = TokenSequencePattern.compile(REGEX.T_RB_VB);
    this.tMatcher = tPattern.getMatcher(this.phrase.getWordCoreLabelList());
    while (this.tMatcher.find()){   
        matchWords = tMatcher.groupNodes();
        
        for (CoreMap cm: matchWords){
            CoreLabel cl = new CoreLabel(cm);
            if (cl.lemma() == null) cl.setLemma(cl.word());
            
            isAdverb = CoreNLPUtils.isAdverb(cl.tag());
            isNotNER = cl.ner().equals(NE_TYPE.NO_NER);
            containsNEG = Polarity.NEG_WORDS.contains(cl.lemma().toLowerCase());
            
            // Drop adverbs (RB) that are not part of a named entity and are not negation words
            if (isAdverb && isNotNER && !containsNEG){
                remWords.add(cm);   
            }
        }
        this.dropWords(remWords, matchWords);
    }
}
 
Example #28
Source File: CoreNLPTokenizer.java    From Heracles with GNU General Public License v3.0
/**
 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
 * The Spans denoted by spanType must provide an annotation of type "text".
 * This spanType does not have to be the textual unit.
 */
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfTextUnit) {
    if (dataset.getPerformedNLPTasks().contains(getTask())) {
        Framework.error("This dataset has already been tokenized.");
        return;
    }

    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "tokenize");
    //prop1.setProperty("options", "splitHyphenated=true");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

    for (Span span : dataset.getSpans(spanTypeOfTextUnit)) {
        Span textualUnit = span.getTextualUnit();
        String originalText = span.getAnnotation("text", String.class);
        Annotation a = new Annotation(originalText);
        pipeline.annotate(a);
        List<CoreLabel> tokenAnnotations = a.get(TokensAnnotation.class);
        Word previousWord = null;
        if (!textualUnit.isEmpty())
            previousWord = textualUnit.last();

        for (CoreLabel token : tokenAnnotations) {
            String word = token.get(OriginalTextAnnotation.class);
            int startOffset = token.get(CharacterOffsetBeginAnnotation.class);
            //int endOffset = token.get(CharacterOffsetEndAnnotation.class);
            if (previousWord == null) {
                previousWord = new Word(word, startOffset, textualUnit, dataset);
            } else {
                previousWord = new Word(word, startOffset, previousWord);
            }
            // add the new word to the sentence span; if span == textualUnit this has no effect
            if (!textualUnit.equals(span))
                span.add(previousWord);
        }
    }
}
 
Example #29
Source File: MinIE.java    From minie with GNU General Public License v3.0
/**
 * Given an object phrase, check whether it has infinitive verbs modifying a noun phrase or a named entity.
 * @param object: the object phrase
 * @return true if such a construction starts the phrase, false otherwise
 */
public boolean pushInfinitiveVerb(Phrase object){
    TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_TO_VB_NP_NER);
    TokenSequenceMatcher tMatcher = tPattern.getMatcher(object.getWordCoreLabelList());
    
    while (tMatcher.find()){         
        CoreLabel firstWordMatch = new CoreLabel(tMatcher.groupNodes().get(0));
        if (firstWordMatch.index() == object.getWordList().get(0).index()){
            return true;
        }
    }
    
    return false;
}
 
Example #30
Source File: Minimization.java    From minie with GNU General Public License v3.0
/**
 * Given a list of matched core maps (a phrase) and a list of words which are candidates for dropping ('remWords'), 
 * check if some of them form sub-constituents of 'matchCoreMaps' which are found in the dictionary.
 * If any do, remove them from 'remWords'. The words left in 'remWords' are the ones that could not be matched
 * with a sub-constituent found in the dictionary, i.e. those are the ones that we drop.
 * @param matchCoreMaps: list of words as a list of CoreMap objects (a phrase)
 * @param remWords: list of candidates to be dropped (each word in 'remWords' also appears in 'matchCoreMaps')
 */
public void dropWordsNotFoundInDict(List<CoreMap> matchCoreMaps, List<CoreMap> remWords){
    // Get all the sub-constituents
    ObjectArrayList<IndexedWord> words = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(matchCoreMaps);
    SubConstituent sc = new SubConstituent(this.sg, CoreNLPUtils.getRootFromWordList(this.sg, words), words);
    sc.generateSubConstituentsFromLeft();
    ObjectOpenHashSet<String> subconstituents = sc.getStringSubConstituents();
    
    // Sub-constituents' strings found in the dictionary
    ObjectArrayList<String> scStringsInDict = new ObjectArrayList<>();
    for (String s: subconstituents){
        if (this.mwe.contains(s)){
            scStringsInDict.add(s);
        }
    }
    
    // If sub-constituents' strings are found in the dictionary, detect the words associated with them
    // and remove them from the drop list.
    if (scStringsInDict.size() > 0){
        for (String stInDict: scStringsInDict){
            // take a fresh iterator per dictionary string; an exhausted iterator cannot be reused
            Iterator<CoreMap> iter = remWords.iterator();
            while (iter.hasNext()){
                CoreMap cm = iter.next();
                CoreLabel cl = new CoreLabel(cm);
                if (stInDict.contains(cl.lemma().toLowerCase())){
                    iter.remove();
                }
            }
        }
    }
    
    // Drop the words not found in frequent/collocation sub-constituents
    this.dropWords(remWords, matchCoreMaps);
}