Java Code Examples for edu.stanford.nlp.ling.CoreLabel

The following examples show how to use edu.stanford.nlp.ling.CoreLabel. These examples are extracted from open source projects; the source project, file, and license are noted above each example.
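Before the examples themselves, it helps to remember that a CoreLabel is a CoreMap specialized for a single token: every annotation is stored under a key class, and convenience accessors such as word(), tag(), ner(), and beginPosition() are typed shortcuts for get(...). Here is a minimal, self-contained sketch (the class name CoreLabelBasics is illustrative, not taken from any of the projects below):

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;

public class CoreLabelBasics {
    public static void main(String[] args) {
        CoreLabel token = new CoreLabel();
        // each setter is a shortcut for token.set(SomeAnnotation.class, value)
        token.setWord("fox");       // CoreAnnotations.TextAnnotation
        token.setTag("NN");         // CoreAnnotations.PartOfSpeechAnnotation
        token.setNER("O");          // CoreAnnotations.NamedEntityTagAnnotation
        token.setBeginPosition(16); // character offset where the token starts
        token.setEndPosition(19);   // character offset where the token ends

        // the typed accessor and the raw key lookup return the same value
        System.out.println(token.word());
        System.out.println(token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
        System.out.println(token.beginPosition() + "-" + token.endPosition());
    }
}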
Example 1
Source Project: blog-codes   Source File: JavaClient.java    License: Apache License 2.0
public static void main(String[] args) {
	// creates a StanfordCoreNLP client with tokenization, sentence splitting, POS tagging, NER, dependency parsing, and Open IE
	Properties props = new Properties();
	
	props.setProperty("annotators", "tokenize,ssplit,pos,ner,depparse,openie"); 
	MultiLangsStanfordCoreNLPClient pipeline = new MultiLangsStanfordCoreNLPClient(props, "http://localhost", 9000, 2, null, null, "zh");

	// read some text in the text variable
	String text = "今天天气很好。";
	// create an empty Annotation just with the given text
	Annotation document = new Annotation(text);
	// run all Annotators on this text
	pipeline.annotate(document);
	
	CoreMap firstSentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
	// this for loop will print out all of the tokens and the character offset info
	for (CoreLabel token : firstSentence.get(CoreAnnotations.TokensAnnotation.class)) {
		System.out.println(token.word() + "\t" + token.beginPosition() + "\t" + token.endPosition());
	}
}
 
Example 2
Source Project: blog-codes   Source File: TestCustomLemmaAnnotator.java    License: Apache License 2.0
@Test
public void test() {
	Properties props = new Properties();
	props.setProperty("annotators", "tokenize,ssplit,pos,custom.lemma");
	props.setProperty("customAnnotatorClass.custom.lemma", "com.fancyerii.blog.stanfordnlp.CustomLemmaAnnotator");
	props.setProperty("custom.lemma.lemmaFile", "custom-lemmas.txt");
	// set up pipeline
	StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
	
	CoreDocument exampleDocument = new CoreDocument("Some many goods there.");
	// annotate document
	pipeline.annotate(exampleDocument);
	// access tokens from a CoreDocument
	// a token is represented by a CoreLabel
	List<CoreLabel> firstSentenceTokens = exampleDocument.sentences().get(0).tokens();
	// this for loop will print out all of the tokens and the character offset info
	for (CoreLabel token : firstSentenceTokens) {
		System.out.println(token.word()+"/"+token.getString(LemmaAnnotation.class) + "\t" + token.beginPosition() + "\t" + token.endPosition());
	}
}
 
Example 3
Source Project: ambiverse-nlu   Source File: Tokens.java    License: Apache License 2.0
public static Tokens getTokensFromJCas(JCas jCas) {
  Tokens tokens = new Tokens();
  int s_number = 0; // DKPro does not provide a sentence index, so count sentences manually
  int t_number = 0;
  for (Sentence sentence : select(jCas, Sentence.class)) {
    List<de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token> dktokens = selectCovered(jCas,
        de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token.class, sentence);
    for (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token t : dktokens) {
      CoreLabel taggedWord = CoreNlpUtils.tokenToWord(t);
      // Ideally this conversion would go directly from the DKPro token to an AIDA token,
      // but the POS mappings differ: AIDA works with Stanford tags.
      Token aidaToken = new Token(t_number, t.getCoveredText(), t.getBegin(), t.getEnd(), 0);
      aidaToken.setPOS(taggedWord.get(CoreAnnotations.PartOfSpeechAnnotation.class));
      aidaToken.setSentence(s_number);
      tokens.addToken(aidaToken);
      t_number++;
    }
    s_number++;
  }
  return tokens;
}
 
Example 4
Source Project: phrasal   Source File: ProcessorTools.java    License: GNU General Public License v3.0
/**
 * Convert a raw/preprocessed String pair to a labeled sequence appropriate for training
 * the CRF-based post-processor.
 * 
 * The SymmetricalWordAlignment is created by a Preprocessor. Source is the raw input, target is
 * the tokenized/pre-processed output.
 * 
 * @return the labeled CoreLabel sequence
 */
public static List<CoreLabel> alignedPairToLabeledSequence(SymmetricalWordAlignment alignment) {
  List<CoreLabel> sequence = new ArrayList<>(alignment.eSize() * 7);
  
  for (int i = 0; i < alignment.fSize(); ++i) {
    if (sequence.size() > 0) sequence.add(createDatum(WHITESPACE, Operation.Whitespace.toString(), sequence.size(), WHITESPACE, 0));
    String token = alignment.f().get(i).toString();
    Set<Integer> eAlignments = alignment.f2e(i);
    if (eAlignments.size() == 0) {
      System.err.printf("%s: WARNING: discarding unaligned token (%s)%n", ProcessorTools.class.getName(), token);
      
    } else {
      List<String> eTokens = new ArrayList<>(eAlignments.size());
      for (int j : eAlignments) {
        eTokens.add(alignment.e().get(j).toString());
      }
      List<CoreLabel> charSequence = toSequence(token, eTokens, sequence.size());
      sequence.addAll(charSequence);
    }
  }
  return sequence;
}
 
Example 5
Source Project: dependensee   Source File: Main.java    License: GNU General Public License v2.0
public static void writeImage(String sentence, String outFile, int scale) throws Exception {
    
    LexicalizedParser lp = null;
    try {
        lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    } catch (Exception e) {
        System.err.println("Could not load file englishPCFG.ser.gz. Try placing this file in the same directory as Dependencee.jar");
        return;
    }
    
    lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree tree = lp.apply(wordList);
    writeImage(tree, outFile, scale);
    
}
 
Example 6
Source Project: tint   Source File: GalicianReadability.java    License: GNU General Public License v3.0
@Override public void addingContentWord(CoreLabel token) {
    super.addingContentWord(token);

    token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 4); // default level; overridden below if the lemma is in a level-1/2/3 list
    String lemma = token.lemma();
    if (model.getLevel3Lemmas().contains(lemma)) {
        level3WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 3);
    }
    if (model.getLevel2Lemmas().contains(lemma)) {
        level2WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 2);
    }
    if (model.getLevel1Lemmas().contains(lemma)) {
        level1WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 1);
    }
}
 
Example 7
private static void usingStanfordNER() {
    String model = getModelDir() + "\\english.conll.4class.distsim.crf.ser.gz";
    CRFClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(model);

    StringBuilder sentence = new StringBuilder();
    for (String element : sentences) {
        sentence.append(element);
    }

    List<List<CoreLabel>> entityList = classifier.classify(sentence.toString());

    for (List<CoreLabel> internalList : entityList) {
        for (CoreLabel coreLabel : internalList) {
            String word = coreLabel.word();
            String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);
            // only report tokens that belong to a named entity (category != "O")
            if (!"O".equals(category)) {
                System.out.println(word + ":" + category);
            }
        }
    }
}
 
Example 8
private static void usingStanfordPOSTagger() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos");
    props.put("pos.model", "C:\\Current Books in Progress\\NLP and Java\\Models\\english-caseless-left3words-distsim.tagger");
    // Properties values must be Strings; an Integer here would be ignored by getProperty()
    props.put("pos.maxlen", "10");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(theSentence);
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.print(word + "/" + pos + " ");
        }
        System.out.println();
    }

    // print the full annotated document once
    try {
        pipeline.xmlPrint(document, System.out);
        pipeline.prettyPrint(document, System.out);
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example 9
Source Project: Stargraph   Source File: NERSearcher.java    License: MIT License
private List<LinkedNamedEntity> postProcessFoundNamedEntities(List<List<CoreLabel>> sentences) {
    final List<List<LinkedNamedEntity>> sentenceList = mergeConsecutiveNamedEntities(sentences);

    if (this.reverseNameOrder) {
        sentenceList.forEach(sentence -> {
            sentence.forEach(LinkedNamedEntity::reverseValue);
        });
    }

    if (sentenceList.isEmpty() || (sentenceList.size() == 1 && sentenceList.get(0).isEmpty())) {
        logger.trace(marker, "No Entities detected.");
        return Collections.emptyList();
    }

    return linkNamedEntities(sentenceList);
}
 
Example 10
Source Project: phrases   Source File: Postprocess.java    License: Apache License 2.0
public List<Pattern> run(List<Pattern> patterns) {

    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse, sentiment");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    for (Pattern pattern : patterns) {
        Annotation annotation = pipeline.process(pattern.toSentences());
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            Tree tree = sentence.get(SentimentCoreAnnotations.AnnotatedTree.class);
            int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                // the sentiment score and lemma are computed but not used further in this snippet
                String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
            }
        }
    }
    return null;
}
 
Example 11
Source Project: dependensee   Source File: MainTest.java    License: GNU General Public License v2.0
/**
 * Test of writeImage method, of class Main.
 */

@Test
public void testWriteImage() throws Exception {
    String text = "A quick brown fox jumped over the lazy dog.";
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    LexicalizedParser lp = LexicalizedParser.loadModel();
    lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
    Tree tree = lp.apply(wordList);
    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
    Main.writeImage(tdl, "image.png", 3);
    assert (new File("image.png").exists());
}
 
Example 12
/**
 * Annotate all the pronominal mentions in the document.
 *
 * @param ann The document.
 * @return The list of pronominal mentions in the document.
 */
private static List<CoreMap> annotatePronominalMentions(Annotation ann) {
    List<CoreMap> pronouns = new ArrayList<>();
    List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
    for (int sentenceIndex = 0; sentenceIndex < sentences.size(); sentenceIndex++) {
        CoreMap sentence = sentences.get(sentenceIndex);
        Integer annoTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
        if (annoTokenBegin == null) {
            annoTokenBegin = 0;
        }

        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        for (int tokenIndex = 0; tokenIndex < tokens.size(); tokenIndex++) {
            CoreLabel token = tokens.get(tokenIndex);
            if (kbpIsPronominalMention(token)) {
                CoreMap pronoun = ChunkAnnotationUtils.getAnnotatedChunk(tokens, tokenIndex, tokenIndex + 1,
                        annoTokenBegin, null, CoreAnnotations.TextAnnotation.class, null);
                pronoun.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
                sentence.get(CoreAnnotations.MentionsAnnotation.class).add(pronoun);
                pronouns.add(pronoun);
            }
        }
    }

    return pronouns;
}
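
The helper kbpIsPronominalMention used above is not part of this snippet. As a rough, hypothetical stand-in, a predicate over Penn Treebank pronoun tags would look like the following (the real KBP check in CoreNLP matches tokens against pronoun word lists, so this is only a sketch, and it assumes the pos annotator has run):

private static boolean kbpIsPronominalMention(CoreLabel token) {
    // PRP and PRP$ are the Penn Treebank personal and possessive pronoun tags
    String pos = token.tag();
    return "PRP".equals(pos) || "PRP$".equals(pos);
}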
 
Example 13
public static List<String> extractNER(String doc){
    Annotation document = new Annotation(doc);

    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    List<String> result = new ArrayList<String>();
    for(CoreMap sentence: sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token: sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            // this is the text of the token
            String word = token.get(CoreAnnotations.TextAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
            result.add(ne);
        }
    }
    return result;
}
 
Example 14
Source Project: blog-codes   Source File: TokenizerDemo.java    License: Apache License 2.0
public static void main(String[] args) throws IOException {
	for (String arg : args) {
		// option #1: By sentence.
		DocumentPreprocessor dp = new DocumentPreprocessor(arg);
		for (List<HasWord> sentence : dp) {
			System.out.println(sentence);
		}
		// option #2: By token
		PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new FileReader(arg), new CoreLabelTokenFactory(), "");
		while (ptbt.hasNext()) {
			CoreLabel label = ptbt.next();
			System.out.println(label);
		}
	}
}
 
Example 15
Source Project: Shour   Source File: CoreNLP.java    License: MIT License
public static List<String> tokensToStrings(List<CoreLabel> tokens) {
    List<String> words = new ArrayList<String>();
    for (CoreLabel label : tokens) {
        words.add(getWord(label));
    }
    return words;
}
 
Example 16
Source Project: jstarcraft-nlp   Source File: CoreNlpTokenizer.java    License: Apache License 2.0
@Override
public Iterable<CoreNlpToken> tokenize(CharSequence text) {
    Iterable<CoreLabel> tokens;
    if (StringUtility.isBlank(text)) {
        // blank input requires no tokenization
        tokens = Collections.emptyList();
    } else {
        Annotation annotation = new Annotation(text.toString());
        annotator.annotate(annotation);
        tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    }
    return new CoreNlpToken(tokens.iterator());
}
 
Example 17
Source Project: analyzer-solr   Source File: StanfordAdapter.java    License: MIT License
/**
 * Builds a CRF-based Chinese segmenter from the models in modelDir.
 */
private StanfordAdapter(Reader input, String modelDir) {
	Properties props = new Properties();
	props.setProperty("sighanCorporaDict", modelDir);
	// props.setProperty("NormalizationTable", "data/norm.simp.utf8");
	// props.setProperty("normTableEncoding", "UTF-8");
	// below is needed because CTBSegDocumentIteratorFactory accesses it
	props.setProperty("serDictionary", modelDir + "/dict-chris6.ser.gz" + "," +  modelDir + "/dict-chris6.ser.gz");
	props.setProperty("inputEncoding", "UTF-8");
	props.setProperty("sighanPostProcessing", "true");

	segmenter = new CRFClassifier<CoreLabel>(props);
	segmenter.loadClassifierNoExceptions(modelDir + "/ctb.gz", props);
}
 
Example 18
@Override
public List<Term> segment(String sentence) {
    CoreDocument exampleDocument = new CoreDocument(sentence);
    // annotate document
    stanfordCoreNLP.annotate(exampleDocument);
    // access tokens from a CoreDocument
    // a token is represented by a CoreLabel
    List<CoreLabel> firstSentenceTokens = exampleDocument.sentences().get(0).tokens();
    // collect each token's word and POS tag as a Term
    List<Term> terms = new ArrayList<>();
    for (CoreLabel token : firstSentenceTokens) {
        terms.add(new Term(token.word(), token.tag()));
    }
    return terms;
}
 
Example 19
Source Project: ambiverse-nlu   Source File: ClausIE.java    License: Apache License 2.0
public ClausIE(LexicalizedParser lp, TokenizerFactory<CoreLabel> tokenizerFactory, LexicalizedParserQuery lpq)
    throws IOException, URISyntaxException {
  this(new Options());
  this.lp = lp;
  this.tokenizerFactory = tokenizerFactory;
  this.lpq = lpq;
}
 
Example 20
Source Project: coreNlp   Source File: StopwordAnnotator.java    License: Apache License 2.0
@Override
public void annotate(Annotation annotation) {
    if (stopwords != null && stopwords.size() > 0 && annotation.containsKey(TokensAnnotation.class)) {
        List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            boolean isWordStopword = stopwords.contains(token.word().toLowerCase());
            boolean isLemmaStopword = checkLemma ? stopwords.contains(token.lemma().toLowerCase()) : false;
            Pair<Boolean, Boolean> pair = Pair.makePair(isWordStopword, isLemmaStopword);
            token.set(StopwordAnnotator.class, pair);
        }
    }
}
 
Example 21
Source Project: coreNlp   Source File: StopwordAnnotatorTest.java    License: Apache License 2.0
/**
 * Test to validate that the custom stopword list works
 * @throws Exception
 */
@org.junit.Test
public void testCustomStopwordList() throws Exception {

    //setup coreNlp properties for stopwords. Note the custom stopword list property
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, stopword");
    props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");
    props.setProperty(StopwordAnnotator.STOPWORDS_LIST, customStopWordList);

    //get the custom stopword set
    Set<?> stopWords = StopwordAnnotator.getStopWordList(Version.LUCENE_36, customStopWordList, true);

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(example);
    pipeline.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    for (CoreLabel token : tokens) {

        //get the stopword annotation
        Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

        String word = token.word().toLowerCase();
        if (stopWords.contains(word)) {
            assertTrue(stopword.first());
        }
        else {
            assertFalse(stopword.first());
        }

        //not checking lemma, so always false
        assertFalse(stopword.second());
    }
}
 
Example 22
Source Project: gAnswer   Source File: CoreNLP.java    License: BSD 3-Clause "New" or "Revised" License
public String getBaseFormOfPattern (String text) {
	StringBuilder ret = new StringBuilder();

	// create an empty Annotation just with the given text
	Annotation document = new Annotation(text);
	// run all Annotators on this text
	pipeline_lemma.annotate(document);

	// these are all the sentences in this document
	// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
	List<CoreMap> sentences = document.get(SentencesAnnotation.class);

	int count = 0;
	for (CoreMap sentence : sentences) {
		// traversing the words in the current sentence
		// a CoreLabel is a CoreMap with additional token-specific methods
		for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
			// this is the base form (lemma) of the token
			String lemma = token.getString(LemmaAnnotation.class);
			ret.append(lemma).append(' ');
		}
		count++;
		// log progress every 100 sentences
		if (count % 100 == 0) {
			System.out.println(count);
		}
	}

	// drop the trailing space; guard against empty input
	return ret.length() == 0 ? "" : ret.substring(0, ret.length() - 1);
}
 
Example 23
Source Project: gAnswer   Source File: CoreNLP.java    License: BSD 3-Clause "New" or "Revised" License
public Word[] getTaggedWords (String sentence) {
	CoreMap taggedSentence = getPOS(sentence);
	Word[] ret = new Word[taggedSentence.get(TokensAnnotation.class).size()];
	int count = 0;
	for (CoreLabel token : taggedSentence.get(TokensAnnotation.class)) {
		// this is the text of the token
		String word = token.get(TextAnnotation.class);
		// this is the POS tag of the token
		String pos = token.get(PartOfSpeechAnnotation.class);
		ret[count] = new Word(getBaseFormOfPattern(word.toLowerCase()), word, pos, count + 1);
		count++;
	}
	return ret;
}
 
Example 24
Source Project: gAnswer   Source File: NERecognizer.java    License: BSD 3-Clause "New" or "Revised" License
public void recognize(Sentence sentence) {
	List<CoreLabel> lcl = classifier.classify(sentence.plainText).get(0);
	for (CoreLabel cl : lcl) {
		int position = Integer.parseInt(cl.get(PositionAnnotation.class))+1;
		Word w = sentence.getWordByIndex(position);
		String ner = cl.get(AnswerAnnotation.class);
		if (ner.equals("O")) w.ner = null;
		else w.ner = ner;
	}
}
 
Example 25
Source Project: core-nlp-example   Source File: CoreNlpExample.java    License: MIT License
public static void main(String[] args) {

    // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // read some text in the text variable
    String text = "What is the Weather in Bangalore right now?";

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    // run all Annotators on this text
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            // this is the text of the token
            String word = token.get(CoreAnnotations.TextAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

            System.out.println(String.format("Print: word: [%s] pos: [%s] ne: [%s]", word, pos, ne));
        }
    }
}
 
Example 26
Source Project: dependensee   Source File: Main.java    License: GNU General Public License v2.0
public static void writeImage(String sentence, String outFile, LexicalizedParser lp) throws Exception {

    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree parse = lp.apply(wordList);
    writeImage(parse, outFile);

}
 
Example 27
Source Project: Criteria2Query   Source File: NERTool.java    License: Apache License 2.0
public static void train(String traindatapath, String targetpath) {
	long startTime = System.nanoTime();
	/* Learn the classifier from the training data */
	String trainFile = traindatapath;
	String serializeFileLoc = targetpath;
	// properties: https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/NERFeatureFactory.html
	Properties props = new Properties();
	props.put("trainFile", trainFile); // to train with multiple files, use a comma-separated list
	props.put("map", "word=0,answer=1");
	props.put("useClassFeature", "true");
	props.put("useNGrams", "true");
	props.put("noMidNGrams", "true");
	props.put("maxNGramLeng", "6");
	props.put("useDisjunctive", "true");
	props.put("usePrev", "true");
	props.put("useNext", "true");
	props.put("useSequences", "true");
	props.put("usePrevSequences", "true");
	props.put("maxLeft", "1");
	props.put("useTypeSeqs", "true");
	props.put("useTypeSeqs2", "true");
	props.put("useTypeySequences", "true");
	props.put("wordShape", "chris2useLC");
	// props.put("printFeatures", "true");
	// This feature can be turned off in recent versions with the flag -useKnownLCWords false
	// https://nlp.stanford.edu/software/crf-faq.html question 13

	SeqClassifierFlags flags = new SeqClassifierFlags(props);
	CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(flags);
	crf.train();
	crf.serializeClassifier(serializeFileLoc);
}
 
Example 28
Source Project: minie   Source File: CoreNLPUtils.java    License: GNU General Public License v3.0
/**
 * Given a semantic graph of a whole sentence (sg) and a "local root" node, get the subgraph from 'sg' which has 
 * 'localRoot' as a root. 
 * @param sg: semantic graph of the whole sentence
 * @param localRoot: the root of the subgraph
 * @return semantic graph object which is the subgraph from 'sg'
 */
public static SemanticGraph getSubgraph(SemanticGraph sg, IndexedWord localRoot){
    ObjectArrayList<TypedDependency> subGraphDependencies = getSubgraphTypedDependencies(sg, localRoot, 
                                                                        new ObjectArrayList<TypedDependency>());
    TreeGraphNode rootTGN = new TreeGraphNode(new CoreLabel(localRoot));
    EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(subGraphDependencies, rootTGN);
    return SemanticGraphFactory.generateUncollapsedDependencies(gs);
}
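
A hypothetical driver for getSubgraph, assuming MinIE's CoreNLPUtils is on the classpath and the depparse annotator produced the sentence graph (the sentence text and the choice of root are illustrative only):

Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, depparse");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
Annotation doc = new Annotation("The quick brown fox jumps over the lazy dog.");
pipeline.annotate(doc);
CoreMap sentence = doc.get(CoreAnnotations.SentencesAnnotation.class).get(0);
SemanticGraph sg = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
// using the sentence root here returns the full graph; pass any other node to get a proper subgraph
IndexedWord localRoot = sg.getFirstRoot();
SemanticGraph subgraph = CoreNLPUtils.getSubgraph(sg, localRoot);
System.out.println(subgraph);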
 
Example 29
Source Project: phrasal   Source File: CRFPostprocessor.java    License: GNU General Public License v3.0
/**
 * Train a model given a preprocessor.
 * 
 * @param preProcessor the preprocessor that supplies the training alignments
 */
protected void train(Preprocessor preProcessor) {
  DocumentReaderAndWriter<CoreLabel> docReader = 
      new ProcessorTools.PostprocessorDocumentReaderAndWriter(preProcessor);
  ObjectBank<List<CoreLabel>> lines =
    classifier.makeObjectBankFromFile(flags.trainFile, docReader);

  classifier.train(lines, docReader);
  System.err.println("Finished training.");
}
 
Example 30
Source Project: minie   Source File: CoreNLPUtils.java    License: GNU General Public License v3.0
/**
 * Given a list of words (as core maps), return the phrase of words as a list of indexed word objects
 * @param words: list of words (e.g. [She, is, pretty])
 * @return list of words (as IndexedWord)
 */
public static ObjectArrayList<IndexedWord> listOfCoreMapWordsToIndexedWordList(List<CoreMap> cmList){
    ObjectArrayList<IndexedWord> wordList = new ObjectArrayList<>();
    for (CoreMap cm: cmList){
        wordList.add(new IndexedWord(new CoreLabel(cm)));
    }
    return wordList;
}