edu.stanford.nlp.process.Tokenizer Java Examples

The following examples show how to use edu.stanford.nlp.process.Tokenizer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TypeClassifier.java    From winter with Apache License 2.0 6 votes vote down vote up
/**
 * Prepares the annotation pipeline used to detect date columns.
 * <p>
 * Four annotators are registered on {@code pipeline}, in this order: a
 * tokenizer annotator, a words-to-sentences annotator, a POS tagger
 * annotator and a time annotator named {@code "sutime"}.
 */
public void initialize() {
	// Tokenizer annotator whose tokenizer is always a PTBTokenizer that
	// produces CoreLabel tokens with an empty options string.
	TokenizerAnnotator tokenizerAnnotator = new TokenizerAnnotator(false) {

		@Override
		public Tokenizer<CoreLabel> getTokenizer(Reader r) {
			CoreLabelTokenFactory tokenFactory = new CoreLabelTokenFactory();
			return new PTBTokenizer<CoreLabel>(r, tokenFactory, "");
		}

	};
	pipeline.addAnnotator(tokenizerAnnotator);
	pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
	pipeline.addAnnotator(new POSTaggerAnnotator(false));

	// The time annotator is configured with an empty property set.
	Properties timeProperties = new Properties();
	pipeline.addAnnotator(new TimeAnnotator("sutime", timeProperties));
}
 
Example #2
Source File: StanfordParser.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Tokenizes and parses a sentence, then reports the parser's PCFG score for
 * it as a confidence measure.
 * 
 * @param sentence
 *            the sentence to parse
 * @return the PCFG score reported by the parser after parsing
 * @throws RuntimeException
 *             if the language pack or the parser has not been initialized
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence)
{
    if (tlp == null || parser == null)
    {
        throw new RuntimeException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");

    // tokenizing, parsing and reading the score happen under the parser's
    // monitor so that the score belongs to the sentence parsed here
    synchronized (parser)
    {
        Tokenizer sentenceTokenizer = tlp.getTokenizerFactory()
            .getTokenizer(new StringReader(sentence));
        List<Word> tokens = sentenceTokenizer.tokenize();
        log.debug("Tokenization: " + tokens);

        parser.parse(new Sentence(tokens));
        return parser.getPCFGScore();
    }
}
 
Example #3
Source File: StanfordLexicalDemo.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Demo entry point: loads the English PCFG model, parses one sentence twice
 * (once from pre-split tokens, once from raw text run through a
 * PTBTokenizer) and prints the resulting tree and typed dependencies.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String args[]){
    final String modelPath = getResourcePath() + "englishPCFG.ser.gz";
    final LexicalizedParser parser = LexicalizedParser.loadModel(modelPath);

    // First pass: the sentence is supplied as already separated tokens.
    final String[] preTokenized = {"The", "cow", "jumped", "over", "the", "moon", "."};
    Tree tree = parser.apply(SentenceUtils.toCoreLabelList(preTokenized));
    tree.pennPrint();
    new TreePrint("typedDependenciesCollapsed").printTree(tree);

    // Second pass: raw text is tokenized explicitly before parsing.
    final String rawText = "The cow jumped over the moon.";
    final Tokenizer<CoreLabel> tokenizer = PTBTokenizer
        .factory(new CoreLabelTokenFactory(), "")
        .getTokenizer(new StringReader(rawText));
    tree = parser.apply(tokenizer.tokenize());

    // Derive the typed dependencies from the second parse and print them.
    final GrammaticalStructure structure = parser.treebankLanguagePack()
        .grammaticalStructureFactory()
        .newGrammaticalStructure(tree);
    final List<TypedDependency> dependencies = structure.typedDependenciesCCprocessed();
    System.out.println(dependencies);

    for (TypedDependency td : dependencies) {
        final String line = "Governor Word: [" + td.gov()
            + "] Relation: [" + td.reln().getLongName()
            + "] Dependent Word: [" + td.dep() + "]";
        System.out.println(line);
    }
}
 
Example #4
Source File: CoreNLPPreprocessor.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Trims and uncases the input, tokenizes it, and returns the token texts as
 * a sequence. When a compound splitter is configured, the sequence is passed
 * through it before being returned.
 *
 * @param input the raw input string
 * @return the tokenized (and, if configured, compound-split) sequence
 */
@Override
public Sequence<IString> process(String input) {
  final String uncased = toUncased(input.trim());
  final Tokenizer<CoreLabel> tokens = tf.getTokenizer(new StringReader(uncased));

  // Collect the text of each token in the order the tokenizer yields them.
  final List<String> tokenTexts = new ArrayList<>();
  while (tokens.hasNext()) {
    tokenTexts.add(tokens.next().get(TextAnnotation.class));
  }

  final Sequence<IString> sequence = IStrings.toIStringSequence(tokenTexts);
  if (compoundSplitter == null) {
    return sequence;
  }
  return compoundSplitter.process(sequence);
}
 
Example #5
Source File: StanfordParser.java    From OpenEphyra with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses a sentence and returns a string representation of the parse tree.
 * <p>
 * The result is the parser's best parse rendered with {@code toString()},
 * after removing every substring made of a single space followed by a
 * bracket-enclosed run of non-whitespace characters (for example
 * {@code " [abc]"}).
 * 
 * @param sentence
 *            a sentence
 * @return string form of the best parse tree with the bracketed
 *         annotations stripped
 * @throws RuntimeException
 *             if the language pack or the parser has not been initialized
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence)
{
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce stanford Tree
    log.debug("Parsing sentence");
    Tree tree;
    // tokenizing, parsing and fetching the best parse happen under the
    // parser's monitor so that the tree belongs to the sentence parsed here
    synchronized (parser)
    {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
            new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        tree = parser.getBestParse();
    }

    // drop " [...]" annotations from the printed tree
    return tree.toString().replaceAll(" \\[[\\S]+\\]", "");
}
 
Example #6
Source File: Chapter7.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 4 votes vote down vote up
/**
 * Demonstrates two ways of feeding the Stanford lexicalized parser: with a
 * list of already tokenized words, and with raw text that is first run
 * through an explicit PTBTokenizer. The second parse is additionally turned
 * into typed dependencies, which are printed one per line.
 * <p>
 * NOTE: the model is loaded from a hard-coded absolute path.
 */
private static void usingStanfordLexicalizedParser() {
    final String modelLocation = "C:/Current Books in Progress/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    final LexicalizedParser parser = LexicalizedParser.loadModel(modelLocation);

    // Option 1: parse a list of correctly tokenized words.
    System.out.println("---First option");
    final String[] sentenceTokens = {"The", "cow", "jumped", "over", "the", "moon", "."};
    final List<CoreLabel> preTokenized = Sentence.toCoreLabelList(sentenceTokens);
    Tree tree = parser.apply(preTokenized);
    tree.pennPrint();
    System.out.println();

    // Option 2: create a tokenizer explicitly and parse its output.
    System.out.println("---Second option");
    final String rawSentence = "The cow jumped over the moon.";
    final TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    final Tokenizer<CoreLabel> tokenizer = factory.getTokenizer(new StringReader(rawSentence));
    final List<CoreLabel> tokenized = tokenizer.tokenize();
    tree = parser.apply(tokenized);

    // Build the grammatical structure via the parser's own language pack.
    final TreebankLanguagePack languagePack = parser.treebankLanguagePack();
    final GrammaticalStructureFactory structureFactory = languagePack.grammaticalStructureFactory();
    final GrammaticalStructure structure = structureFactory.newGrammaticalStructure(tree);
    final List<TypedDependency> dependencies = structure.typedDependenciesCCprocessed();
    System.out.println(dependencies);

    for (TypedDependency td : dependencies) {
        final String line = "Governor Word: [" + td.gov()
                + "] Relation: [" + td.reln().getLongName()
                + "] Dependent Word: [" + td.dep() + "]";
        System.out.println(line);
    }
    System.out.println();

    // A TreePrint object (e.g. new TreePrint("penn,typedDependenciesCollapsed"))
    // can also be used to print trees and dependencies; the available formats
    // are listed in TreePrint.outputTreeFormats.
}