edu.stanford.nlp.ling.HasWord Java Examples

The following examples show how to use edu.stanford.nlp.ling.HasWord. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CoreNLPAnnotator.java    From Stargraph with MIT License 6 votes vote down vote up
@Override
protected List<Word> doRun(Language language, String sentence) {
    // Lazily build and cache one POS tagger per language; only English is supported.
    MaxentTagger tagger = taggers.computeIfAbsent(language, lang -> {
        if (lang != EN) {
            throw new UnsupportedLanguageException(lang);
        }
        return new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
    });

    PartOfSpeechSet posSet = PartOfSpeechSet.getPOSSet(language);
    List<Word> result = new ArrayList<>();

    // Sentence-split the input, tag each sentence, and collect (POS, token) words.
    MaxentTagger.tokenizeText(new StringReader(sentence))
            .forEach(tokens -> tagger.tagSentence(tokens)
                    .forEach(tw -> result.add(new Word(posSet.valueOf(tw.tag()), tw.value()))));

    return result;
}
 
Example #2
Source File: DocumentFrequencyCounter.java    From wiseowl with MIT License 6 votes vote down vote up
/**
 * Counts noun-token occurrences in the given document string (one
 * document's contribution to an IDF map).
 *
 * @param document raw document text
 * @return counter mapping each noun token to its occurrence count
 */
private static Counter<String> getIDFMapForDocument(String document) {
  // Strip Gigaword heading separators up front; they slow tokenization
  // and contribute nothing useful.
  String cleaned = headingSeparator.matcher(document).replaceAll("");

  DocumentPreprocessor sentences = new DocumentPreprocessor(new StringReader(cleaned));
  sentences.setTokenizerFactory(tokenizerFactory);

  Counter<String> idfMap = new ClassicCounter<String>();
  for (List<HasWord> sentence : sentences) {
    // Skip overlong "sentences" entirely (likely junk, and slow to tag).
    if (sentence.size() <= MAX_SENTENCE_LENGTH) {
      for (TaggedWord tagged : tagger.tagSentence(sentence)) {
        // Count only tokens whose tag starts with "n".
        if (tagged.tag().startsWith("n")) {
          idfMap.incrementCount(tagged.word());
        }
      }
    }
  }

  return idfMap;
}
 
Example #3
Source File: CorenlpPipeline.java    From datashare with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Part-of-Speech classification (Maximum entropy) only: tags every token
 * of the input and records one {@code POS} annotation span per token.
 *
 * @param input    the string to annotate
 * @param hash     the input hash code
 * @param language the input language
 * @return the annotations (token spans tagged {@code POS}) for the input
 * @throws InterruptedException presumably propagated from model loading — TODO confirm
 */
private Annotations processPosClassifier(String input, String hash, Language language) throws InterruptedException {
    Annotations annotations = new Annotations(hash, getType(), language);
    LOGGER.info("POS-tagging for " + language.toString());

    // Look up the per-language tagger, then split input into sentences.
    final CoreNlpAnnotator<MaxentTagger> nlpAnnotator;
    nlpAnnotator = CoreNlpPosModels.getInstance().get(language);
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(input));
    for (List<HasWord> sentence : sentences) {
        // Tag the sentence with parts-of-speech.
        List<TaggedWord> taggedSentence = nlpAnnotator.annotator.tagSentence(sentence);
        // Feed annotations: only character spans are stored, not the tag itself.
        for (TaggedWord word : taggedSentence) {
            int begin = word.beginPosition();
            int end = word.endPosition();
            String pos = word.tag(); // NOTE: computed but unused — the POS value is deliberately not stored
            annotations.add(POS, begin, end);
        }
    }
    return annotations;
}
 
Example #4
Source File: StanfordPosTagger.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Splits the sentence into individual tokens.
 * 
 * @param sentence Input sentence
 * @return Array of tokens
 */
public static String[] tokenize(String sentence) {
	List sentences = MaxentTagger.tokenizeText(new StringReader(sentence));

	List<String> tokens = new ArrayList<String>();

	// Flatten every tokenized sentence into a single token list.
	for (int si = 0; si < sentences.size(); si++) {
		Sentence sent = (Sentence) sentences.get(si);
		for (int wi = 0; wi < sent.length(); wi++) {
			tokens.add(sent.getHasWord(wi).word());
		}
	}

	return tokens.toArray(new String[tokens.size()]);
}
 
Example #5
Source File: StanfordPosTagger.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Tags the tokens with part of speech.
 * 
 * @param tokens Array of token strings
 * @return Part of speech tags
 */
public static String[] tagPos(String[] tokens) {
	Sentence tagged = MaxentTagger.tagSentence(createSentence(tokens));

	String[] pos = new String[tagged.size()];
	for (int i = 0; i < pos.length; i++) {
		// Tagged tokens print as "word/TAG"; take the segment after the
		// last slash (the word itself may contain slashes).
		String[] parts = ((HasWord) tagged.get(i)).toString().split("/");
		pos[i] = parts.length > 1 ? parts[parts.length - 1] : "";
	}

	return pos;
}
 
Example #6
Source File: TokenizerDemo.java    From blog-codes with Apache License 2.0 5 votes vote down vote up
/**
 * Demonstrates two tokenization styles on each file named in {@code args}:
 * sentence-at-a-time via DocumentPreprocessor, then token-at-a-time via
 * PTBTokenizer.
 *
 * @param args paths of text files to tokenize
 * @throws IOException if a file cannot be read
 */
public static void main(String[] args) throws IOException {
	for (String arg : args) {
		// option #1: By sentence.
		DocumentPreprocessor dp = new DocumentPreprocessor(arg);
		for (List<HasWord> sentence : dp) {
			System.out.println(sentence);
		}
		// option #2: By token. Close the reader when done — the original
		// version leaked the FileReader.
		try (FileReader reader = new FileReader(arg)) {
			PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(reader, new CoreLabelTokenFactory(), "");
			while (ptbt.hasNext()) {
				CoreLabel label = ptbt.next();
				System.out.println(label);
			}
		}
	}
}
 
Example #7
Source File: Chapter5.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Tags each sentence of "sentences.txt" with a Maxent POS model and prints
 * the tagged sentences. The reader is now closed via try-with-resources
 * (the original leaked it), and the catch is widened to IOException because
 * close() can throw beyond FileNotFoundException.
 */
private static void usingStanfordMaxentPOS() {
    try (java.io.BufferedReader reader =
            new java.io.BufferedReader(new FileReader("sentences.txt"))) {
        // WSJ bidirectional model; a Twitter model (gate-EN-twitter.model)
        // can be substituted to tag informal text.
        MaxentTagger tagger = new MaxentTagger(getModelDir() + "//wsj-0-18-bidirectional-distsim.tagger");
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(reader);
        for (List<HasWord> sentence : sentences) {
            List<TaggedWord> taggedSentence = tagger.tagSentence(sentence);
            // Simple display; alternatives:
            //   Sentence.listToString(taggedSentence, false)        — plain String
            //   taggedWord.word() + "/" + taggedWord.tag()          — word/tag pairs
            //   filter on taggedWord.tag().startsWith("NN")         — nouns only
            System.out.println("---" + taggedSentence);
        }
    } catch (java.io.IOException ex) {
        ex.printStackTrace();
    }
}
 
Example #8
Source File: CoreNLP.java    From Criteria2Query with Apache License 2.0 5 votes vote down vote up
/**
 * Splits a paragraph into its individual sentences.
 *
 * @param paragraph text possibly containing several sentences
 * @return the sentences, each rendered back as a plain string
 */
public List<String> splitParagraph(String paragraph){
	List<String> sentences = new ArrayList<String>();
	DocumentPreprocessor splitter = new DocumentPreprocessor(new StringReader(paragraph));
	for (List<HasWord> tokens : splitter) {
		sentences.add(SentenceUtils.listToString(tokens));
	}
	return sentences;
}
 
Example #9
Source File: StanfordPOSTagger.java    From jatecs with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Tokenizes the input into sentences and POS-tags each one.
 *
 * @param input raw text
 * @return one list of tagged words per detected sentence
 */
public Vector<ArrayList<TaggedWord>> tag(String input) {
    Vector<ArrayList<TaggedWord>> tagged = new Vector<ArrayList<TaggedWord>>();
    BufferedReader reader = new BufferedReader(new StringReader(input));
    for (List<? extends HasWord> sentence : MaxentTagger.tokenizeText(reader)) {
        tagged.add(tagger.tagSentence(sentence));
    }
    return tagged;
}
 
Example #10
Source File: ParseTree.java    From NLIDB with Apache License 2.0 5 votes vote down vote up
/**
 * Construct a parse tree using the stanford NLP parser. Only one sentence.
 * Here we are omitting the information of dependency labels (tags).
 * @param text input text.
 * @param parser wrapper exposing both a POS tagger and a dependency parser.
 */
public ParseTree(String text, NLParser parser) {
	// pre-processing the input text
	DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
	// Keep only the first sentence produced by the splitter.
	// NOTE(review): if the text yields no sentences, `sentence` stays null
	// and tagSentence below will NPE — confirm callers pass non-empty text.
	List<HasWord> sentence = null;
	for (List<HasWord> sentenceHasWord : tokenizer) {
		sentence = sentenceHasWord;
		break;
	}
	// part-of-speech tagging
	List<TaggedWord> tagged = parser.tagger.tagSentence(sentence);
	// dependency syntax parsing
	GrammaticalStructure gs = parser.parser.predict(tagged);
	
	// Reading the parsed sentence into ParseTree.
	// Node 0 is an artificial ROOT; word i becomes node i+1, matching the
	// 1-based indices reported by gov()/dep() below.
	int N = sentence.size()+1;
	Node[] nodes = new Node[N];
	root = new Node(0, "ROOT", "ROOT");
	nodes[0] = root;
	for (int i = 0; i < N-1; i++) {
		nodes[i+1] = new Node(i+1, 
				sentence.get(i).word(), tagged.get(i).tag());
	}
	// Wire parent/child links from the typed dependencies (labels dropped).
	for (TypedDependency typedDep : gs.allTypedDependencies()) {
		int from = typedDep.gov().index();
		int to   = typedDep.dep().index();
		// String label = typedDep.reln().getShortName(); // omitting the label
		nodes[to].parent = nodes[from];
		nodes[from].children.add(nodes[to]);
	}
}
 
Example #11
Source File: ParserDemo.java    From NLIDB with Apache License 2.0 5 votes vote down vote up
/**
 * Demo: POS-tags and dependency-parses a fixed sentence, logging the
 * resulting grammatical structure. Optional flags "-tagger <path>" and
 * "-com.dukenlidb.nlidb.model <path>" override the default model paths.
 */
public static void main(String[] args) {
	String modelPath = DependencyParser.DEFAULT_MODEL;
	String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

	// Consume option/value pairs; unknown options abort immediately.
	int argIndex = 0;
	while (argIndex < args.length) {
		String option = args[argIndex];
		if ("-tagger".equals(option)) {
			taggerPath = args[argIndex + 1];
		} else if ("-com.dukenlidb.nlidb.model".equals(option)) {
			modelPath = args[argIndex + 1];
		} else {
			throw new RuntimeException("Unknown argument " + option);
		}
		argIndex += 2;
	}

	String text = "Return authors who have more papers than Bob in VLDB after 2000";

	MaxentTagger tagger = new MaxentTagger(taggerPath);
	DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

	DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
	for (List<HasWord> sentence : tokenizer) {
		// POS-tag, dependency-parse, then log the typed dependencies.
		GrammaticalStructure gs = parser.predict(tagger.tagSentence(sentence));
		log.info(gs);
	}
}
 
Example #12
Source File: StanfordPosTagger.java    From OpenEphyra with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Combines the tokens into a <code>Sentence</code>.
 * 
 * @param tokens the token strings
 * @return <code>Sentence</code> made of the tokens
 */
@SuppressWarnings("unchecked")
private static Sentence createSentence(String[] tokens) {
	// Wrap each raw token in a Word before handing the list to Sentence.
	ArrayList<HasWord> words = new ArrayList<HasWord>();
	for (String token : tokens) {
		words.add(new Word(token));
	}

	Sentence sentence = new Sentence();
	sentence.setWords(words);
	return sentence;
}
 
Example #13
Source File: StanfordPOSTagger.java    From ADW with GNU General Public License v3.0 5 votes vote down vote up
/**
 * POS-tags a single tokenized sentence.
 *
 * @param sentence the tokens of one sentence; may be null or empty
 * @return the tagged words; an empty mutable list for null/empty input
 */
public List<TaggedWord> tag(List<? extends HasWord> sentence)
{
	// Guard: null or empty input yields an empty, mutable list
	// (isEmpty() is the idiomatic form of size() == 0).
	if (sentence == null || sentence.isEmpty())
		return new ArrayList<TaggedWord>();

	return tagger.tagSentence(sentence);
}
 
Example #14
Source File: TaggerDemo.java    From blog-codes with Apache License 2.0 4 votes vote down vote up
/**
 * Demo: loads the default tagger model from the classpath and prints the
 * POS-tagged form of a hard-coded sentence.
 *
 * @param args unused
 * @throws Exception if the model cannot be loaded
 */
public static void main(String[] args) throws Exception { 
	// Close the model stream once the tagger is constructed — the
	// original version leaked it.
	MaxentTagger tagger;
	try (InputStream input =
			TaggerDemo.class.getResourceAsStream("/" + MaxentTagger.DEFAULT_JAR_PATH)) {
		tagger = new MaxentTagger(input);
	}

	List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader("Karma of humans is AI"));
	for (List<HasWord> sentence : sentences) {
		List<TaggedWord> tSentence = tagger.tagSentence(sentence);
		System.out.println(SentenceUtils.listToString(tSentence, false));
	}
}
 
Example #15
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 4 votes vote down vote up
/**
 * Demonstrates three Stanford tokenization approaches on {@code paragraph}:
 * PTBTokenizer (token level, with offsets), DocumentPreprocessor (sentence
 * level), and the full CoreNLP pipeline (tokenize + ssplit annotators).
 * PTBTokenizer is now parameterized as {@code PTBTokenizer<CoreLabel>},
 * removing the raw type and the unchecked cast of the original.
 */
private static void usingTheStanfordTokenizer() {

    // --- PTBTokenizer: token-at-a-time with character offsets ---
    System.out.println("----PTBTokenizer Example");
    CoreLabelTokenFactory ctf = new CoreLabelTokenFactory();
    // "invertible=true" preserves the original text and offsets, which is
    // what makes originalText()/beginPosition()/endPosition() meaningful.
    PTBTokenizer<CoreLabel> ptb =
            new PTBTokenizer<>(new StringReader(paragraph), ctf, "invertible=true");
    while (ptb.hasNext()) {
        CoreLabel cl = ptb.next();
        System.out.println(cl.originalText() + " ("
                + cl.beginPosition() + "-" + cl.endPosition() + ")");
    }

    // --- DocumentPreprocessor: sentence splitting plus tokenization ---
    System.out.println("----DocumentPreprocessor Example");
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(reader);
    for (List<HasWord> sentence : documentPreprocessor) {
        for (HasWord token : sentence) {
            System.out.println(token);
        }
    }

    // --- Full CoreNLP pipeline: tokenize and sentence-split annotators ---
    System.out.println("----pipeline Example");
    Properties properties = new Properties();
    properties.put("annotators", "tokenize, ssplit");

    StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);
    Annotation annotation = new Annotation(paragraph);

    pipeline.annotate(annotation);
    pipeline.prettyPrint(annotation, System.out);
}
 
Example #16
Source File: StanfordPOSTagger.java    From ADW with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Tokenizes a whitespace-separated sentence and POS-tags it.
 *
 * @param sentence a sentence whose tokens are separated by whitespace
 * @return the tagged words
 */
public List<TaggedWord> tag(String sentence)
{
	// Split on runs of whitespace, wrap the pieces as word tokens, then
	// delegate to the list-based tag(...) overload.
	String[] rawTokens = sentence.split("\\s+");
	return tag(Sentence.toWordList(rawTokens));
}