edu.stanford.nlp.tagger.maxent.MaxentTagger Java Examples

The following examples show how to use edu.stanford.nlp.tagger.maxent.MaxentTagger. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CoreNLPAnnotator.java    From Stargraph with MIT License 6 votes vote down vote up
/**
 * Tokenizes {@code sentence}, POS-tags every resulting sentence and converts each tagged
 * token into a {@link Word}.
 *
 * <p>The tagger for {@code language} is taken from {@code taggers} and created on first use.
 * Only {@code EN} has a model here; any other language raises
 * {@code UnsupportedLanguageException}.
 *
 * @param language language of the text, used to pick the tagger and the part-of-speech set
 * @param sentence raw text to tokenize and tag
 * @return the words of all tokenized sentences, in order
 */
@Override
protected List<Word> doRun(Language language, String sentence) {
    final MaxentTagger posTagger = taggers.computeIfAbsent(language, requested -> {
        if (requested != EN) {
            throw new UnsupportedLanguageException(requested);
        }
        return new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
    });

    final PartOfSpeechSet tagSet = PartOfSpeechSet.getPOSSet(language);
    final List<Word> result = new ArrayList<>();

    // The input may be split into several sentences; results are flattened into one list.
    for (List<HasWord> tokens : MaxentTagger.tokenizeText(new StringReader(sentence))) {
        posTagger.tagSentence(tokens).forEach(tagged ->
                result.add(new Word(tagSet.valueOf(tagged.tag()), tagged.value())));
    }

    return result;
}
 
Example #2
Source File: StanfordPosTagger.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Assigns a part-of-speech tag to each of the given tokens.
 * <p>
 * Each tag is read from the string form of the tagged word: it is the text
 * after the last {@code '/'}. If splitting on {@code '/'} yields a single
 * part, the tag is the empty string.
 * 
 * @param tokens Array of token strings
 * @return Part of speech tags, one per tagged word
 */
public static String[] tagPos(String[] tokens) {
	Sentence taggedSentence = MaxentTagger.tagSentence(createSentence(tokens));
	
	int count = taggedSentence.size();
	String[] tags = new String[count];
	for (int idx = 0; idx < count; idx++) {
		HasWord taggedWord = (HasWord) taggedSentence.get(idx);
		String[] parts = taggedWord.toString().split("/");
		// The tag is the last '/'-separated part; a single part means no tag was found.
		tags[idx] = parts.length > 1 ? parts[parts.length - 1] : "";
	}
	
	return tags;
}
 
Example #3
Source File: StanfordPosTagger.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Splits the sentence into individual tokens.
 * <p>
 * The text may be broken into several sentences by the tokenizer; their
 * words are returned as one flat array in reading order.
 * 
 * @param sentence Input sentence
 * @return Array of tokens
 */
public static String[] tokenize(String sentence) {
	List sentences = MaxentTagger.tokenizeText(new StringReader(sentence));
	List<String> words = new ArrayList<String>();
	
	for (Object element : sentences) {
		Sentence current = (Sentence) element;
		int length = current.length();
		
		for (int pos = 0; pos < length; pos++) {
			HasWord token = current.getHasWord(pos);
			words.add(token.word());
		}
	}
	
	return words.toArray(new String[words.size()]);
}
 
Example #4
Source File: LexiconPreprocessor.java    From sentiment-analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Applies the common per-token clean-up steps to {@code str} and hands the
 * cleaned text to {@code getPOS}.
 *
 * @param str    raw text, split on whitespace into tokens
 * @param tagger POS tagger passed through to {@code getPOS}
 * @return the values produced by {@code getPOS} for the cleaned text
 */
public double[] getProcessed(String str, MaxentTagger tagger){
	StringBuilder cleaned = new StringBuilder();
	StringTokenizer words = new StringTokenizer(str);
	while (words.hasMoreTokens()) {
		String word = words.nextToken();
		word = replaceEmoticons(word);           // emoticons become "happy"/"sad"
		word = replaceTwitterFeatures(word);     // i.e. links, mentions, hash-tags
		word = replaceConsecutiveLetters(word);  // more than 2 repeated letters collapse to 2
		word = replaceNegation(word);            // negation words become "not"
		word = replaceAbbreviations(word);       // abbreviations are expanded
		// Everything that is not an ASCII letter is blanked out.
		word = word.replaceAll("[^A-Za-z]", " ");
		cleaned.append(' ').append(word);
	}
	return getPOS(cleaned.toString(), tagger);
}
 
Example #5
Source File: CorenlpPipeline.java    From datashare with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Part-of-Speech classification (maximum entropy) only.
 *
 * <p>Tokenizes {@code input} into sentences, tags each sentence, and records one
 * {@code POS} annotation per tagged word using the word's begin and end positions.
 * The tag value itself is not stored in the annotations.
 *
 * @param input    the string to annotate
 * @param hash     the input hash code
 * @param language the input language
 * @return the annotations collected for {@code input}
 * @throws InterruptedException declared by the signature; presumably propagated from
 *                              obtaining the language model (confirm against
 *                              {@code CoreNlpPosModels#get})
 */
private Annotations processPosClassifier(String input, String hash, Language language) throws InterruptedException {
    Annotations annotations = new Annotations(hash, getType(), language);
    LOGGER.info("POS-tagging for " + language.toString());

    final CoreNlpAnnotator<MaxentTagger> nlpAnnotator = CoreNlpPosModels.getInstance().get(language);

    // Split input into sentences
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(input));
    for (List<HasWord> sentence : sentences) {
        // Tag the sentence with parts-of-speech
        List<TaggedWord> taggedSentence = nlpAnnotator.annotator.tagSentence(sentence);
        // Feed annotations: only the span is recorded, the tag value is intentionally unused
        for (TaggedWord word : taggedSentence) {
            annotations.add(POS, word.beginPosition(), word.endPosition());
        }
    }
    return annotations;
}
 
Example #6
Source File: ComplexPreprocessor.java    From sentiment-analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Tags {@code sample} and rebuilds the tagged text token by token.
 * <p>
 * Leading/trailing whitespace is trimmed and runs of spaces are collapsed
 * before tagging. Each tagged token is reduced to letters, digits and
 * underscores and followed by a space; afterwards one mark is appended for
 * each of {@code . ! , ?} that the raw token contained, in that order.
 *
 * @param sample text to tag
 * @param tagger POS tagger used to tag the text
 * @return the rebuilt tagged string
 */
private String getPOS(String sample, MaxentTagger tagger){
	String normalized = sample.trim().replaceAll(" +", " ");
	StringTokenizer tokens = new StringTokenizer(tagger.tagString(normalized));
	
	String[] marks = {".", "!", ",", "?"};
	StringBuilder result = new StringBuilder();
	while (tokens.hasMoreTokens()) {
		String raw = tokens.nextToken();
		result.append(raw.replaceAll("[^A-Za-z_0-9]", "")).append(' ');
		// Punctuation stripped above is re-attached after the separating space.
		for (String mark : marks) {
			if (raw.contains(mark)) {
				result.append(mark);
			}
		}
	}
	return result.toString();
}
 
Example #7
Source File: AMRServices.java    From amr with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Stores the given configuration and builds the helper components.
 * <p>
 * The lemmatizer is a union of a WordNet lemmatizer and a fallback that
 * returns the lower-cased word. The tagger and the PropBank resource are
 * optional: each is left {@code null} when its file argument is {@code null}.
 *
 * @throws IOException declared by the signature; presumably raised while
 *             loading one of the resources (confirm against the constructors
 *             called here)
 */
private AMRServices(String skolemPredicateBaseName, Type textType,
		String refPredicateBaseName, SpecificationMapping mapping,
		File stanfordModelFile, String opPredicatePrefix,
		LogicalConstant dummyEntity, LogicalConstant nameInstancePredicate,
		Type typingPredicateType, IllinoisNERWrapper namedEntityRecognizer,
		File propBankDir) throws IOException {
	this.opPredicatePrefix = opPredicatePrefix;
	this.dummyEntity = dummyEntity;
	this.nameInstancePredicate = nameInstancePredicate;
	this.typingPredicateType = typingPredicateType;
	this.namedEntityRecognizer = namedEntityRecognizer;

	// Add a lemmatizer that simply returns the lower-cased word.
	this.lemmatizer = new UnionLemmatizer(new WordNetLemmatizer(),
			word -> SetUtils.createSingleton(word.toLowerCase()));

	this.skolemPredicateBaseName = skolemPredicateBaseName;
	this.textType = textType;
	this.refPredicateBaseName = refPredicateBaseName;
	this.mapping = mapping;

	// The POS tagger is only created when a model file was supplied.
	if (stanfordModelFile == null) {
		this.tagger = null;
	} else {
		this.tagger = new MaxentTagger(stanfordModelFile.getAbsolutePath());
	}

	// Likewise, PropBank is only loaded when a directory was supplied.
	if (propBankDir == null) {
		this.propBank = null;
	} else {
		this.propBank = new PropBank(propBankDir);
	}
}
 
Example #8
Source File: Chapter5.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Demonstrates the Stanford maximum-entropy POS tagger: reads {@code sentences.txt},
 * tokenizes it into sentences, tags each sentence and prints the tagged result.
 * I/O failures (including a missing input file) are reported via the stack trace.
 */
private static void usingStanfordMaxentPOS() {
    MaxentTagger tagger = new MaxentTagger(getModelDir() + "//wsj-0-18-bidirectional-distsim.tagger");
//    MaxentTagger tagger = new MaxentTagger(getModelDir() + "//gate-EN-twitter.model");
//    System.out.println(tagger.tagString("AFAIK she H8 cth!"));
//    System.out.println(tagger.tagString("BTW had a GR8 tym at the party BBIAM."));

    // try-with-resources guarantees the file handle is released on every path,
    // instead of relying on the tokenizer to close the reader.
    try (BufferedReader reader = new BufferedReader(new FileReader("sentences.txt"))) {
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(reader);
        for (List<HasWord> sentence : sentences) {
            List<TaggedWord> taggedSentence = tagger.tagSentence(sentence);
            // Simple display
            System.out.println("---" + taggedSentence);
            // Simple conversion to String
//            System.out.println(Sentence.listToString(taggedSentence, false));
            // Display of words and tags
//            for (TaggedWord taggedWord : taggedSentence) {
//                System.out.print(taggedWord.word() + "/" + taggedWord.tag() + " ");
//            }
//            System.out.println();
            // List of specific tags
//            System.out.print("NN Tagged: ");
//            for (TaggedWord taggedWord : taggedSentence) {
//                if (taggedWord.tag().startsWith("NN")) {
//                    System.out.print(taggedWord.word() + " ");
//                }
//            }
//            System.out.println();
        }
    } catch (java.io.IOException ex) {
        // Covers FileNotFoundException from opening the file as well as failures on close.
        ex.printStackTrace();
    }
}
 
Example #9
Source File: StanfordPOSTagger.java    From ADW with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Creates the tagger from the model configured in {@code ADWConfiguration}.
 *
 * @throws RuntimeException if the model cannot be loaded; the original
 *             exception is attached as the cause
 */
private StanfordPOSTagger()
{
	try
	{
		this.tagger =
			new MaxentTagger(ADWConfiguration.getInstance().getStanfordPOSModel());
	}
	catch (Exception e)
	{
		// Keep the original exception as the cause so its stack trace is not lost.
		throw new RuntimeException("Cannot init: " + e, e);
	}
}
 
Example #10
Source File: StanfordPosTagger.java    From OpenEphyra with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Initializes the POS Tagger by constructing a {@code MaxentTagger} from the
 * given model. The constructed instance is not kept by this method;
 * presumably the constructor sets up shared tagger state (confirm against
 * the tagger version in use).
 * 
 * @param model model file
 * @return true, iff the POS tagger was initialized successfully
 */
public static boolean init(String model) {
	boolean initialized;
	try {
		new MaxentTagger(model);
		initialized = true;
	} catch (Exception e) {
		// Any failure while loading the model is reported as "not initialized".
		initialized = false;
	}
	
	return initialized;
}
 
Example #11
Source File: ComplexPreprocessor.java    From sentiment-analysis with Apache License 2.0 5 votes vote down vote up
/**
 * Produces the POS-tagged form of {@code str} by delegating to {@code getPOS}.
 * <p>
 * Previously the input was ignored and an empty string was always tagged;
 * the given text is now the one that gets tagged. A {@code null} input is
 * treated as empty text, as before.
 *
 * @param str    text to process
 * @param tagger POS tagger passed through to {@code getPOS}
 * @return the result of {@code getPOS} for the text
 */
public String getProcessed(String str, MaxentTagger tagger){
	String text = (str == null) ? "" : str;
	return getPOS(text, tagger);
}
 
Example #12
Source File: TweetPreprocessor.java    From sentiment-analysis with Apache License 2.0 5 votes vote down vote up
public TweetPreprocessor(String t){
	maxid = 0;
	main_folder = t;
	tp = new TextPreprocessor(main_folder);
	cp = new ComplexPreprocessor();
	fp = new FeaturePreprocessor(main_folder);
	tagger = new MaxentTagger(main_folder+"datasets/gate-EN-twitter.model");
	try {
		lp = new LexiconPreprocessor(main_folder);
	} catch (IOException e) {
		e.printStackTrace();
	}
}
 
Example #13
Source File: PrefixTagger.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Command-line entry point: tags the given input file with a prefix tagger.
 *
 * @param args {@code args[0]} is the input file; optional {@code args[1]} is the
 *             model file, defaulting to {@code MaxentTagger.DEFAULT_NLP_GROUP_MODEL_PATH}
 * @throws Exception if loading the model or tagging the file fails
 */
public static void main(String[] args) throws Exception {
  if (args.length < 1) {
    System.err
    .println("Usage: java edu.stanford.nlp.tagger.maxent.PrefixTagger (input-file) (model - optional) ");
    System.exit(1);
  }

  final String inputFile = args[0];
  // A second argument overrides the default model.
  final String modelFile = args.length > 1 ? args[1] : MaxentTagger.DEFAULT_NLP_GROUP_MODEL_PATH;

  PrefixTagger prefixTagger = new PrefixTagger(new MaxentTagger(modelFile));
  prefixTagger.tagFile(inputFile);
}
 
Example #14
Source File: PrefixTagger.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Creates a new PrefixTagger whose window sizes are taken from the
 * {@code leftWindow()} and {@code rightWindow()} accessors; the offset is set
 * to the left window size.
 *
 * @param maxentTagger
 *          general information on the tagger (this parameter will soon
 *          change)
 */
public PrefixTagger(MaxentTagger maxentTagger) {
  super(maxentTagger);

  // Window sizes mirror those reported for the underlying maxentTagger.
  leftWindow = leftWindow();
  rightWindow = rightWindow();
  offset = leftWindow();
}
 
Example #15
Source File: ParserDemo.java    From NLIDB with Apache License 2.0 5 votes vote down vote up
/**
 * Demo entry point: POS-tags a fixed example question, parses it with the
 * dependency parser and logs the resulting grammatical structure.
 * <p>
 * Supported options, each followed by a value:
 * {@code -tagger <path>} and {@code -com.dukenlidb.nlidb.model <path>}.
 *
 * @param args command-line options
 * @throws RuntimeException if an option is unknown or is missing its value
 */
public static void main(String[] args) {
	String modelPath = DependencyParser.DEFAULT_MODEL;
	String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

	// Options always come as (flag, value) pairs.
	for (int argIndex = 0; argIndex < args.length; argIndex += 2) {
		String option = args[argIndex];
		boolean isTagger = "-tagger".equals(option);
		boolean isModel = "-com.dukenlidb.nlidb.model".equals(option);
		if (!isTagger && !isModel) {
			throw new RuntimeException("Unknown argument " + option);
		}
		// A trailing flag without a value used to fail with ArrayIndexOutOfBoundsException.
		if (argIndex + 1 >= args.length) {
			throw new RuntimeException("Missing value for argument " + option);
		}
		String value = args[argIndex + 1];
		if (isTagger) {
			taggerPath = value;
		} else {
			modelPath = value;
		}
	}

	String text = "Return authors who have more papers than Bob in VLDB after 2000";

	MaxentTagger tagger = new MaxentTagger(taggerPath);
	DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

	DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
	for (List<HasWord> sentence : tokenizer) {
		List<TaggedWord> tagged = tagger.tagSentence(sentence);
		GrammaticalStructure gs = parser.predict(tagged);

		// Print typed dependencies
		log.info(gs);
	}
	
}
 
Example #16
Source File: StanfordPOSTagger.java    From jatecs with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Tokenizes {@code input} into sentences and POS-tags each of them.
 *
 * @param input text to tag
 * @return one list of tagged words per tokenized sentence, in order
 */
public Vector<ArrayList<TaggedWord>> tag(String input) {
    BufferedReader reader = new BufferedReader(new StringReader(input));
    List<List<HasWord>> tokenized = MaxentTagger.tokenizeText(reader);

    Vector<ArrayList<TaggedWord>> taggedSentences = new Vector<ArrayList<TaggedWord>>();
    for (List<? extends HasWord> words : tokenized) {
        ArrayList<TaggedWord> taggedWords = tagger.tagSentence(words);
        taggedSentences.add(taggedWords);
    }
    return taggedSentences;
}
 
Example #17
Source File: TypeClassifier.java    From winter with Apache License 2.0 5 votes vote down vote up
public TypeClassifier() {
	pipeline = new AnnotationPipeline();
	classifier = new Classifier();
	featureSet = new FeatureSet(new MaxentTagger(
			"de.uni_mannheim.informatik.dws.winter.webtables.detectors.tabletypeclassifier\\english-left3words-distsim.tagger"));
	initialize();
}
 
Example #18
Source File: POSTagger.java    From EasySRL with Apache License 2.0 4 votes vote down vote up
private StanfordPOSTagger(final File modelFile) {
	this.tagger = new MaxentTagger(modelFile.toString());
}
 
Example #19
Source File: NLParser.java    From NLIDB with Apache License 2.0 4 votes vote down vote up
/**
 * Loads the English left3words POS tagger and the default dependency parser
 * model.
 */
public NLParser() {
	final String parserModel = DependencyParser.DEFAULT_MODEL;
	final String taggerModel =
			"edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

	this.tagger = new MaxentTagger(taggerModel);
	this.parser = DependencyParser.loadFromModelFile(parserModel);
}
 
Example #20
Source File: CoreNlpPosModels.java    From datashare with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Loads the POS model for {@code language}.
 *
 * <p>The language's jar file (resolved under the models base path) is first added to the
 * context class loader; the tagger is then created from the in-jar model path.
 *
 * @param language language whose POS model should be loaded
 * @return an annotator wrapping the loaded {@link MaxentTagger}
 */
@Override
protected CoreNlpAnnotator<MaxentTagger> loadModelFile(Language language) {
    final Path jarPath = getModelsBasePath(language).resolve(getJarFileName(language));
    super.addResourceToContextClassLoader(jarPath);

    final MaxentTagger posTagger = new MaxentTagger(getInJarModelPath(language));
    return new CoreNlpAnnotator<>(posTagger);
}
 
Example #21
Source File: LexiconPreprocessor.java    From sentiment-analysis with Apache License 2.0 4 votes vote down vote up
/**The only extra method compared to the text-based approach.*/
private double[] getPOS(String sample, MaxentTagger tagger){
	String tagged = tagger.tagString(sample.trim().replaceAll(" +", " "));	
	StringTokenizer stk = new StringTokenizer(tagged);
	String output = "";
	double noun=0.0;
	double adj=0.0;
	double verb=0.0;
	double adv=0.0;
	double polarity = 0.0;
	boolean foundNegation = false;
	
	while (stk.hasMoreTokens()){
		String token = stk.nextToken();
		String tmp = token.substring(0, token.lastIndexOf("_")).toLowerCase();
		int idx = token.lastIndexOf("_");
		String pos = token.substring(idx+1);
		if (tmp.equals("not"))
			foundNegation = true;
		else if (pos.equals("NN") || pos.equals("NNS") || pos.equals("NNP") || pos.equals("NNPS")){
			output = output+"n#"+tmp+" ";
			if (foundNegation==true){
				foundNegation = false;
				noun = noun - swn.extract(tmp, "n");
			}else
				noun = noun + swn.extract(tmp, "n");
		}else if (pos.equals("RB") || pos.equals("RBR") || pos.equals("RBS") || pos.equals("RP")){
			output = output+"r#"+tmp+" ";
			if (foundNegation==true){
				foundNegation = false;
				adv = adv-swn.extract(tmp, "r");
			}else
				adv = adv+swn.extract(tmp, "r");
		}else if (pos.equals("JJ") || pos.equals("JJR") || pos.equals("JJS")){
			output = output+"a#"+tmp+" ";
			if (foundNegation==true){
				foundNegation = false;
				adj = adj-swn.extract(tmp, "a");
			}else
				adj = adj+swn.extract(tmp, "a");
		}else if (pos.equals("VB") || pos.equals("VBD") || pos.equals("VBG") || pos.equals("VBN") || pos.equals("VBP") || pos.equals("VBZ")){
			output = output+"v#"+tmp+" ";
			if (foundNegation==true){
				foundNegation = false;
				verb = verb-swn.extract(tmp, "v");
			}else
				verb = verb+swn.extract(tmp, "v");
		}
		// The polarity value
		if (tmp.equals("not"))
			foundNegation = true;
		else if (posWords.contains(tmp)){
			if (foundNegation==true){
				polarity = polarity-1.0;
				foundNegation = false;
			}else{
				polarity = polarity+1.0;
			}
		}else if (negWords.contains(tmp)){
			if (foundNegation==true){
				polarity = polarity+1.0;
				foundNegation=false;
			}else{
				polarity = polarity - 1.0;
			}
		}
	}
	double[] ret = new double[6];
	ret[0] = verb;
	ret[1] = noun;
	ret[2] = adj;
	ret[3] = adv;
	ret[4] = adv+verb+noun+adj;
	ret[5] = polarity;
	return ret;
}
 
Example #22
Source File: FeatureSet.java    From winter with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a feature set that uses the given POS tagger.
 *
 * @param maxentTagger tagger stored for later use by this feature set
 */
public FeatureSet(MaxentTagger maxentTagger) {
	// The superclass no-arg constructor is invoked implicitly.
	this.maxentTagger = maxentTagger;
}
 
Example #23
Source File: POSTagger.java    From JHazm with MIT License 4 votes vote down vote up
public POSTagger(String pathToModel) throws IOException {
    this.tagger = new MaxentTagger(pathToModel);
}
 
Example #24
Source File: TaggerDemo.java    From blog-codes with Apache License 2.0 4 votes vote down vote up
/**
 * Demo entry point: loads the default tagger model from the classpath, tags
 * a fixed sentence and prints the tagged words.
 *
 * @param args unused
 * @throws IllegalStateException if the model resource is not on the classpath
 * @throws Exception if loading the model or closing the stream fails
 */
public static void main(String[] args) throws Exception { 
	String resource = "/"+MaxentTagger.DEFAULT_JAR_PATH;

	// try-with-resources releases the classpath stream even if loading or tagging fails.
	try (InputStream input = TaggerDemo.class.getResourceAsStream(resource)) {
		if (input == null) {
			// getResourceAsStream signals a missing resource with null rather than an exception.
			throw new IllegalStateException("Tagger model not found on classpath: " + resource);
		}

		MaxentTagger tagger = new MaxentTagger(input);

		List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader("Karma of humans is AI"));

		for (List<HasWord> sentence : sentences) {
			List<TaggedWord> tSentence = tagger.tagSentence(sentence);
			System.out.println(SentenceUtils.listToString(tSentence, false));
		}
	}
}
 
Example #25
Source File: PrefixTagger.java    From phrasal with GNU General Public License v3.0 3 votes vote down vote up
/**
 * Creates a new PrefixTagger. Since PrefixTagger can't determine how many
 * words of context are needed by the tagging model, <i>leftWindow</i> must be
 * manually specified.
 *
 * @param maxentTagger
 *          general information on the tagger (this parameter will soon
 *          change)
 * @param leftWindow
 *          How many words to the left determine the current tag.
 */
public PrefixTagger(MaxentTagger maxentTagger, int leftWindow, int rightWindow) {
  super(maxentTagger);
  if (leftWindow < 0 || rightWindow < 0)
    throw new UnsupportedOperationException();
  this.leftWindow = leftWindow;
  this.rightWindow = rightWindow;
  this.offset = -rightWindow;
}
 
Example #26
Source File: POSTagger.java    From sarcasmbot with GNU General Public License v3.0 3 votes vote down vote up
public POSTagger(String modelFile) {
	

	this.tagger = new MaxentTagger(modelFile);
	this.ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
				"untokenizable=noneKeep");

}