edu.stanford.nlp.ling.Sentence Java Examples

The following examples show how to use edu.stanford.nlp.ling.Sentence. You can vote up the examples you find useful or vote down those you don't, and follow the links above each example to view the original project or source file. Related API usage is listed in the sidebar.
Example #1
Source File: StanfordParser.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Parses a sentence and reports the parser's PCFG score, which can be
 * used as a confidence measure for the parse.
 *
 * @param sentence
 *            the sentence to parse
 * @return the PCFG score produced by the parser
 * @throws RuntimeException if the parser has not been initialized
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence)
{
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce PCFG score
    log.debug("Parsing sentence");
    double pcfgScore;
    // the parser is shared state, so serialize access to it
    synchronized (parser)
    {
        Tokenizer sentenceTokenizer =
            tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> tokenList = sentenceTokenizer.tokenize();
        log.debug("Tokenization: " + tokenList);
        parser.parse(new Sentence(tokenList));
        pcfgScore = parser.getPCFGScore();
    }

    return pcfgScore;
}
 
Example #2
Source File: StanfordPosTagger.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Splits the sentence into individual tokens.
 *
 * @param sentence Input sentence
 * @return Array of tokens, in input order
 */
public static String[] tokenize(String sentence) {
	// tokenizeText returns a raw List of Sentence objects, one per
	// detected sentence in the input text
	List t = MaxentTagger.tokenizeText(new StringReader(sentence));
	
	List<String> tokens = new ArrayList<String>();
	
	for (int j = 0; j < t.size(); j++) {
		Sentence s1 = (Sentence) t.get(j);
		
		// collect the word text of every token in this sentence
		for (int i = 0; i < s1.length(); i++) {
			tokens.add(s1.getHasWord(i).word());
		}
	}
	
	// toArray(T[]) on a List<String> already returns String[];
	// the previous (String[]) cast was redundant
	return tokens.toArray(new String[tokens.size()]);
}
 
Example #3
Source File: StanfordPosTagger.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Tags the tokens with part of speech
 *
 * @param tokens Array of token strings
 * @return Part of speech tags, one per token; empty string when the
 *         tagger output carries no tag
 */
public static String[] tagPos(String[] tokens) {
	Sentence tagged = MaxentTagger.tagSentence(createSentence(tokens));
	
	String[] pos = new String[tagged.size()];
	for (int i = 0; i < tagged.size(); i++) {
		// the tagger renders each token as "word/TAG"; keep the text
		// after the last '/' (empty when there is no tag portion)
		String wordWithTag = ((HasWord) tagged.get(i)).toString();
		int slash = wordWithTag.lastIndexOf('/');
		pos[i] = slash >= 0 ? wordWithTag.substring(slash + 1) : "";
	}
	
	return pos;
}
 
Example #4
Source File: CoverageChecker.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Counts all n-grams of length 1..order observed in the given line.
 *
 * @param line whitespace-delimited input text
 * @param ngramCounts counter incremented once per observed n-gram
 * @param limitSet if non-null, only n-grams contained in this set are counted
 * @param order maximum n-gram length
 */
static public void countNgrams(String line, Counter<String> ngramCounts, Set<String> limitSet, int order) {
   // Split on runs of whitespace. The previous split("\\s") produced empty
   // tokens for consecutive whitespace (and a leading empty token for
   // leading whitespace), which yielded spurious n-grams.
   String[] toks = line.trim().split("\\s+");
   for (int i = 0; i < toks.length; i++) {
      for (int j = 0; j < order && j+i < toks.length ; j++) {
         String[] ngramArr = Arrays.copyOfRange(toks, i, i+j+1);
         String ngram = Sentence.listToString(Arrays.asList(ngramArr));
         if (limitSet == null || limitSet.contains(ngram)) {
            ngramCounts.incrementCount(ngram);
         }
      }
   }
}
 
Example #5
Source File: StanfordParser.java    From OpenEphyra with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses a sentence and returns a string representation of the parse tree.
 *
 * @param sentence
 *            a sentence
 * @return Penn-style string rendering of the best parse, with bracketed
 *         annotations stripped
 * @throws RuntimeException if the parser has not been initialized
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence)
{
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce stanford Tree
    log.debug("Parsing sentence");
    Tree bestParse = null;
    // the parser is shared state, so serialize access to it
    synchronized (parser)
    {
        Tokenizer sentenceTokenizer =
            tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> tokenList = sentenceTokenizer.tokenize();
        log.debug("Tokenization: " + tokenList);
        parser.parse(new Sentence(tokenList));
        bestParse = parser.getBestParse();
    }

    // NOTE: character-extent labeling (updateTreeLabels / createMapping /
    // mapOffsets) was disabled in the original source and remains off here.

    // strip bracketed annotations (e.g. " [x]") from the rendered tree
    return bestParse.toString().replaceAll(" \\[[\\S]+\\]", "");
}
 
Example #6
Source File: StanfordPosTagger.java    From OpenEphyra with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Combines the tokens into a <code>Sentence</code>
 *
 * @param tokens the token strings to wrap
 * @return <code>Sentence</code> made of the tokens
 */
@SuppressWarnings("unchecked")
private static Sentence createSentence(String[] tokens) {
	// wrap each raw token string in a Word, which implements HasWord
	ArrayList<HasWord> words = new ArrayList<HasWord>();
	for (String token : tokens) {
		words.add(new Word(token));
	}
	
	Sentence result = new Sentence();
	result.setWords(words);
	
	return result;
}
 
Example #7
Source File: POSTagger.java    From JHazm with MIT License 5 votes vote down vote up
/**
 * Tags a sentence given as a list of word strings.
 *
 * @param sentence the words of one sentence
 * @return one TaggedWord per input word, carrying the original surface form
 */
public List<TaggedWord> batchTag(List<String> sentence) {
    // the tagger treats spaces as token separators, so protect multi-word
    // entries by replacing internal spaces with underscores
    String[] escaped = new String[sentence.size()];
    for (int i = 0; i < sentence.size(); i++)
       escaped[i] = sentence.get(i).replace(" ", "_");
    List wordList = Sentence.toWordList(escaped);
    List rawTagged = this.tagger.tagSentence(wordList);

    List<TaggedWord> result = new ArrayList<>();
    for (int i = 0; i < rawTagged.size(); i++) {
        TaggedWord tagged = (TaggedWord) rawTagged.get(i);
        // restore the original (unescaped) word text on each tagged word
        tagged.setWord(sentence.get(i));
        result.add(tagged);
    }
    return result;
}
 
Example #8
Source File: Chapter7.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 4 votes vote down vote up
private static void usingStanfordLexicalizedParser() {
        // NOTE: model path is machine-specific; adjust for your environment
        String parserModel = "C:/Current Books in Progress/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
        LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parserModel);

        // Option 1: parse a list of already-tokenized words
        System.out.println("---First option");
        String[] sentenceTokens = {"The", "cow", "jumped", "over", "the", "moon", "."};
        List<CoreLabel> labelList = Sentence.toCoreLabelList(sentenceTokens);

        Tree parseTree = lexicalizedParser.apply(labelList);
        parseTree.pennPrint();
        System.out.println();

        // Option 2: tokenize raw text with an explicit PTB tokenizer first
        System.out.println("---Second option");
        String sentence = "The cow jumped over the moon.";
        TokenizerFactory<CoreLabel> tokenizerFactory =
                PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        Tokenizer<CoreLabel> tokenizer =
                tokenizerFactory.getTokenizer(new StringReader(sentence));
        List<CoreLabel> tokenized = tokenizer.tokenize();
        parseTree = lexicalizedParser.apply(tokenized);

        // derive typed dependencies from the constituency parse
        TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack(); // PennTreebankLanguagePack for English
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree);
        List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
        System.out.println(tdl);
        for (TypedDependency dependency : tdl) {
            System.out.println("Governor Word: [" + dependency.gov() + "] Relation: [" + dependency.reln().getLongName()
                    + "] Dependent Word: [" + dependency.dep() + "]");
        }
        System.out.println();

        // A TreePrint object can also render trees and dependencies, e.g.:
        //   new TreePrint("penn,typedDependenciesCollapsed").printTree(parseTree);
        // Supported formats are listed in TreePrint.outputTreeFormats.
    }
 
Example #9
Source File: CRFPostprocessor.java    From phrasal with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Evaluate the postprocessor given an input file specified in the flags.
 * Writes per-datum output to "apply.out" and aggregate/per-label accuracy
 * to pwOut.
 *
 * @param preProcessor preprocessor used to build the document reader
 * @param pwOut sink for the evaluation report
 */
protected void evaluate(Preprocessor preProcessor, PrintWriter pwOut) {
  System.err.println("Starting evaluation...");
  DocumentReaderAndWriter<CoreLabel> docReader = new ProcessorTools.PostprocessorDocumentReaderAndWriter(preProcessor);
  ObjectBank<List<CoreLabel>> lines =
    classifier.makeObjectBankFromFile(flags.testFile, docReader);

  Counter<String> labelTotal = new ClassicCounter<String>();
  Counter<String> labelCorrect = new ClassicCounter<String>();
  int total = 0;
  int correct = 0;
  // try-with-resources guarantees the writer is closed even if
  // classification throws; the previous explicit close() leaked on error
  try (PrintWriter pw = new PrintWriter(IOTools.getWriterFromFile("apply.out"))) {
    for (List<CoreLabel> line : lines) {
      line = classifier.classify(line);
      pw.println(Sentence.listToString(ProcessorTools.toPostProcessedSequence(line)));
      total += line.size();
      for (CoreLabel label : line) {
        String hypothesis = label.get(CoreAnnotations.AnswerAnnotation.class);
        String reference = label.get(CoreAnnotations.GoldAnswerAnnotation.class);
        labelTotal.incrementCount(reference);
        if (hypothesis.equals(reference)) {
          correct++;
          labelCorrect.incrementCount(reference);
        }
      }
    }
  }

  // guard against an empty test file (the previous 0/0 produced NaN)
  double accuracy = total == 0 ? 0.0 : ((double) correct) / ((double) total);
  accuracy *= 100.0;

  pwOut.println("EVALUATION RESULTS");
  pwOut.printf("#datums:\t%d%n", total);
  pwOut.printf("#correct:\t%d%n", correct);
  pwOut.printf("accuracy:\t%.2f%n", accuracy);
  pwOut.println("==================");

  // Output the per label accuracies
  pwOut.println("PER LABEL ACCURACIES");
  for (String refLabel : labelTotal.keySet()) {
    double nTotal = labelTotal.getCount(refLabel);
    double nCorrect = labelCorrect.getCount(refLabel);
    double acc = (nCorrect / nTotal) * 100.0;
    pwOut.printf(" %s\t%.2f%n", refLabel, acc);
  }
}
 
Example #10
Source File: CRFPostprocessor.java    From phrasal with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Decode raw text input, writing one postprocessed line per input line.
 *
 * @param postProcessor the postprocessor whose classifier labels each line
 * @param reader source of raw input lines (not closed by this method)
 * @param outstream sink for processed lines
 * @param nThreads number of worker threads
 * @return throughput in characters per second
 */
protected static double decode(final CRFPostprocessor postProcessor,
    BufferedReader reader, PrintWriter outstream, int nThreads) {
  long numChars = 0;
  int lineNumber = 0;
  long startTime = System.nanoTime();
  try {
    // Setup the threadpool
    MulticoreWrapper<String,String> wrapper = 
        new MulticoreWrapper<String,String>(nThreads, 
            new ThreadsafeProcessor<String,String>() {
              @Override
              public String process(String input) {
                // label each character, then reassemble into tokens
                List<CoreLabel> labeledSeq = ProcessorTools.toCharacterSequence(input);
                labeledSeq = postProcessor.classifier.classify(labeledSeq);
                List<CoreLabel> tokenSeq = ProcessorTools.toPostProcessedSequence(labeledSeq);
                return Sentence.listToString(tokenSeq);
              }
              @Override
              public ThreadsafeProcessor<String, String> newInstance() {
                // stateless, so the same instance is safe to share
                return this;
              }
    });
    
    // Read the input, draining completed results as they become available
    for (String line; (line = reader.readLine()) != null; ++lineNumber) {
      numChars += line.length();
      wrapper.put(line.trim());
      while(wrapper.peek()) outstream.println(wrapper.poll());
    }
    
    wrapper.join();
    while(wrapper.peek()) outstream.println(wrapper.poll());
    
  } catch (IOException e) {
    // Fixed format string: the original "%d%s" had no argument for the
    // trailing %s and threw MissingFormatArgumentException here
    System.err.printf("%s: Error at input line %d%n", CRFPostprocessor.class.getName(), lineNumber);
    e.printStackTrace();
  }
  // Calculate throughput
  double elapsedTime = ((double) System.nanoTime() - startTime) / 1e9;
  double charsPerSecond = (double) numChars / elapsedTime;
  return charsPerSecond;
}
 
Example #11
Source File: RuleQuery.java    From phrasal with GNU General Public License v3.0 4 votes vote down vote up
/** Renders this query as "target (alignment) score". */
@Override
public String toString() {
  String target = Sentence.listToString(tgt);
  String alignment = Sentence.listToString(align);
  return String.format("%s (%s) %.5f", target, alignment, score);
}
 
Example #12
Source File: TranslationQuery.java    From phrasal with GNU General Public License v3.0 4 votes vote down vote up
/** Renders this query as "target (alignment) score". */
@Override
public String toString() {
  String target = Sentence.listToString(tgt);
  String alignment = Sentence.listToString(align);
  return String.format("%s (%s) %.5f", target, alignment, score);
}
 
Example #13
Source File: MakePTMPhrasalInput.java    From phrasal with GNU General Public License v3.0 4 votes vote down vote up
/**
   * Entry point. Expects three positional arguments: the source language,
   * the target language, and an SQL dump file to convert to Phrasal input.
   * Writes a tab-separated record per input row to stdout.
   *
   * @param args srcLang tgtLang sqlFile
   * @throws IOException if reading the input fails
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 3) {
      System.err.print(usage());
      System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, argDefs());
    String[] positionalArgs = options.getProperty("").split("\\s+");

    String srcLang = positionalArgs[0];
    String tgtLang = positionalArgs[1];
    String sqlFile = positionalArgs[2];
    
    // language-specific tokenizers/preprocessors for each side
    Preprocessor srcPreproc = ProcessorFactory.getPreprocessor(srcLang);
    Preprocessor tgtPreproc = ProcessorFactory.getPreprocessor(tgtLang);
    
    // header row for the tab-separated output
    System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s%n", "doc_id", "seg_id", "username", "mt_tok", "user_tok", "s2mt_tok", "src_tok");
//    CSVReader reader = new CSVReader(new FileReader(sqlFile));
    // Skip header
    boolean seenHeader = false;
    // NOTE(review): this method is broken as published. The CSVReader lines
    // above/below were commented out and replaced with a placeholder loop:
    // `fields` stays null, so the second iteration throws a
    // NullPointerException at fields[3], and the loop has no exit condition.
    // Restore the CSVReader read loop (and reader.close() at the end) to
    // make this runnable.
//    for (String[] fields; (fields = reader.readNext()) != null;) {
  for (String[] fields = null;;) {
      if ( ! seenHeader) {
        seenHeader = true;
        continue;
      }
//      String segId = String.format("%s:%s", fields[0], fields[1]).replace(".src.json", ".tgt");
      // fields layout (presumably): 0=doc_id, 1=seg_id, 2=username,
      // 3=target text, 5=alignment string, 6=source text — TODO confirm
      // against the SQL dump schema
      String tgtLine = fields[3].trim();
      String alignStr = extend(fields[5]).trim();
      String srcLine = fields[6].trim();
      // src->tgt alignment from the dump, plus alignments from each raw
      // side to its preprocessed (tokenized) form
      SymmetricalWordAlignment s2t = new SymmetricalWordAlignment(srcLine, tgtLine, alignStr);
      SymmetricalWordAlignment s2sPrime = srcPreproc.processAndAlign(srcLine);
      SymmetricalWordAlignment t2tPrime = tgtPreproc.processAndAlign(tgtLine);
      String userTextTok = tgtPreproc.process(fields[3]).toString();
      
      // Want sprime --> tprime
      // Compose s' -> s -> t -> t' by chaining the three alignments
      List<String> alignmentList = new LinkedList<>();
      for (int i = 0, size = s2sPrime.eSize(); i < size; ++i) {
        Set<Integer> alignments = s2sPrime.e2f(i);
        for (int j : alignments) {
          Set<Integer> alignments2 = s2t.f2e(j);
          for (int k : alignments2) {
            Set<Integer> alignments3 = t2tPrime.f2e(k);
            for (int q : alignments3) {
              alignmentList.add(String.format("%d-%d",i,q));
            }
          }
        }
      }
      System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s%n", fields[0], fields[1], fields[2], t2tPrime.e().toString(), userTextTok, Sentence.listToString(alignmentList), s2sPrime.e().toString());
    }
//    reader.close();
  }
 
Example #14
Source File: StanfordPOSTagger.java    From ADW with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Tags a whitespace-delimited sentence with parts of speech.
 *
 * @param sentence the sentence text; tokens are separated by whitespace
 * @return the tagged words, delegating to {@code tag(List)}
 */
public List<TaggedWord> tag(String sentence)
{
	// split on whitespace runs and wrap the tokens as HasWord objects
	List<HasWord> words = Sentence.toWordList(sentence.split("\\s+"));
	return tag(words);
}
 
Example #15
Source File: AbstractWordClassMap.java    From phrasal with GNU General Public License v3.0 2 votes vote down vote up
/**
 * Map the input word to a word class.
 *
 * @param word the word to look up
 * @return its class (the sole entry when there is one mapping, otherwise
 *         the delimiter-joined list of classes)
 */
public IString get(IString word) {
  List<IString> classes = getList(word);
  if (numMappings == 1) {
    return classes.get(0);
  }
  return new IString(Sentence.listToString(classes, true, DELIMITER));
}