edu.stanford.nlp.util.Pair Java Examples

The following examples show how to use edu.stanford.nlp.util.Pair. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: IntelKBPSemgrexExtractor.java    From InformationExtraction with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Classifies the relation expressed by {@code input} using the Semgrex rules.
 *
 * <p>Relation types are tried in {@code RelationType.values()} order. A type is considered only
 * when rules exist for it, its entity type equals the subject type of the input, and its valid
 * named-entity labels contain the object type of the input. The first type for which
 * {@code matches} succeeds on either the enhanced++ or the alternative dependencies wins.
 *
 * @param input the candidate to classify
 * @return the canonical name of the first matching relation, or {@code NO_RELATION} when
 *         nothing matches; the second component is always {@code 1.0}
 */
@Override
public Pair<String, Double> classify(KBPInput input) {
    for (RelationType relation : RelationType.values()) {
        boolean applicable = rules.containsKey(relation)
                && relation.entityType == input.subjectType
                && relation.validNamedEntityLabels.contains(input.objectType);
        if (!applicable) {
            continue;
        }

        Collection<SemgrexPattern> patterns = rules.get(relation);
        CoreMap sentence = input.sentence.asCoreMap(Sentence::nerTags, Sentence::dependencyGraph);

        // The alternative dependencies are only consulted when the enhanced++ ones do not match.
        if (matches(sentence, patterns, input,
                sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class))
                || matches(sentence, patterns, input,
                sentence.get(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class))) {
            return Pair.makePair(relation.canonicalName, 1.0);
        }
    }

    return Pair.makePair(NO_RELATION, 1.0);
}
 
Example #2
Source File: AbstractDependencyLanguageModelFeaturizer.java    From phrasal with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Reads dependency structures from a CoNLL file and rebuilds the per-sentence caches.
 *
 * <p>The forward, reverse and reachable-node caches are replaced with fresh maps. For every
 * structure returned by {@code DependencyUtils.getDependenciesFromCoNLLFileReader} the reverse
 * dependencies and a governor-to-children map are stored under a running index that starts at 0.
 * The reachable-node cache is only reset here, not populated.
 *
 * @param filename path of the CoNLL file to read
 * @throws IOException if the file cannot be opened or read
 */
public void loadDependencies(String filename) throws IOException {
  // try-with-resources closes the reader even when parsing fails part-way through the file.
  try (LineNumberReader reader = IOTools.getReaderFromFile(filename)) {
    forwardDependenciesCache = new HashMap<Integer, Map<Integer, HashSet<Integer>>>();
    reverseDependenciesCache = new HashMap<Integer, Map<Integer, Integer>>();
    reachableNodesCache = new HashMap<Integer, Map<Integer, Set<Integer>>>();

    HashMap<Integer, Pair<IndexedWord, List<Integer>>> deps;
    int sentenceIndex = 0;
    while ((deps = DependencyUtils.getDependenciesFromCoNLLFileReader(reader, true, true)) != null) {
      reverseDependenciesCache.put(sentenceIndex, DependencyUtils.getReverseDependencies(deps));

      Map<Integer, HashSet<Integer>> forwardDeps = new HashMap<Integer, HashSet<Integer>>();
      for (Map.Entry<Integer, Pair<IndexedWord, List<Integer>>> entry : deps.entrySet()) {
        // Copy the child list into a set in one step instead of adding element by element.
        forwardDeps.put(entry.getKey(), new HashSet<Integer>(entry.getValue().second));
      }
      forwardDependenciesCache.put(sentenceIndex, forwardDeps);
      sentenceIndex++;
    }
  }
}
 
Example #3
Source File: TranslationLayout.java    From phrasal with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Adds a label showing {@code trans} as a row spanning all columns and registers it under
 * {@code name}.
 *
 * <p>A previously freed row index is reused when one is available; otherwise the count of full
 * translation rows is incremented and the row is placed at {@code numRows} plus that count. The
 * label is only attached when {@code panel} is non-null, but it is always recorded in
 * {@code fullTranslations}.
 *
 * @param name key under which the row is stored
 * @param trans text displayed by the label
 * @param bgColor background colour of the label (the text is white)
 * @return always {@code true}
 */
public boolean addTranslationRow(String name, String trans, Color bgColor) {
  JLabel rowLabel = new JLabel(trans);
  rowLabel.setOpaque(true);
  rowLabel.setBackground(bgColor);
  rowLabel.setForeground(Color.WHITE);

  GridBagConstraints constraints = new GridBagConstraints();
  constraints.fill = GridBagConstraints.HORIZONTAL;
  constraints.gridx = 0;
  constraints.ipady = 20;
  constraints.gridwidth = numColumns;

  if (!unusedRows.isEmpty()) {
    // Recycle the most recently freed row.
    constraints.gridy = unusedRows.removeFirst();
  } else {
    numFullTranslationRows += 1;
    constraints.gridy = numRows + numFullTranslationRows;
  }

  if (panel != null) {
    panel.add(rowLabel, constraints);
  }
  fullTranslations.put(name, new Pair<Integer, JLabel>(constraints.gridy, rowLabel));

  return true;
}
 
Example #4
Source File: Phrase.java    From uncc2014watsonsim with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Builds a lookup from the head index of every non-representative coreference mention to the
 * pair (that mention, representative mention of its chain).
 *
 * @param p phrase whose memoized {@code Phrase.coreNLP} annotation supplies the chains
 * @return a new mutable map keyed by {@code headIndex}; empty when the annotation carries no
 *         coreference chains. When several mentions share a head index the last one wins.
 */
private static Map<Integer, Pair<CorefMention, CorefMention>> _unpronoun(Phrase p) {
	Stream<Pair<CorefMention, CorefMention>> s =
			Stream.of(p.memo(Phrase.coreNLP).get(CorefChainAnnotation.class))
		.filter(Objects::nonNull)  // A missing chain map yields an empty stream
		.flatMap(chains -> chains.entrySet().stream()) // Disassemble the map
	    .flatMap(entry -> {
			// Link each mention to its chain's representative mention
			CorefMention main = entry.getValue().getRepresentativeMention();
			return entry.getValue().getMentionsInTextualOrder().stream()
				.filter(mention -> mention != main)
				.map(mention -> makePair(mention, main));
		});
	// Type inference chokes on a single expression, so the stream is named first.
	// The combiner merges partial maps; a no-op combiner would silently drop results if this
	// stream were ever made parallel.
	return s.collect(HashMap::new,
			(m, pair) -> m.put(pair.first.headIndex, pair),
			(l, r) -> l.putAll(r));
}
 
Example #5
Source File: KBPSemgrexExtractor.java    From InformationExtraction with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Classifies the relation expressed by {@code input} using the Semgrex rules.
 *
 * <p>Relation types are tried in {@code RelationType.values()} order. A type is considered only
 * when rules exist for it, its entity type equals the subject type of the input, and its valid
 * named-entity labels contain the object type of the input. The first type for which
 * {@code matches} succeeds on either the enhanced++ or the alternative dependencies wins.
 *
 * @param input the candidate to classify
 * @return the canonical name of the first matching relation, or {@code NO_RELATION} when
 *         nothing matches; the second component is always {@code 1.0}
 */
@Override
public Pair<String, Double> classify(KBPInput input) {
  for (RelationType relation : RelationType.values()) {
    boolean applicable = rules.containsKey(relation)
        && relation.entityType == input.subjectType
        && relation.validNamedEntityLabels.contains(input.objectType);
    if (!applicable) {
      continue;
    }

    Collection<SemgrexPattern> patterns = rules.get(relation);
    CoreMap sentence = input.sentence.asCoreMap(Sentence::nerTags, Sentence::dependencyGraph);

    // The alternative dependencies are only consulted when the enhanced++ ones do not match.
    if (matches(sentence, patterns, input,
        sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class))
        || matches(sentence, patterns, input,
        sentence.get(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class))) {
      return Pair.makePair(relation.canonicalName, 1.0);
    }
  }

  return Pair.makePair(NO_RELATION, 1.0);
}
 
Example #6
Source File: TextualSimilarity.java    From ADW with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Processes a raw sentence, extracts its terms and hands them to {@code fixTerms}.
 *
 * <p>The sentence is run through {@code SentenceProcessor}, re-joined with single spaces and
 * parsed into a {@code StanfordSentence}. If term extraction throws, the stack trace is printed
 * and {@code fixTerms} receives {@code null} as its term list. Stop-word handling is controlled
 * by {@code discardStopwords}, which is not a parameter of this method (presumably a field of
 * the enclosing class -- TODO confirm).
 *
 * @param sentence
 * 			input sentence, space delimited
 * @return
 * 		the result of {@code fixTerms}; per the original documentation, a pair of
 * 		(list of word-pos, remaining not-handled terms)
 */
public Pair<List<String>, List<String>> getStanfordSentence(String sentence)
{
	List<WordLemmaTag> taggedWords = SentenceProcessor.getInstance().processSentence(sentence, false);
	StanfordSentence parsed = StanfordSentence.fromLine(Strings.join(taggedWords," "));

	List<String> extracted;
	try
	{
		extracted = parsed.getTerms(TAGS,
				Language.EN,
				null,
				MultiwordBelongingTo.WORDNET,
				CompoundingParameter.ALLOW_MULTIWORD_EXPRESSIONS,
				CompoundingParameter.APPEND_POS);
	}
	catch(Exception e)
	{
		e.printStackTrace();
		extracted = null;
	}

	// fixTerms discards OOVs and tries to map incorrect pos-tags to the correct ones
	return fixTerms(extracted, discardStopwords);
}
 
Example #7
Source File: Preprocess.java    From ADW with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Runs {@code caseFixer} over every pair and writes the results to {@code path}, one pair per
 * line with the two components separated by a tab. An existing file is overwritten.
 *
 * <p>Failures are reported by printing the stack trace; nothing is rethrown.
 *
 * @param pairs the pairs to fix and write
 * @param path destination file
 */
public static void fixAllCasings(List<Pair<String,String>> pairs, String path)
{
	// try-with-resources flushes and closes the writer even when a pair fails mid-way.
	try (BufferedWriter bw = new BufferedWriter(new FileWriter(path, false)))
	{
		for(Pair<String,String> aPair : pairs)
		{
			Pair<String,String> fixedPair = caseFixer(aPair);

			bw.write(fixedPair.first+"\t"+fixedPair.second+"\n");
		}
	}
	catch(Exception e)
	{
		e.printStackTrace();
	}
}
 
Example #8
Source File: IntelKBPEnsembleExtractor.java    From InformationExtraction with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Dispatches classification to the routine selected by {@code ensembleStrategy}.
 *
 * @param input the candidate to classify
 * @return the prediction produced by the strategy-specific classifier
 * @throws UnsupportedClassVersionError if the strategy has no case below.
 *         NOTE(review): this error type is meant for class-file version problems; an
 *         {@code UnsupportedOperationException} would be the conventional choice, but it is kept
 *         here so existing callers see the same throwable.
 */
@Override
public Pair<String, Double> classify(KBPInput input) {
    final Pair<String, Double> prediction;
    switch (ensembleStrategy) {
        case DEFAULT:
            prediction = classifyDefault(input);
            break;
        case HIGHEST_SCORE:
            prediction = classifyWithHighestScore(input);
            break;
        case VOTE:
            prediction = classifyWithVote(input);
            break;
        case WEIGHTED_VOTE:
            prediction = classifyWithWeightedVote(input);
            break;
        case HIGH_RECALL:
            prediction = classifyWithHighRecall(input);
            break;
        case HIGH_PRECISION:
            prediction = classifyWithHighPrecision(input);
            break;
        default:
            throw new UnsupportedClassVersionError(ensembleStrategy + " not supported");
    }
    return prediction;
}
 
Example #9
Source File: Node.java    From dependensee with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Computes the number of links on a shortest path from this node to {@code n}, following both
 * parent and child links with a breadth-first search.
 *
 * @param n the node to reach; compared by reference
 * @return the path length (0 when {@code n} is this node), or {@code Integer.MAX_VALUE} when
 *         {@code n} cannot be reached
 */
public int getPathLength(Node n) {
    Queue<Pair<Node, Integer>> q = new LinkedList<Pair<Node, Integer>>();
    Set<Node> marked = new HashSet<Node>();
    q.add(new Pair<Node, Integer>(this, 0));
    marked.add(this);
    while (!q.isEmpty()) {
        Pair<Node, Integer> v = q.remove();
        if (v.first == n) {
            return v.second;
        }
        Node parent = v.first.parent;
        // Set.add returns false for nodes that were already scheduled.
        if (parent != null && marked.add(parent)) {
            q.add(new Pair<Node, Integer>(parent, v.second + 1));
        }
        for (Node child : v.first.children) {
            // Skip children that were already scheduled; without this check every node reached
            // through its parent is expanded again and a cycle among children never terminates.
            if (marked.add(child)) {
                q.add(new Pair<Node, Integer>(child, v.second + 1));
            }
        }
    }
    return Integer.MAX_VALUE;
}
 
Example #10
Source File: TranslationLayout.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Frees the row registered under {@code name}: its row index is pushed onto the front of
 * {@code unusedRows} and, when a panel exists, its label is removed from the panel.
 *
 * <p>NOTE(review): the entry stays in {@code fullTranslations}, so a second call with the same
 * name would free the same row index again -- confirm whether that is intended.
 *
 * @param name key of the row to remove
 * @return {@code true} if a row was registered under {@code name}, {@code false} otherwise
 */
public boolean removeTranslationRow(String name) {
  Pair<Integer, JLabel> entry = fullTranslations.get(name);
  if (entry == null) {
    return false;
  }

  unusedRows.addFirst(entry.first());
  if (panel != null) {
    panel.remove(entry.second());
  }
  return true;
}
 
Example #11
Source File: KBPRelationExtractor.java    From InformationExtraction with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Classifies every example in parallel, accumulates the results into an {@code Accuracy} and
 * optionally prints one line per example ({@code prediction<TAB>confidence}) in the encounter
 * order of the stream.
 *
 * @param examples pairs of (input, gold label); consumed by this call
 * @param predictOut destination for the per-example prediction lines, if present
 * @return the accumulated accuracy
 */
default Accuracy computeAccuracy(Stream<Pair<KBPInput, String>> examples,
                                 Optional<PrintStream> predictOut) {
  forceTrack("Accuracy");
  Accuracy accuracy = new Accuracy();
  AtomicInteger testI = new AtomicInteger(0);
  DecimalFormat confidenceFormat = new DecimalFormat("0.0000");
  forceTrack("Featurizing");
  examples.parallel().map(example -> {
    Pair<String, Double> predicted = this.classify(example.first);
    String progress = null;
    synchronized (accuracy) {
      accuracy.predict(Collections.singleton(predicted.first), Collections.singleton(example.second));
      // Read the accumulator while still holding its lock, and report the count this thread
      // actually produced rather than re-reading a counter other threads keep advancing.
      int seen = testI.incrementAndGet();
      if (seen % 1000 == 0) {
        progress = "[" + seen + "]  " + accuracy.toOneLineString();
      }
    }
    if (progress != null) {
      log(KBPRelationExtractor.class, progress);
    }
    return predicted;
  })
    // DecimalFormat is not thread-safe, so formatting happens here: forEachOrdered handles one
    // element at a time, unlike the parallel map stage above.
    .forEachOrdered(predicted -> predictOut.ifPresent(out ->
        out.println(predicted.first + "\t" + confidenceFormat.format(predicted.second))));
  endTrack("Featurizing");
  log(accuracy.toString());
  endTrack("Accuracy");
  return accuracy;
}
 
Example #12
Source File: KBPTokensregexExtractor.java    From InformationExtraction with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Command-line entry point: fills the class options from {@code args}, builds an extractor from
 * {@code DIR}, reads the dataset in {@code TEST_FILE} and computes its accuracy.
 *
 * <p>When {@code PREDICTIONS} holds a value, predictions go to standard output if that value is
 * "stdout" (case-insensitive) and otherwise to a file of that name.
 *
 * @param args command-line options
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
  // Apply the standard Redwood configuration (the original comment says this disables SLF4J).
  RedwoodConfiguration.standard().apply();
  ArgumentParser.fillOptions(edu.stanford.nlp.ie.KBPTokensregexExtractor.class, args);
  edu.stanford.nlp.ie.KBPTokensregexExtractor tokensregex =
      new edu.stanford.nlp.ie.KBPTokensregexExtractor(DIR);
  List<Pair<KBPInput, String>> dataset = KBPRelationExtractor.readDataset(TEST_FILE);

  tokensregex.computeAccuracy(dataset.stream(), PREDICTIONS.map(target -> {
    if ("stdout".equalsIgnoreCase(target)) {
      return System.out;
    }
    try {
      return new PrintStream(new FileOutputStream(target));
    } catch (IOException e) {
      // Lambdas cannot throw checked exceptions, so the failure is wrapped.
      throw new RuntimeIOException(e);
    }
  }));

}
 
Example #13
Source File: MinimumBayesRisk.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Command-line entry point: rescores every n-best list of the input file through a
 * {@code MulticoreWrapper} around a {@code Processor} and dumps each rescored list as soon as
 * the wrapper reports one available.
 *
 * <p>Options read: {@code s} (scale, default {@code DEFAULT_SCALE}), {@code o} (orientation,
 * default "utility"; "risk" switches to risk), {@code m} (metric name, default
 * {@code DEFAULT_METRIC}). The n-best file name is the value stored under the empty key.
 * Without any argument the usage text is printed and the JVM exits with status -1.
 *
 * @param args command-line arguments
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
  if (args.length < 1) {
    System.err.print(usage());
    System.exit(-1);
  }

  Properties opts = StringUtils.argsToProperties(args, argDefs());
  final double scale = PropertiesUtils.getDouble(opts, "s", DEFAULT_SCALE);
  final boolean risk = "risk".equals(opts.getProperty("o", "utility"));
  final String metricName = opts.getProperty("m", DEFAULT_METRIC);
  final String nbestFile = opts.getProperty("");

  BasicNBestList allLists = new BasicNBestList(nbestFile);
  MulticoreWrapper<List<BasicNBestEntry>, List<Pair<Double, String>>> pool =
    new MulticoreWrapper<List<BasicNBestEntry>, List<Pair<Double, String>>>(0, new Processor(metricName, risk, scale), true);

  for (List<BasicNBestEntry> oneList : allLists) {
    pool.put(oneList);
    // Emit whatever is already finished before submitting more work.
    while (pool.peek()) {
      DumpRescored(pool.poll());
    }
  }

  // Wait for the outstanding work, then emit the remaining results.
  pool.join();
  while (pool.peek()) {
    DumpRescored(pool.poll());
  }
}
 
Example #14
Source File: PrefixTagger.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Determine best tag based on current word and its immediate predecessors.
 *
 * <p>When {@code CACHE_POS} is set, the result is looked up in and stored to {@code cache}
 * under an {@code IStringArrayWrapper} built from {@code s}.
 *
 * @param s
 *          <i>leftWindow</i> plus one words
 * @param o
 *          Offset with respect to last position.
 * @return Best tag and its score (the original documentation calls the score a probability --
 *         TODO confirm).
 */
public Pair<IString, Float> getBestTag(IString[] s, int o) {
  // Position to tag: the last index of s shifted by o.
  int loc = s.length - 1 + o;

  IStringArrayWrapper aw = null;
  Pair<IString, Float> tag;

  // NOTE(review): the cache key is built from s only, so calls with the same words but a
  // different offset o would share one entry -- confirm that this is intended.
  if (CACHE_POS) {
    aw = new IStringArrayWrapper(s);
    tag = cache.get(aw);
    if (tag != null)
      return tag;
  }

  // Presumably prepares per-sequence state such as len for s -- TODO confirm.
  init(s);

  // Start every position with the first of its possible values.
  int[] bestTags = new int[len];
  int[][] vals = new int[len][];
  for(int pos = 0 ; pos < len ; pos++) {
    vals[pos] = getPossibleValues(pos);
    bestTags[pos] = vals[pos][0];
  }

  this.initializeScorer();
  double[] scores = scoresOf(bestTags, loc);

  // Index of the highest score.
  int am = ArrayMath.argmax(scores);

  // TODO (left unexplained in the original)
  // am indexes into vals[loc]; this assumes scoresOf returns one score per possible value at
  // loc -- confirm.
  bestTags[loc] = vals[loc][am];
  cleanUpScorer();

  tag = new Pair<IString, Float>(new IString(maxentTagger.getTag(bestTags[loc])),
          (float) scores[am]);
  if (CACHE_POS)
    cache.put(aw, tag);
  return tag;
}
 
Example #15
Source File: PrefixTagger.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Tag text file using PrefixTagger.
 *
 * <p>Each line gets {@code Tagger.EOS_WORD} appended and is split on whitespace. Every token
 * except the last one is tagged from a window of at most {@code leftWindow} tokens to its left
 * and {@code rightWindow} tokens to its right, and printed to standard output as
 * {@code word/tag}, space-separated, one input line per output line.
 *
 * <p>NOTE(review): the window offset ({@code -rightWindow}) is used to pick the printed word but
 * is not passed to {@code getBestTag} -- confirm that the one-argument overload is intended.
 *
 * @param textFile
 *          File to tag
 */
public void tagFile(String textFile) {

  for (String rawLine : ObjectBank.getLineIterator(new File(textFile))) {

    // Append a space at the end of the line, then the end-of-sentence marker.
    String padded = rawLine.replaceAll("$", " ") + Tagger.EOS_WORD;
    IString[] words = IStrings.toIStringArray(padded.split("\\s+"));

    // The last token is skipped.
    for (int i = 0; i < words.length - 1; ++i) {
      int windowStart = Math.max(0, i - leftWindow);
      int windowEnd = Math.min(i + 1 + rightWindow, words.length);

      IString[] window = new IString[windowEnd - windowStart];
      System.arraycopy(words, windowStart, window, 0, window.length);

      Pair<IString, Float> best = getBestTag(window);
      if (i > 0) {
        System.out.print(" ");
      }

      // Position inside the window of the word being tagged.
      int focus = window.length - 1 - rightWindow;
      System.out.print(window[focus]);
      System.out.print("/");
      System.out.print(best.first.toString());
    }
    System.out.print("\n");
  }
}
 
Example #16
Source File: DependencyUtils.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Inverts a governor-to-dependents mapping into a dependent-to-governor mapping.
 *
 * @param forwardDependencies map from governor index to a pair whose second component lists the
 *        indices of its dependents; may be {@code null}
 * @return a new map from each dependent index to its governor index, or {@code null} when the
 *         input is {@code null}. If a dependent is listed under several governors, the one
 *         visited last wins.
 */
public static Map<Integer, Integer> getReverseDependencies(HashMap<Integer, Pair<IndexedWord, List<Integer>>> forwardDependencies) {
  if (forwardDependencies == null) {
    return null;
  }

  Map<Integer, Integer> governorOf = new HashMap<>();
  for (Map.Entry<Integer, Pair<IndexedWord, List<Integer>>> entry : forwardDependencies.entrySet()) {
    Integer governor = entry.getKey();
    for (Integer dependent : entry.getValue().second) {
      governorOf.put(dependent, governor);
    }
  }
  return governorOf;
}
 
Example #17
Source File: BLEUSorter.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Returns the ids of the sentences whose two scores differ noticeably, ordered by ascending
 * score difference (second system minus first system).
 *
 * <p>A sentence is kept when the first score is zero and the second is positive, or when the
 * relative change {@code |score2 / score1 - 1|} exceeds {@code minDelta}.
 *
 * @param hyps1 hypotheses of the first system; its size drives the iteration
 * @param hyps2 hypotheses of the second system, read at the same indices
 * @param incMetric1 metric used to score {@code hyps1}
 * @param incMetric2 metric used to score {@code hyps2}
 * @return sentence ids sorted by score difference
 */
static List<Integer> sortSentencesByScore(List<Sequence<IString>> hyps1,
    List<Sequence<IString>> hyps2,
    BLEUMetric<IString, String>.BLEUIncrementalMetric incMetric1,
    BLEUMetric<IString, String>.BLEUIncrementalMetric incMetric2) {
  List<Pair<Double, Integer>> deltas = new ArrayList<Pair<Double, Integer>>();

  for (int sentId = 0; sentId < hyps1.size(); ++sentId) {
    double base = incMetric1.computeLocalSmoothScore(hyps1.get(sentId), sentId);
    double other = incMetric2.computeLocalSmoothScore(hyps2.get(sentId), sentId);

    // Only keep the sentence if the difference is significant enough.
    boolean significant = (base == 0.0)
        ? other > 0.0
        : Math.abs(other / base - 1.0) > minDelta;
    if (significant) {
      deltas.add(new Pair<Double, Integer>(other - base, sentId));
    }
  }

  deltas.sort((left, right) -> left.first().compareTo(right.first()));

  List<Integer> orderedIds = new ArrayList<Integer>();
  for (Pair<Double, Integer> delta : deltas) {
    orderedIds.add(delta.second());
  }
  return orderedIds;
}
 
Example #18
Source File: IntelKBPSemgrexExtractor.java    From InformationExtraction with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Command-line entry point: fills the class options from {@code args}, builds an extractor from
 * {@code DIR}, reads the dataset in {@code TEST_FILE} and computes its accuracy.
 *
 * <p>When {@code PREDICTIONS} holds a value, predictions go to standard output if that value is
 * "stdout" (case-insensitive) and otherwise to a file of that name.
 *
 * @param args command-line options
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
    // Apply the standard Redwood configuration (the original comment says this disables SLF4J).
    RedwoodConfiguration.standard().apply();
    ArgumentParser.fillOptions(IntelKBPSemgrexExtractor.class, args);
    IntelKBPSemgrexExtractor semgrex = new IntelKBPSemgrexExtractor(DIR);
    List<Pair<KBPInput, String>> dataset = DatasetUtils.readDataset(TEST_FILE);

    semgrex.computeAccuracy(dataset.stream(), PREDICTIONS.map(target -> {
        if ("stdout".equalsIgnoreCase(target)) {
            return System.out;
        }
        try {
            return new PrintStream(new FileOutputStream(target));
        } catch (IOException e) {
            // Lambdas cannot throw checked exceptions, so the failure is wrapped.
            throw new RuntimeIOException(e);
        }
    }));
}
 
Example #19
Source File: Messages.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Determines the message type of an HTTP request and deserializes its payload.
 *
 * <p>For a recognised type, the JSON is read from the request parameter named by
 * {@code type.keyName()} and deserialized with gson into {@code type.msgClass()}. For an
 * unrecognised type the payload is a fresh {@code UnknownRequest}.
 *
 * @param request the incoming servlet request
 * @return the detected message type paired with the parsed request object
 */
@SuppressWarnings("unchecked")
public static Pair<MessageType,Request>parseRequest(HttpServletRequest request) {
  final MessageType messageType = getMessageType(request);
  Request payload = new UnknownRequest();

  boolean recognised = messageType != MessageType.UNKNOWN_REQUEST;
  if (recognised) {
    String json = request.getParameter(messageType.keyName());
    payload = (Request) gson.fromJson(json, messageType.msgClass());
  }

  return new Pair<MessageType,Request>(messageType, payload);
}
 
Example #20
Source File: TranslationLayout.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Creates an empty layout for a translation.
 *
 * <p>The column count is taken from the number of source words and the option count from the
 * number of phrases of {@code t}. All bookkeeping collections start empty and no full
 * translation rows exist yet.
 *
 * @param t the translation to lay out
 * @param rightToLeft stored in {@code RIGHT_TO_LEFT}
 */
public TranslationLayout(Translation t, boolean rightToLeft) {
  this.translation = t;
  this.numColumns = this.translation.getNumSourceWords();
  this.RIGHT_TO_LEFT = rightToLeft;
  this.numOptions = t.numPhrases();
  this.numFullTranslationRows = 0;

  this.vPhrases = new ArrayList<VisualPhrase>();
  this.vPhraseLookup = new HashMap<Phrase, VisualPhrase>();
  this.unusedRows = new LinkedList<Integer>();
  this.fullTranslations = new HashMap<String, Pair<Integer, JLabel>>();
}
 
Example #21
Source File: StopwordAnnotatorTest.java    From coreNlp with Apache License 2.0 5 votes vote down vote up
/**
 * Test to validate that lemma values are checked against the (custom) stopword list.
 *
 * <p>The annotation is a pair: the first component is the verdict for the token's word, the
 * second the verdict for its lemma. Both are compared against the custom stopword set.
 *
 * NOTE: since we're loading the pos model into memory you'll need to set the VM memory size via '-Xms512m -Xmx1048m'
 * @throws Exception
 */
@org.junit.Test
public void testStopwordsWithLemma() throws Exception {

    //setup coreNlp properties for stopwords. Note the custom stopword list and check for lemma property
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, stopword");
    props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");
    props.setProperty(StopwordAnnotator.STOPWORDS_LIST, customStopWordList);
    props.setProperty(StopwordAnnotator.CHECK_LEMMA, "true");

    //get the custom stopword set
    Set<?> stopWords = StopwordAnnotator.getStopWordList(Version.LUCENE_36, customStopWordList, true);

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(example);
    pipeline.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    for (CoreLabel token : tokens) {

        //get the stopword annotation
        Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

        //the first component is the verdict for the surface word
        String word = token.word().toLowerCase();
        if (stopWords.contains(word)) {
            assertTrue(stopword.first());
        }
        else {
            assertFalse(stopword.first());
        }

        //the second component is the verdict for the lemma; asserting first() here would only
        //re-test the word verdict and never exercise the lemma check
        String lemma = token.lemma().toLowerCase();
        if (stopWords.contains(lemma)) {
            assertTrue(stopword.second());
        }
        else {
            assertFalse(stopword.second());
        }
    }
}
 
Example #22
Source File: StopwordAnnotatorTest.java    From coreNlp with Apache License 2.0 5 votes vote down vote up
/**
 * Test to validate that words from the custom stopword list are annotated as stopwords.
 *
 * <p>The lemma check is not enabled in this configuration, so the second component of every
 * annotation must be false.
 *
 * @throws Exception
 */
@org.junit.Test
public void testCustomStopwordList() throws Exception {

    //coreNlp configuration: stopword annotator with the custom stopword list, no lemma check
    Properties config = new Properties();
    config.put("annotators", "tokenize, ssplit, stopword");
    config.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");
    config.setProperty(StopwordAnnotator.STOPWORDS_LIST, customStopWordList);

    //reference set built from the same custom list
    Set<?> expectedStopWords = StopwordAnnotator.getStopWordList(Version.LUCENE_36, customStopWordList, true);

    StanfordCoreNLP nlp = new StanfordCoreNLP(config);
    Annotation annotated = new Annotation(example);
    nlp.annotate(annotated);

    for (CoreLabel token : annotated.get(CoreAnnotations.TokensAnnotation.class)) {
        Pair<Boolean, Boolean> flags = token.get(StopwordAnnotator.class);

        //the word verdict must agree with membership in the reference set
        boolean shouldBeStopword = expectedStopWords.contains(token.word().toLowerCase());
        assertTrue(flags.first() == shouldBeStopword);

        //not checking lemma, so always false
        assertFalse(flags.second());
    }
}
 
Example #23
Source File: Edges.java    From uncc2014watsonsim with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Returns some new rules learned about a pronoun given its match
 * context from anaphora resolution.
 *
 * Specifically, we fill in the tags
 *
 * _animate(main mention, ___).
 * _gender(main mention, ___).
 * _number(main mention, ___).
 *
 * Basically, we can tell if it is animate, its gender, and its count.
 * Attributes whose value is UNKNOWN produce no edge, and a word without an
 * entry in the phrase's unpronoun map produces no edges at all.
 *
 * @param g graph used to look up and concatenate the main noun
 * @param w word whose index is looked up in the unpronoun map
 * @param t phrase supplying the unpronoun map
 * @return A list of semantic notes.
 */
public static List<Edge> generatePronounEdges(
		SemanticGraph g, IndexedWord w, Phrase t) {
	List<Edge> notes = new ArrayList<>();
	if (!t.getUnpronoun().containsKey(w.index())) {
		return notes;
	}

	// Use what we know about the pronoun
	Pair<CorefMention, CorefMention> link = t.getUnpronoun().get(w.index());
	String mainNoun = Trees.concatNoun(g, g.getNodeByIndex(link.second.headIndex));
	CorefMention mention = link.first;

	if (mention.animacy != Animacy.UNKNOWN) {
		notes.add(new Edge(mainNoun, "_animate", mention.animacy.toString()));
	}
	if (mention.gender != Gender.UNKNOWN) {
		notes.add(new Edge(mainNoun, "_gender", mention.gender.toString()));
	}
	if (mention.number != Dictionaries.Number.UNKNOWN) {
		notes.add(new Edge(mainNoun, "_number", mention.number.toString()));
	}
	return notes;
}
 
Example #24
Source File: StopwordAnnotatorTest.java    From coreNlp with Apache License 2.0 5 votes vote down vote up
/**
 * Test to validate that stopwords are properly annotated in the token list when the annotator
 * runs without a custom list; verdicts are compared against Lucene's English stopword set.
 *
 * @throws Exception
 */
@org.junit.Test
public void testLuceneStopwordList() throws Exception {
    Properties config = new Properties();
    config.put("annotators", "tokenize, ssplit, stopword");
    config.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");

    StanfordCoreNLP nlp = new StanfordCoreNLP(config);
    Annotation annotated = new Annotation(example);
    nlp.annotate(annotated);
    List<CoreLabel> tokens = annotated.get(CoreAnnotations.TokensAnnotation.class);

    //the standard lucene stopword set is the reference
    Set<?> expectedStopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    for (CoreLabel token : tokens) {
        Pair<Boolean, Boolean> flags = token.get(StopwordAnnotator.class);

        //the word verdict must agree with membership in the reference set
        boolean shouldBeStopword = expectedStopWords.contains(token.word().toLowerCase());
        assertTrue(flags.first() == shouldBeStopword);

        //not checking lemma, so always false
        assertFalse(flags.second());
    }
}
 
Example #25
Source File: Edges.java    From uncc2014watsonsim with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Get the full text of the main mention of a particular word, if it has a
 * better mention. Otherwise just get its segment of the tree using
 * concatNoun()
 *
 * @param phrase phrase supplying the unpronoun map
 * @param graph graph passed to concatNoun() when no linked mention exists
 * @param word word whose index is looked up
 * @return the mention span of the linked main mention, or the concatenated noun
 */
public static String getMainMention(
		Phrase phrase, SemanticGraph graph, IndexedWord word) {
	Pair<CorefMention, CorefMention> link = phrase.getUnpronoun().get(word.index());
	return (link != null)
			? link.second.mentionSpan
			: Trees.concatNoun(graph, word);
}
 
Example #26
Source File: StopwordAnnotator.java    From coreNlp with Apache License 2.0 5 votes vote down vote up
/**
 * Marks every token of the annotation with a pair of stopword verdicts.
 *
 * <p>The first component tells whether the lower-cased word is in {@code stopwords}. The second
 * tells whether the lower-cased lemma is; it is always {@code false} when {@code checkLemma} is
 * off or the token has no lemma. Nothing happens when the stopword set is null or empty, or when
 * the annotation has no tokens.
 *
 * @param annotation the document to annotate in place
 */
@Override
public void annotate(Annotation annotation) {
    if (stopwords != null && stopwords.size() > 0 && annotation.containsKey(TokensAnnotation.class)) {
        List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            boolean isWordStopword = stopwords.contains(token.word().toLowerCase());

            // Look at the lemma, not the word again: testing the word twice made the second
            // flag a copy of the first. A missing lemma counts as "not a stopword".
            String lemma = checkLemma ? token.lemma() : null;
            boolean isLemmaStopword = lemma != null && stopwords.contains(lemma.toLowerCase());

            Pair<Boolean, Boolean> pair = Pair.makePair(isWordStopword, isLemmaStopword);
            token.set(StopwordAnnotator.class, pair);
        }
    }
}
 
Example #27
Source File: Preprocess.java    From ADW with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Normalizes every sentence pair (abbreviations, hyphens, currency) and writes the results, one
 * tab-separated pair per line, to the offset-map path of {@code ADWConfiguration}. An existing
 * file is overwritten and a progress line is printed per pair.
 *
 * <p>Failures are reported by printing the stack trace; nothing is rethrown.
 *
 * <p>NOTE(review): {@code outPath} is never used; the output always goes to
 * {@code getOffsetMapPath()} -- confirm whether that is intended.
 *
 * @param pairs the sentence pairs to normalize
 * @param outPath currently unused
 */
public static void pipeline(List<Pair<String,String>> pairs, String outPath)
{
	// try-with-resources flushes and closes the writer even when a pair fails mid-way.
	try (BufferedWriter bw = new BufferedWriter(new FileWriter(ADWConfiguration.getInstance().getOffsetMapPath(), false)))
	{
		int i = 1;
		for(Pair<String,String> aPair : pairs)
		{
			System.out.println("[working on "+ i++ +"]");
			String first = aPair.first;
			String second = aPair.second;

			//n't & 'm => not and am
			first = fixAbbrev(first);
			second = fixAbbrev(second);

			//remove hyphens
			first = removeHyphens(first);
			second = removeHyphens(second);

			//fixCurrency
			first = fixCurrency(first);
			second = fixCurrency(second);

			//re-attach sentence-final periods that are preceded by a space
			bw.write(first.replace(" .", ".")+"\t"+second.replace(" .", ".")+"\n");
		}
	}
	catch(Exception e)
	{
		e.printStackTrace();
	}
}
 
Example #28
Source File: Preprocess.java    From ADW with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Runs one of two checks over the pairs.
 *
 * <p>With {@code flag} set, {@code mirrorCompounder} is applied and nothing is written.
 * Otherwise every pair is passed through {@code spellCorrect} and written to {@code filePath},
 * one tab-separated pair per line, overwriting an existing file. Failures while writing are
 * reported by printing the stack trace; nothing is rethrown.
 *
 * @param pairs the pairs to process
 * @param flag {@code true} for mirror-compounding, {@code false} for spell-checking
 * @param filePath destination file of the spell-checked pairs
 */
public static void manualCheck(List<Pair<String,String>> pairs, boolean flag, String filePath)
{
	if(flag)
	{
		//mirror-compounding
		mirrorCompounder(pairs);
	}
	else
	{
		// try-with-resources flushes and closes the writer even when a pair fails mid-way.
		try (BufferedWriter bw = new BufferedWriter(new FileWriter(filePath, false)))
		{
			//spellchecking
			for(Pair<String,String> aPair : pairs)
			{
				Pair<String,String> fixedPair = spellCorrect(aPair);
				bw.write(fixedPair.first+"\t"+fixedPair.second+"\n");
			}
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
	}
}
 
Example #29
Source File: PairSimilarity.java    From ADW with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Delegates to {@code TextualSimilarity.fixPOSmirroring} for the two cooked sentences.
 *
 * <p>An earlier guard that aborted when the second sentence was empty is disabled; empty input
 * is passed through unchanged.
 *
 * @param firstCookedSentence terms of the first sentence
 * @param secondCookedSentence terms of the second sentence
 * @return whatever {@code fixPOSmirroring} returns for the two sentences
 */
public Pair<List<String>,List<String>> mirrorPosTags(List<String> firstCookedSentence, List<String> secondCookedSentence) 
{
	Pair<List<String>,List<String>> mirrored =
			TextualSimilarity.fixPOSmirroring(firstCookedSentence, secondCookedSentence);
	return mirrored;
}
 
Example #30
Source File: ADW.java    From ADW with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Checks whether {@code input} is valid for the given item type.
 *
 * @param input the string to validate
 * @param type the type the string is expected to have
 * @return a pair of (verdict, human-readable message); an invalid input is additionally logged
 *         as a warning with the same message
 */
public Pair<Boolean,String> evaluateInputType(String input, ItemType type)
{
	if(checkType(input, type))
	{
		return new Pair<Boolean,String>(true,"Valid input type for "+ type +" and string \""+ input +"\".");
	}

	// Build the complaint once and use it for both the log entry and the result.
	String complaint = "Invalid input type for "+ type +" and string \""+ input +"\"! Please check the input type.";
	log.warn(complaint);
	return new Pair<Boolean,String>(false,complaint);
}