Java Code Examples for edu.stanford.nlp.stats.Counter#incrementCount()

The following examples show how to use edu.stanford.nlp.stats.Counter#incrementCount(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: DocumentFrequencyCounter.java    From wiseowl with MIT License 6 votes vote down vote up
/**
 * Build a document-frequency style counter over the nouns of one document.
 *
 * @param document raw document text
 * @return counter mapping each noun token (POS tag starting with "n") to its
 *         occurrence count in sentences no longer than MAX_SENTENCE_LENGTH
 */
private static Counter<String> getIDFMapForDocument(String document) {
  // Strip Gigaword heading separators up front: they slow tagging down and
  // contribute nothing to the counts.
  document = headingSeparator.matcher(document).replaceAll("");

  DocumentPreprocessor preprocessor = new DocumentPreprocessor(new StringReader(document));
  preprocessor.setTokenizerFactory(tokenizerFactory);

  Counter<String> idfMap = new ClassicCounter<String>();
  for (List<HasWord> sentence : preprocessor) {
    // Very long sentences are skipped entirely rather than tagged.
    if (sentence.size() > MAX_SENTENCE_LENGTH) {
      continue;
    }
    for (TaggedWord taggedWord : tagger.tagSentence(sentence)) {
      // Only nouns participate in the IDF map.
      if (taggedWord.tag().startsWith("n")) {
        idfMap.incrementCount(taggedWord.word());
      }
    }
  }

  return idfMap;
}
 
Example 2
Source File: DependencyBnBPreorderer.java    From phrasal with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Read all whitespace-separated tokens from the reader and return the
 * (at most) k most frequent token types.
 *
 * @param reader source of whitespace-delimited tokens
 * @param k maximum number of token types to return
 * @return the top-k most frequent tokens
 * @throws IOException if reading from the reader fails
 */
private static Set<String> getMostFrequentTokens(LineNumberReader reader, int k) throws IOException {

  Counter<String> tokenCounts = new ClassicCounter<String>();

  String line;
  while ((line = reader.readLine()) != null) {
    // Idiomatic Java array declaration (was C-style `String tokens[]`).
    String[] tokens = line.split("\\s+");
    for (String t : tokens) {
      tokenCounts.incrementCount(t);
    }
  }

  // Prune the counter to its k highest-count entries, then copy the keys out.
  Counters.retainTop(tokenCounts, k);
  Set<String> mostFrequentTokens = new HashSet<>(tokenCounts.keySet());
  // Removed the dead `tokenCounts = null;` store: the local goes out of
  // scope immediately and nulling it has no effect on GC here.
  return mostFrequentTokens;
}
 
Example 3
Source File: OptimizerUtils.java    From phrasal with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Update an existing feature whitelist according to nbestlists. Each feature is
 * counted at most once per n-best list (segment). Then return the features whose
 * segment count is at least minSegmentCount.
 *
 * @param featureWhitelist running counter of per-feature segment counts; updated in place
 * @param nbestlists one n-best list per tuning segment
 * @param minSegmentCount minimum number of segments a feature must appear in
 * @return features that appear in at least minSegmentCount segments
 */
public static Set<String> updatefeatureWhiteList(
    Counter<String> featureWhitelist,
    List<List<RichTranslation<IString, String>>> nbestlists,
    int minSegmentCount) {
  for (List<RichTranslation<IString, String>> nbestlist : nbestlists) {
    Set<String> segmentFeatureSet = new HashSet<String>(1000);
    for (RichTranslation<IString, String> trans : nbestlist) {
      for (FeatureValue<String> feature : trans.features) {
        // Set#add returns true only on first insertion, replacing the
        // original contains()+add() double lookup with a single one.
        if (segmentFeatureSet.add(feature.name)) {
          featureWhitelist.incrementCount(feature.name);
        }
      }
    }
  }
  return Counters.keysAbove(featureWhitelist, minSegmentCount - 1);
}
 
Example 4
Source File: OptimizerUtils.java    From phrasal with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Count, for every feature, the number of n-best lists (segments) in which it
 * occurs at least once, and return the features whose segment count reaches
 * minSegmentCount.
 *
 * @param nbest the flat n-best list over all segments
 * @param minSegmentCount minimum number of segments a feature must occur in
 * @return the whitelisted feature names
 */
public static Set<String> featureWhiteList(FlatNBestList nbest, int minSegmentCount) {
  List<List<ScoredFeaturizedTranslation<IString, String>>> nbestlists = nbest.nbestLists();
  Counter<String> featureSegmentCounts = new ClassicCounter<String>();
  for (List<ScoredFeaturizedTranslation<IString, String>> nbestlist : nbestlists) {
    // First gather the distinct features of this segment, then count each once.
    Set<String> featuresInSegment = new HashSet<String>();
    for (ScoredFeaturizedTranslation<IString, String> translation : nbestlist) {
      for (FeatureValue<String> feature : translation.features) {
        featuresInSegment.add(feature.name);
      }
    }
    for (String featureName : featuresInSegment) {
      featureSegmentCounts.incrementCount(featureName);
    }
  }
  return Counters.keysAbove(featureSegmentCounts, minSegmentCount - 1);
}
 
Example 5
Source File: PairwiseRankingOptimizer.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
@Override
public Counter<String> optimize(Counter<String> initialWts) {
  // Work on a normalized copy so the caller's weight vector is not mutated.
  Counter<String> wts = new ClassicCounter<String>(initialWts);
  Counters.normalize(wts);
  // Derive a deterministic RNG seed from the largest normalized weight,
  // scaled toward Long.MAX_VALUE in log space to avoid overflow.
  double seedSeed = Math.abs(Counters.max(wts));
  long seed = (long)Math.exp(Math.log(seedSeed) + Math.log(Long.MAX_VALUE));
  System.err.printf("PRO thread using random seed: %d\n", seed);
  // Sample pairwise-ranking training examples, then fit a binary logistic
  // model with an L2 (Gaussian) prior of strength l2sigma.
  RVFDataset<String, String> proSamples = getSamples(new Random(seed));
  LogPrior lprior = new LogPrior();
  lprior.setSigma(l2sigma);
  LogisticClassifierFactory<String,String> lcf = new LogisticClassifierFactory<String,String>();
  LogisticClassifier<String, String> lc = lcf.trainClassifier(proSamples, lprior, false);
  Counter<String> decoderWeights = new ClassicCounter<String>(); 
  Counter<String> lcWeights = lc.weightsAsCounter();
  // The classifier prefixes each feature name with its class label
  // ("1 / " or "0 / "); strip the prefix and fold both classes into a
  // single decoder weight, negating the "0" class contribution.
  for (String key : lcWeights.keySet()) {
    double mul;
    if (key.startsWith("1 / ")) {
      mul = 1.0;
    } else if (key.startsWith("0 / ")) {
      mul = -1.0;
    } else {
      throw new RuntimeException("Unparsable weight name produced by logistic classifier: "+key);
    }
    String decoderKey = key.replaceFirst("^[10] / ", "");
    decoderWeights.incrementCount(decoderKey, mul*lcWeights.getCount(key));
  }

  // Publish weights to MERT once (guarded by the shared bestWts lock) so a
  // best point always exists even before any thread improves the metric.
  synchronized (MERT.bestWts) {
    if (!updatedBestOnce) {
      System.err.println("Force updating weights (once)");
      double metricEval = MERT.evalAtPoint(nbest, decoderWeights, emetric);
      MERT.updateBest(decoderWeights, metricEval, true);
      updatedBestOnce = true;
    }
  }
  return decoderWeights;
}
 
Example 6
Source File: AbstractOnlineOptimizer.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Compute the gradient over a mini-batch of tuning segments, optionally adding
 * the derivative of an L2 regularizer scaled by the batch's fraction of the
 * tuning set.
 *
 * @param weights current model weights
 * @param sources source sentences for this batch
 * @param sourceIds tuning-set indices of the batch items
 * @param translations n-best lists per batch item (empty list = decoder failure)
 * @param references reference translations per batch item
 * @param referenceWeights weights over the references
 * @param scoreMetric sentence-level evaluation metric
 * @return the accumulated (possibly regularized) batch gradient
 */
@Override
public Counter<String> getBatchGradient(Counter<String> weights,
    List<Sequence<IString>> sources, int[] sourceIds,
    List<List<RichTranslation<IString, String>>> translations,
    List<List<Sequence<IString>>> references,
    double[] referenceWeights,
    SentenceLevelMetric<IString, String> scoreMetric) {
  Counter<String> batchGradient = new ClassicCounter<String>();

  for (int i = 0; i < sourceIds.length; i++) {
    // Skip decoder failures (empty n-best lists).
    if (translations.get(i).size() > 0) {
      Counter<String> unregularizedGradient = getUnregularizedGradient(weights, sources.get(i), sourceIds[i], translations.get(i), references.get(i), referenceWeights, scoreMetric);
      batchGradient.addAll(unregularizedGradient);
    }
  }

  // Add L2 regularization directly into the derivative
  if (this.l2Regularization) {
    // NOTE(review): the original also called features.addAll(weights.keySet())
    // immediately after constructing features from that same key set — a
    // provable no-op, removed here. If the intent was to also include
    // batchGradient.keySet(), confirm with the author.
    final Set<String> features = new HashSet<String>(weights.keySet());
    final double dataFraction = sourceIds.length / (double) tuneSetSize;
    final double scaledInvSigmaSquared = dataFraction / (2 * sigmaSq);
    for (String key : features) {
      double x = weights.getCount(key);
      batchGradient.incrementCount(key, x * scaledInvSigmaSquared);
    }
  }

  return batchGradient;
}
 
Example 7
Source File: OptimizerUtils.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Accumulate a collection of feature values into a counter, summing the
 * values of entries that share a feature name.
 *
 * @param c feature values to accumulate
 * @return counter mapping feature name to the sum of its values
 */
public static <T> Counter<T> featureValueCollectionToCounter(Collection<FeatureValue<T>> c) {
  Counter<T> result = new ClassicCounter<T>();
  for (FeatureValue<T> featureValue : c) {
    result.incrementCount(featureValue.name, featureValue.value);
  }
  return result;
}
 
Example 8
Source File: MERT.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Sum the feature vectors of every translation in the list into one counter.
 *
 * @param trans translations whose feature values are accumulated
 * @return counter holding, per feature name, the sum of its values over all translations
 */
public static Counter<String> summarizedAllFeaturesVector(
    List<ScoredFeaturizedTranslation<IString, String>> trans) {
  Counter<String> featureSums = new ClassicCounter<String>();

  for (ScoredFeaturizedTranslation<IString, String> translation : trans) {
    for (FeatureValue<String> featureValue : translation.features) {
      featureSums.incrementCount(featureValue.name, featureValue.value);
    }
  }

  return featureSums;
}
 
Example 9
Source File: MetricUtils.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Count every n-gram of the sequence with length between 1 and maxOrder.
 *
 * @param <TK> token type
 * @param sequence the token sequence to scan
 * @param maxOrder maximum n-gram length to extract
 * @return counter over all subsequences of length 1..maxOrder
 */
static public <TK> Counter<Sequence<TK>> getNGramCounts(Sequence<TK> sequence, int maxOrder) {
  Counter<Sequence<TK>> ngramCounts = new ClassicCounter<>();
  final int length = sequence.size();
  for (int start = 0; start < length; start++) {
    // End index is exclusive and capped so n-grams never run past the sequence.
    final int lastEnd = Math.min(length, start + maxOrder);
    for (int end = start + 1; end <= lastEnd; end++) {
      ngramCounts.incrementCount(sequence.subsequence(start, end));
    }
  }
  return ngramCounts;
}
 
Example 10
Source File: Summarizer.java    From wiseowl with MIT License 5 votes vote down vote up
/**
 * Tally the raw term frequency of every token across the given sentences.
 *
 * @param sentences annotated sentences whose tokens are counted
 * @return counter from token text to number of occurrences
 */
private static Counter<String> getTermFrequencies(List<CoreMap> sentences) {
  Counter<String> termCounts = new ClassicCounter<String>();

  for (CoreMap sentence : sentences) {
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      termCounts.incrementCount(token.get(CoreAnnotations.TextAnnotation.class));
    }
  }

  return termCounts;
}
 
Example 11
Source File: CoverageChecker.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Tally every n-gram of the line up to the given order, optionally restricted
 * to n-grams present in limitSet.
 *
 * @param line whitespace-separated tokens
 * @param ngramCounts counter updated in place
 * @param limitSet if non-null, only n-grams contained in this set are counted
 * @param order maximum n-gram length
 */
static public void countNgrams(String line, Counter<String> ngramCounts, Set<String> limitSet, int order) {
  String[] tokens = line.split("\\s");
  for (int start = 0; start < tokens.length; start++) {
    // End index (exclusive) ranges over n-gram lengths 1..order, clipped to the line.
    int maxEnd = Math.min(tokens.length, start + order);
    for (int end = start + 1; end <= maxEnd; end++) {
      String ngram = Sentence.listToString(Arrays.asList(Arrays.copyOfRange(tokens, start, end)));
      if (limitSet == null || limitSet.contains(ngram)) {
        ngramCounts.incrementCount(ngram);
      }
    }
  }
}
 
Example 12
Source File: FeatureValues.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Convert a collection of feature values to a counter, summing the values of
 * entries that share a feature name.
 *
 * @param featureValues the feature values to accumulate
 * @return counter mapping feature name to summed value
 */
public static <T> Counter<T> toCounter(Collection<FeatureValue<T>> featureValues) {
  Counter<T> accumulator = new ClassicCounter<T>();
  for (FeatureValue<T> featureValue : featureValues) {
    accumulator.incrementCount(featureValue.name, featureValue.value);
  }
  return accumulator;
}
 
Example 13
Source File: ComputeBitextIDF.java    From phrasal with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Compute document frequencies over a bitext read from standard input and
 * write per-word-type IDF values (log of #sentences / #sentences containing
 * the type) to standard output. Each input line is treated as one document.
 *
 * @param args unused; the program reads from stdin and prints usage if any
 *             arguments are supplied
 */
public static void main(String[] args) {
  if (args.length > 0) {
    System.err.printf("Usage: java %s < files > idf-file%n", ComputeBitextIDF.class.getName());
    System.exit(-1);
  }

  Counter<String> documentsPerTerm = new ClassicCounter<String>(1000000);
  double nDocuments = 0.0;
  // try-with-resources closes the reader even when readLine() throws; the
  // original only reached close() on the success path.
  try (LineNumberReader reader = new LineNumberReader(new InputStreamReader(System.in))) {
    for (String line; (line = reader.readLine()) != null;) {
      String[] tokens = line.trim().split("\\s+");
      Set<String> seen = new HashSet<String>(tokens.length);
      for (String token : tokens) {
        // Set#add returns true on first insertion: count each type once per line.
        if (seen.add(token)) {
          documentsPerTerm.incrementCount(token);
        }
      }
    }
    nDocuments = reader.getLineNumber();
  } catch (IOException e) {
    e.printStackTrace();
  }

  // Output the idfs
  System.err.printf("Bitext contains %d sentences and %d word types%n", (int) nDocuments, documentsPerTerm.keySet().size());
  for (String wordType : documentsPerTerm.keySet()) {
    double count = documentsPerTerm.getCount(wordType);
    System.out.printf("%s\t%f%n", wordType, Math.log(nDocuments / count));
  }
  // Unknown words are assigned the maximum IDF (as if seen in one document).
  System.out.printf("%s\t%f%n", UNK_TOKEN, Math.log(nDocuments / 1.0));
}
 
Example 14
Source File: BasicPowellOptimizer.java    From phrasal with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Powell-style coordinate optimization of decoder weights: line-search along
 * each axis direction in turn, then along the combined (net-movement)
 * direction, iterating until the objective change falls below
 * MERT.MIN_OBJECTIVE_DIFF.
 *
 * @param initialWts starting weight vector (used directly, not copied)
 * @return the optimized weight vector
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public Counter<String> optimize(Counter<String> initialWts) {
  Counter<String> wts = initialWts;

  // initialize search directions: one unit vector per feature, in sorted
  // name order for determinism
  List<Counter<String>> axisDirs = new ArrayList<Counter<String>>(
      initialWts.size());
  List<String> featureNames = new ArrayList<String>(wts.keySet());
  Collections.sort(featureNames);
  for (String featureName : featureNames) {
    Counter<String> dir = new ClassicCounter<String>();
    dir.incrementCount(featureName);
    axisDirs.add(dir);
  }

  // main optimization loop
  Counter[] p = new ClassicCounter[axisDirs.size()];
  double objValue = MERT.evalAtPoint(nbest, wts, emetric); // obj value w/o
  // smoothing
  List<Counter<String>> dirs = null;
  for (int iter = 0;; iter++) {
    if (iter % p.length == 0) {
      // reset after N iterations to avoid linearly dependent search
      // directions
      System.err.printf("%d: Search direction reset\n", iter);
      dirs = new ArrayList<Counter<String>>(axisDirs);
    }
    // search along each direction; p[i] is the best point found after
    // searching directions 0..i
    assert (dirs != null);
    p[0] = mert.lineSearch(nbest, wts, dirs.get(0), emetric);
    for (int i = 1; i < p.length; i++) {
      p[i] = mert.lineSearch(nbest, (Counter<String>) p[i - 1], dirs.get(i),
          emetric);
      dirs.set(i - 1, dirs.get(i)); // shift search directions
    }

    // Net improvement over the full sweep; stop when it is negligible.
    double totalWin = MERT.evalAtPoint(nbest, p[p.length - 1], emetric)
        - objValue;
    System.err.printf("%d: totalWin: %e Objective: %e\n", iter, totalWin,
        objValue);
    if (Math.abs(totalWin) < MERT.MIN_OBJECTIVE_DIFF)
      break;

    // construct combined direction: p[last] - wts, i.e. the sweep's net movement
    Counter<String> combinedDir = new ClassicCounter<String>(wts);
    Counters.multiplyInPlace(combinedDir, -1.0);
    combinedDir.addAll(p[p.length - 1]);

    dirs.set(p.length - 1, combinedDir);

    // search along combined direction
    wts = mert.lineSearch(nbest, (Counter<String>) p[p.length - 1],
        dirs.get(p.length - 1), emetric);
    objValue = MERT.evalAtPoint(nbest, wts, emetric);
    System.err.printf("%d: Objective after combined search %e\n", iter,
        objValue);
  }

  return wts;
}
 
Example 15
Source File: LineSearchOptimizer.java    From phrasal with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Run a single line search from the initial weights along the axis of the
 * configured feature (unit direction on featureName).
 */
@Override
public Counter<String> optimize(final Counter<String> initialWts) {
  Counter<String> direction = new ClassicCounter<String>();
  direction.incrementCount(featureName, 1.0);
  return mert.lineSearch(nbest, initialWts, direction, emetric);
}
 
Example 16
Source File: PowellOptimizer.java    From phrasal with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Full Powell's method over decoder weights: per-iteration sweep of line
 * searches along each stored direction, tracking the single direction that
 * yielded the biggest win, then applying Powell's acceptance test to decide
 * whether to replace that direction with the combined (net-movement)
 * direction before the next iteration. Terminates when an iteration's total
 * objective change falls below MERT.MIN_OBJECTIVE_DIFF.
 *
 * @param initialWts starting weight vector (used directly, not copied)
 * @return the optimized weight vector
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public Counter<String> optimize(Counter<String> initialWts) {

  Counter<String> wts = initialWts;

  // initialize search directions: one unit vector per feature, in sorted
  // name order for determinism
  List<Counter<String>> dirs = new ArrayList<Counter<String>>(
      initialWts.size());
  List<String> featureNames = new ArrayList<String>(wts.keySet());
  Collections.sort(featureNames);
  for (String featureName : featureNames) {
    Counter<String> dir = new ClassicCounter<String>();
    dir.incrementCount(featureName);
    dirs.add(dir);
  }

  // main optimization loop
  Counter[] p = new ClassicCounter[dirs.size()];
  double objValue = MERT.evalAtPoint(nbest, wts, emetric); // obj value w/o
  // smoothing
  for (int iter = 0;; iter++) {
    // search along each direction; p[i] is the point after searching
    // directions 0..i, and biggestWin tracks the single best direction
    p[0] = mert.lineSearch(nbest, wts, dirs.get(0), emetric);
    double eval = MERT.evalAtPoint(nbest, p[0], emetric);
    double biggestWin = Math.max(0, eval - objValue);
    System.err.printf("initial totalWin: %e (%e-%e)\n", biggestWin, eval,
        objValue);
    System.err.printf("apply @ wts: %e\n",
        MERT.evalAtPoint(nbest, wts, emetric));
    System.err.printf("apply @ p[0]: %e\n",
        MERT.evalAtPoint(nbest, p[0], emetric));
    objValue = eval;
    int biggestWinId = 0;
    double totalWin = biggestWin;
    double initObjValue = objValue;
    for (int i = 1; i < p.length; i++) {
      p[i] = mert.lineSearch(nbest, (Counter<String>) p[i - 1], dirs.get(i),
          emetric);
      eval = MERT.evalAtPoint(nbest, p[i], emetric);
      if (Math.max(0, eval - objValue) > biggestWin) {
        biggestWin = eval - objValue;
        biggestWinId = i;
      }
      totalWin += Math.max(0, eval - objValue);
      System.err.printf("\t%d totalWin: %e(%e-%e)\n", i, totalWin, eval,
          objValue);
      objValue = eval;
    }

    System.err.printf("%d: totalWin %e biggestWin: %e objValue: %e\n", iter,
        totalWin, biggestWin, objValue);

    // construct combined direction: p[last] - wts, the sweep's net movement
    Counter<String> combinedDir = new ClassicCounter<String>(wts);
    Counters.multiplyInPlace(combinedDir, -1.0);
    combinedDir.addAll(p[p.length - 1]);

    // check to see if we should replace the dominant 'win' direction
    // during the last iteration of search with the combined search direction
    // (Powell's acceptance criterion, evaluated at an extrapolated test point)
    Counter<String> testPoint = new ClassicCounter<String>(p[p.length - 1]);
    testPoint.addAll(combinedDir);
    double testPointEval = MERT.evalAtPoint(nbest, testPoint, emetric);
    double extrapolatedWin = testPointEval - objValue;
    System.err.printf("Test Point Eval: %e, extrapolated win: %e\n",
        testPointEval, extrapolatedWin);
    if (extrapolatedWin > 0
        && 2 * (2 * totalWin - extrapolatedWin)
            * Math.pow(totalWin - biggestWin, 2.0) < Math.pow(
            extrapolatedWin, 2.0) * biggestWin) {
      System.err.printf(
          "%d: updating direction %d with combined search dir\n", iter,
          biggestWinId);
      MERT.normalize(combinedDir);
      dirs.set(biggestWinId, combinedDir);
    }

    // Search along combined dir even if replacement didn't happen
    wts = mert.lineSearch(nbest, p[p.length - 1], combinedDir, emetric);
    eval = MERT.evalAtPoint(nbest, wts, emetric);
    System.err.printf(
        "%d: Objective after combined search (gain: %e prior:%e)\n", iter,
        eval - objValue, objValue);

    objValue = eval;

    double finalObjValue = objValue;
    System.err.printf("Actual win: %e (%e-%e)\n", finalObjValue
        - initObjValue, finalObjValue, initObjValue);
    if (Math.abs(initObjValue - finalObjValue) < MERT.MIN_OBJECTIVE_DIFF)
      break; // changed to prevent infinite loops
  }

  return wts;
}
 
Example 17
Source File: CRFPostprocessor.java    From phrasal with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Evaluate the postprocessor on the test file specified in the flags.
 * Classifier output is written to "apply.out"; overall and per-label
 * accuracies are written to pwOut.
 *
 * @param preProcessor the preprocessor used to read the gold test file
 * @param pwOut destination for the evaluation report
 */
protected void evaluate(Preprocessor preProcessor, PrintWriter pwOut) {
  System.err.println("Starting evaluation...");
  DocumentReaderAndWriter<CoreLabel> docReader = new ProcessorTools.PostprocessorDocumentReaderAndWriter(preProcessor);
  ObjectBank<List<CoreLabel>> lines =
    classifier.makeObjectBankFromFile(flags.testFile, docReader);

  Counter<String> labelTotal = new ClassicCounter<String>();
  Counter<String> labelCorrect = new ClassicCounter<String>();
  int total = 0;
  int correct = 0;
  // try-with-resources: the original leaked the writer if classification threw.
  try (PrintWriter pw = new PrintWriter(IOTools.getWriterFromFile("apply.out"))) {
    for (List<CoreLabel> line : lines) {
      line = classifier.classify(line);
      pw.println(Sentence.listToString(ProcessorTools.toPostProcessedSequence(line)));
      total += line.size();
      for (CoreLabel label : line) {
        String hypothesis = label.get(CoreAnnotations.AnswerAnnotation.class);
        String reference = label.get(CoreAnnotations.GoldAnswerAnnotation.class);
        labelTotal.incrementCount(reference);
        if (hypothesis.equals(reference)) {
          correct++;
          labelCorrect.incrementCount(reference);
        }
      }
    }
  }

  // NOTE(review): if the test file is empty, total == 0 and accuracy is NaN,
  // matching the original behavior.
  double accuracy = ((double) correct) / ((double) total);
  accuracy *= 100.0;

  pwOut.println("EVALUATION RESULTS");
  pwOut.printf("#datums:\t%d%n", total);
  pwOut.printf("#correct:\t%d%n", correct);
  pwOut.printf("accuracy:\t%.2f%n", accuracy);
  pwOut.println("==================");

  // Output the per label accuracies
  pwOut.println("PER LABEL ACCURACIES");
  for (String refLabel : labelTotal.keySet()) {
    double nTotal = labelTotal.getCount(refLabel);
    double nCorrect = labelCorrect.getCount(refLabel);
    double acc = (nCorrect / nTotal) * 100.0;
    pwOut.printf(" %s\t%.2f%n", refLabel, acc);
  }
}
 
Example 18
Source File: KBPStatisticalExtractor.java    From InformationExtraction with GNU General Public License v3.0 2 votes vote down vote up
/**
 * Add an indicator feature formed from a feature template and its value.
 *
 * <p>
 *   Features commonly fall into <i>feature templates</i> plus an associated
 *   value (e.g. unigram features: template "unigram", value the word itself).
 *   Encoding both in one string, joined by a separator character, makes it
 *   easy to recover the template later for post-hoc feature selection.
 * </p>
 *
 * <p>
 *   Spaces in the feature value are replaced with a special character so the
 *   resulting feature name contains no whitespace.
 * </p>
 *
 * @param features The feature counter we are updating.
 * @param featureTemplate The feature template to add a value to.
 * @param featureValue The value of the feature template. This is joined with the template, so it
 *                     need only be unique within the template.
 */
private static void indicator(Counter<String> features, String featureTemplate, String featureValue) {
  // Sanitize spaces, then join template and value with the "ℵ" separator.
  String sanitizedValue = featureValue.replace(' ', 'ˑ');
  features.incrementCount(featureTemplate + "ℵ" + sanitizedValue);
}
 
Example 19
Source File: KBPStatisticalExtractor.java    From InformationExtraction with GNU General Public License v3.0 2 votes vote down vote up
/**
 * Increment the indicator feature built by joining a feature template with
 * its (space-sanitized) value.
 *
 * <p>
 * Features often split naturally into a <i>template</i> and a value — for
 * unigram features, the template is "unigram" and the value is the word.
 * Joining the two into a single feature string allows the template to be
 * recovered later, e.g. for post-hoc feature selection by template.
 * </p>
 *
 * <p>
 * Spaces in the feature value are replaced with a special character so the
 * combined feature name stays whitespace-free.
 * </p>
 *
 * @param features        The feature counter we are updating.
 * @param featureTemplate The feature template to add a value to.
 * @param featureValue    The value of the feature template. This is joined with the template, so it
 *                        need only be unique within the template.
 */
private static void indicator(Counter<String> features, String featureTemplate, String featureValue) {
    // Join with the "ℵ" separator after replacing spaces in the value.
    String joined = featureTemplate + "ℵ" + featureValue.replace(' ', 'ˑ');
    features.incrementCount(joined);
}