opennlp.tools.postag.POSSample Java Examples

The following examples show how to use opennlp.tools.postag.POSSample. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Chapter1.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 6 votes vote down vote up
/**
 * Tags a fixed example sentence and prints the result in two formats.
 *
 * <p>The POS model is loaded from a hard-coded file location, the sentence is
 * split on whitespace, and the resulting tokens are tagged. The tokens and
 * tags are then wrapped in a {@link POSSample} and printed once from the
 * sample ("token - tag") and once from the raw arrays ("token[tag]").
 */
private static void detectingPartsOfSpeechExample() {
    String sentence = "POS processing is useful for enhancing the "
            + "quality of data sent to other elements of a pipeline.";

    POSModel model = new POSModelLoader()
            .load(new File("C:/Current Books/NLP and Java/Models/", "en-pos-maxent.bin"));
    POSTaggerME tagger = new POSTaggerME(model);

    String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sentence);
    String[] tags = tagger.tag(tokens);

    // Read the tokens and tags back through the sample object.
    POSSample sample = new POSSample(tokens, tags);
    String[] posTokens = sample.getSentence();
    String[] posTags = sample.getTags();
    for (int i = 0; i < posTokens.length; i++) {
        // Trailing space keeps consecutive "token - tag" pairs from running together.
        System.out.print(posTokens[i] + " - " + posTags[i] + " ");
    }
    System.out.println();

    for (int i = 0; i < tokens.length; i++) {
        System.out.print(tokens[i] + "[" + tags[i] + "] ");
    }
    // Terminate the second output line as well.
    System.out.println();
}
 
Example #2
Source File: OpenNlpPosRecommender.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Trains a POS model on the samples extracted from the given CASes and stores
 * it in the recommender context under {@code KEY_MODEL}.
 *
 * <p>Nothing is trained or stored when fewer than two samples are available.
 *
 * @param aContext
 *            the context that receives the trained model
 * @param aCasses
 *            the CASes to extract training samples from
 * @throws RecommendationException
 *             if training the model fails
 */
@Override
public void train(RecommenderContext aContext, List<CAS> aCasses)
    throws RecommendationException
{
    List<POSSample> samples = extractPosSamples(aCasses);

    int sampleCount = samples.size();
    if (sampleCount < 2) {
        LOG.info("Not enough training data: [{}] items", sampleCount);
        return;
    }

    TrainingParameters trainingParameters = traits.getParameters();

    // The beam size caps how many results can be returned. Even when only a few results are
    // requested, never go below the default beam size defined by OpenNLP.
    int effectiveBeamSize = Math.max(maxRecommendations, POSTaggerME.DEFAULT_BEAM_SIZE);
    trainingParameters.put(BeamSearch.BEAM_SIZE_PARAMETER, String.valueOf(effectiveBeamSize));

    aContext.put(KEY_MODEL, train(samples, trainingParameters));
}
 
Example #3
Source File: OpenNlpPosRecommender.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Trains a POS tagger model from the given samples.
 *
 * @param aPosSamples
 *            the training samples
 * @param aParameters
 *            the training parameters handed to the OpenNLP trainer
 * @return the trained model, or {@code null} if there are no samples
 * @throws RecommendationException
 *             if an {@link IOException} occurs during training
 */
@Nullable
private POSModel train(List<POSSample> aPosSamples, TrainingParameters aParameters)
    throws RecommendationException
{
    if (!aPosSamples.isEmpty()) {
        // The stream is closed automatically once training has finished or failed.
        try (POSSampleStream sampleStream = new POSSampleStream(aPosSamples)) {
            return POSTaggerME.train("unknown", sampleStream, aParameters,
                    new POSTaggerFactory());
        }
        catch (IOException ioe) {
            throw new RecommendationException("Error training model", ioe);
        }
    }

    return null;
}
 
Example #4
Source File: AbstractTaggerTrainer.java    From ixa-pipe-pos with Apache License 2.0 6 votes vote down vote up
/**
 * Populates the tag dictionary of the POS tagger factory from training data.
 * Does nothing when the cutoff equals {@code Flags.DEFAULT_DICT_CUTOFF}.
 *
 * <p>If the factory has no tag dictionary yet, an empty one is created and
 * registered first.
 *
 * @param aDictSamples
 *          the samples, created from training data, used to fill the dictionary
 * @param aDictCutOff
 *          the cutoff used when populating the dictionary
 * @throws IllegalArgumentException
 *           if the factory's dictionary is not a {@code MutableTagDictionary}
 * @throws TerminateToolException
 *           if an IO error occurs while populating the dictionary
 */
protected final void createAutomaticDictionary(
    final ObjectStream<POSSample> aDictSamples, final int aDictCutOff) {
  if (aDictCutOff == Flags.DEFAULT_DICT_CUTOFF) {
    return;
  }
  try {
    TagDictionary tagDictionary = getPosTaggerFactory().getTagDictionary();
    if (tagDictionary == null) {
      tagDictionary = getPosTaggerFactory().createEmptyTagDictionary();
      getPosTaggerFactory().setTagDictionary(tagDictionary);
    }
    if (!(tagDictionary instanceof MutableTagDictionary)) {
      throw new IllegalArgumentException("Can't extend a POSDictionary"
          + " that does not implement MutableTagDictionary.");
    }
    POSTaggerME.populatePOSDictionary(aDictSamples,
        (MutableTagDictionary) tagDictionary, aDictCutOff);
    // NOTE(review): this resets the dictSamples field, not the aDictSamples
    // parameter; presumably callers pass the same stream - confirm.
    this.dictSamples.reset();
  } catch (final IOException ioe) {
    throw new TerminateToolException(-1,
        "IO error while creating/extending POS Dictionary: "
            + ioe.getMessage(), ioe);
  }
}
 
Example #5
Source File: AbstractTaggerTrainer.java    From ixa-pipe-pos with Apache License 2.0 6 votes vote down vote up
/**
 * Builds an ngram dictionary from training data, reporting progress on
 * standard error.
 *
 * @param aDictSamples
 *          the training data
 * @param aNgramCutoff
 *          the cutoff passed to the dictionary builder
 * @return the ngram dictionary, or {@code null} when the cutoff equals
 *         {@code Flags.DEFAULT_DICT_CUTOFF}
 * @throws TerminateToolException
 *           if an IO error occurs while building the dictionary
 */
protected final Dictionary createNgramDictionary(
    final ObjectStream<POSSample> aDictSamples, final int aNgramCutoff) {
  if (aNgramCutoff == Flags.DEFAULT_DICT_CUTOFF) {
    return null;
  }
  System.err.print("Building ngram dictionary ... ");
  Dictionary builtDictionary;
  try {
    builtDictionary = POSTaggerME.buildNGramDictionary(aDictSamples,
        aNgramCutoff);
    // NOTE(review): this resets the dictSamples field, not the aDictSamples
    // parameter; presumably callers pass the same stream - confirm.
    this.dictSamples.reset();
  } catch (final IOException ioe) {
    throw new TerminateToolException(-1,
        "IO error while building NGram Dictionary: " + ioe.getMessage(), ioe);
  }
  System.err.println("done");
  return builtDictionary;
}
 
Example #6
Source File: MorphoSampleStream.java    From ixa-pipe-pos with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the next sentence and returns it as a {@link POSSample}.
 *
 * <p>A sentence is a run of non-empty lines terminated by an empty line or
 * by the end of the underlying stream. Each line must consist of exactly
 * three tab-separated fields; the first field is used as the token and the
 * second as its tag. Lines with a different number of fields are reported
 * on standard error and skipped.
 *
 * <p>Empty lines that occur before any valid token has been collected are
 * skipped, so stray blank lines or a sentence consisting only of corrupt
 * lines no longer cause a premature {@code null} (end-of-stream) result.
 *
 * TODO: An exception in error case should be thrown.
 *
 * @return the next sample, or {@code null} once the underlying stream is
 *         exhausted and no further tokens were read
 * @throws IOException
 *           if reading from the underlying stream fails
 */
public POSSample read() throws IOException {

  List<String> toks = new ArrayList<String>();
  List<String> tags = new ArrayList<String>();

  String line;
  while ((line = samples.read()) != null) {
    if (line.equals("")) {
      if (toks.isEmpty()) {
        // Nothing collected yet: this blank line does not end a sentence.
        continue;
      }
      break;
    }
    String[] parts = line.split("\t");
    if (parts.length != 3) {
      System.err.println("Skipping corrupt line: " + line);
    }
    else {
      toks.add(parts[0]);
      tags.add(parts[1]);
    }
  }

  if (toks.isEmpty()) {
    // Only reachable when the underlying stream is exhausted.
    return null;
  }
  return new POSSample(toks.toArray(new String[toks.size()]),
      tags.toArray(new String[tags.size()]));
}
 
Example #7
Source File: OpenNlpPosRecommender.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Collects POS samples, one per sentence, from the given CASes.
 *
 * <p>Collection stops across all CASes as soon as the number of samples
 * reaches the training set size limit defined by the traits. Sentences for
 * which {@code createPosSample} yields no sample are skipped.
 *
 * @param aCasses
 *            the CASes to extract samples from
 * @return the extracted samples; never {@code null}
 */
private List<POSSample> extractPosSamples(List<CAS> aCasses)
{
    List<POSSample> posSamples = new ArrayList<>();

    casses: for (CAS cas : aCasses) {
        Type sentenceType = getType(cas, Sentence.class);
        Type tokenType = getType(cas, Token.class);

        Map<AnnotationFS, List<AnnotationFS>> sentences = indexCovered(cas, sentenceType,
                tokenType);
        for (Map.Entry<AnnotationFS, List<AnnotationFS>> e : sentences.entrySet()) {
            if (posSamples.size() >= traits.getTrainingSetSizeLimit()) {
                // Limit reached: stop processing all remaining CASes as well.
                break casses;
            }

            AnnotationFS sentence = e.getKey();
            Collection<AnnotationFS> tokens = e.getValue();

            // ifPresent expresses the side effect directly instead of abusing Optional.map.
            createPosSample(cas, sentence, tokens).ifPresent(posSamples::add);
        }
    }

    LOG.debug("Extracted {} POS samples", posSamples.size());

    return posSamples;
}
 
Example #8
Source File: OpenNlpPosRecommender.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a POS sample for one sentence from its tokens.
 *
 * <p>For every token the covered text and the tag obtained through
 * {@code getFeatureValueCovering} are recorded. A sample is only produced
 * when the percentage of tokens whose tag is neither {@code null} nor
 * {@code PAD} is strictly greater than the tagged-tokens threshold defined
 * by the traits.
 *
 * @param aCas
 *            the CAS containing the sentence
 * @param aSentence
 *            the sentence annotation (not read by this method)
 * @param aTokens
 *            the tokens of the sentence
 * @return the sample, or an empty {@link Optional} if the sentence has no
 *         tokens or too few of them carry a tag
 */
private Optional<POSSample> createPosSample(CAS aCas, AnnotationFS aSentence,
        Collection<AnnotationFS> aTokens)
{
    Type annotationType = getType(aCas, layerName);
    Feature feature = annotationType.getFeatureByBaseName(featureName);

    int numberOfTokens = aTokens.size();
    if (numberOfTokens == 0) {
        // Without tokens the coverage would be 0/0 (NaN), which never exceeds the threshold.
        return Optional.empty();
    }

    String[] tokens = new String[numberOfTokens];
    String[] tags = new String[numberOfTokens];

    int withTagCount = 0;

    int i = 0;
    for (AnnotationFS token : aTokens) {
        tokens[i] = token.getCoveredText();
        String tag = getFeatureValueCovering(aCas, token, annotationType, feature);
        tags[i] = tag;

        // If the tag is neither PAD nor null, then there is at
        // least one annotation the trainer can work with.
        if (tag != null && !PAD.equals(tag)) {
            withTagCount++;
        }

        i++;
    }

    // Require at least X percent of the sentence to have tags to avoid class imbalance on PAD
    // tag.
    double coverage = ((double) withTagCount * 100) / (double) numberOfTokens;
    if (coverage > traits.getTaggedTokensThreshold()) {
        return Optional.of(new POSSample(tokens, tags));
    }
    else {
        return Optional.empty();
    }
}
 
Example #9
Source File: POSSampleStream.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the next sample from the underlying iterator.
 *
 * @return the next sample, or {@code null} when the iterator has no more
 *         elements
 */
@Override
public POSSample read()
    throws IOException
{
    return iterator.hasNext() ? iterator.next() : null;
}
 
Example #10
Source File: POSEvaluate.java    From ixa-pipe-pos with Apache License 2.0 5 votes vote down vote up
/**
 * Runs a detailed evaluation of the tagger on the test samples and writes
 * the fine-grained report to standard output.
 *
 * <p>An {@link IOException} raised during evaluation is printed as a stack
 * trace; the report is written afterwards in either case.
 */
public final void detailEvaluate() {
  final POSTaggerFineGrainedReportListener reportListener =
      new POSTaggerFineGrainedReportListener(System.out);

  final List<EvaluationMonitor<POSSample>> monitors =
      new LinkedList<EvaluationMonitor<POSSample>>();
  monitors.add(reportListener);

  final POSTaggerEvaluationMonitor[] monitorArray = monitors
      .toArray(new POSTaggerEvaluationMonitor[monitors.size()]);
  final POSEvaluator evaluator = new POSEvaluator(this.posTagger,
      monitorArray);

  try {
    evaluator.evaluate(this.testSamples);
  } catch (IOException ioe) {
    ioe.printStackTrace();
  }
  reportListener.writeReport();
}
 
Example #11
Source File: POSEvaluate.java    From ixa-pipe-pos with Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates the tagger on the test samples with an error listener attached
 * and prints the resulting word accuracy to standard output.
 *
 * <p>An {@link IOException} raised during evaluation is printed as a stack
 * trace; the word accuracy is printed afterwards in either case.
 */
public final void evalError() {
  final List<EvaluationMonitor<POSSample>> monitors =
      new LinkedList<EvaluationMonitor<POSSample>>();
  monitors.add(new POSEvaluationErrorListener());

  final POSTaggerEvaluationMonitor[] monitorArray = monitors
      .toArray(new POSTaggerEvaluationMonitor[monitors.size()]);
  final POSEvaluator evaluator = new POSEvaluator(this.posTagger,
      monitorArray);

  try {
    evaluator.evaluate(this.testSamples);
  } catch (IOException ioe) {
    ioe.printStackTrace();
  }
  System.out.println(evaluator.getWordAccuracy());
}
 
Example #12
Source File: OpenNlpPosRecommender.java    From inception with Apache License 2.0 4 votes vote down vote up
/**
 * Evaluates the recommender by training on one part of the extracted samples
 * and predicting on another.
 *
 * <p>The data splitter assigns each sample to the training set, the test
 * set, or neither. If either set ends up with fewer than two samples, the
 * evaluation is skipped and the returned result carries an explanatory
 * message.
 *
 * @param aCasses
 *            the CASes to extract samples from
 * @param aDataSplitter
 *            decides the target set of every sample
 * @return the evaluation result built from gold/predicted label pairs, or a
 *         result marked as skipped
 * @throws RecommendationException
 *             if training fails or yields no model
 */
@Override
public EvaluationResult evaluate(List<CAS> aCasses, DataSplitter aDataSplitter)
    throws RecommendationException
{
    List<POSSample> allSamples = extractPosSamples(aCasses);
    List<POSSample> trainingSet = new ArrayList<>();
    List<POSSample> testSet = new ArrayList<>();

    // Distribute the samples; anything not assigned to TRAIN or TEST is dropped.
    for (POSSample candidate : allSamples) {
        switch (aDataSplitter.getTargetSet(candidate)) {
        case TRAIN:
            trainingSet.add(candidate);
            break;
        case TEST:
            testSet.add(candidate);
            break;
        default:
            break;
        }
    }

    int totalSize = allSamples.size();
    int testSetSize = testSet.size();
    int trainingSetSize = trainingSet.size();

    // Share of the non-test samples that actually ended up in the training set.
    double overallTrainingSize = totalSize - testSetSize;
    double trainRatio;
    if (overallTrainingSize > 0) {
        trainRatio = trainingSetSize / overallTrainingSize;
    }
    else {
        trainRatio = 0.0;
    }

    boolean enoughData = trainingSetSize >= 2 && testSetSize >= 2;
    if (!enoughData) {
        String info = String.format(
                "Not enough evaluation data: training set [%s] items, test set [%s] of total [%s]",
                trainingSetSize, testSetSize, totalSize);
        LOG.info(info);

        EvaluationResult skipped = new EvaluationResult(trainingSetSize, testSetSize,
                trainRatio);
        skipped.setEvaluationSkipped(true);
        skipped.setErrorMsg(info);
        return skipped;
    }

    LOG.info("Training on [{}] items, predicting on [{}] of total [{}]", trainingSetSize,
        testSetSize, totalSize);

    POSModel model = train(trainingSet, traits.getParameters());
    if (model == null) {
        throw new RecommendationException("Model is null, cannot evaluate!");
    }

    POSTaggerME tagger = new POSTaggerME(model);

    // Pair every gold tag of the test set with the tag predicted at the same position.
    List<LabelPair> labelPairs = new ArrayList<>();
    for (POSSample sample : testSet) {
        String[] predictedTags = tagger.tag(sample.getSentence());
        String[] goldTags = sample.getTags();
        for (int pos = 0; pos < predictedTags.length; pos++) {
            labelPairs.add(new LabelPair(goldTags[pos], predictedTags[pos]));
        }
    }

    return labelPairs.stream()
            .collect(EvaluationResult.collector(trainingSetSize, testSetSize, trainRatio, PAD));
}
 
Example #13
Source File: POSSampleStream.java    From inception with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a stream over the given samples.
 *
 * @param data
 *            the samples to iterate over; the list is stored as-is and an
 *            iterator over it is obtained immediately
 */
public POSSampleStream(List<POSSample> data)
{
    this.iterator = data.iterator();
    this.data = data;
}