opennlp.tools.doccat.DocumentSample Java Examples

The following examples show how to use opennlp.tools.doccat.DocumentSample. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also check out the related API usage in the sidebar.
Example #1
Source File: OpenNlpDoccatRecommender.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Trains an OpenNLP document categorizer model from the given CASes and publishes it to the
 * recommender context under {@code KEY_MODEL}. If there is not enough training data, the
 * method logs and returns without touching the context.
 *
 * @param aContext the recommender context that receives the trained model
 * @param aCasses the CASes from which training samples are extracted
 * @throws RecommendationException if the underlying OpenNLP training fails
 */
@Override
public void train(RecommenderContext aContext, List<CAS> aCasses)
    throws RecommendationException
{
    List<DocumentSample> docSamples = extractSamples(aCasses);
    
    // A categorizer cannot be meaningfully trained on fewer than two samples.
    if (docSamples.size() < 2) {
        LOG.info("Not enough training data: [{}] items", docSamples.size());
        return;
    }
    
    // The beam size controls how many results are returned at most. But even if the user
    // requests only few results, we always use at least the default beam size recommended by
    // OpenNLP.
    // NOTE(review): DEFAULT_BEAM_SIZE is borrowed from NameFinderME although this trains a
    // document categorizer — presumably intentional as a sensible floor; confirm.
    int beamSize = Math.max(maxRecommendations, NameFinderME.DEFAULT_BEAM_SIZE);

    TrainingParameters params = traits.getParameters();
    params.put(BeamSearch.BEAM_SIZE_PARAMETER, Integer.toString(beamSize));
    
    DoccatModel model = train(docSamples, params);
    
    aContext.put(KEY_MODEL, model);
}
 
Example #2
Source File: DocumentSampleStream.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the next sample from the backing iterator, or {@code null} once the stream is
 * exhausted (or when no iterator is available).
 */
@Override
public DocumentSample read()
{
    boolean hasMore = iterator != null && iterator.hasNext();
    return hasMore ? iterator.next() : null;
}
 
Example #3
Source File: OpenNlpDoccatRecommender.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Converts the annotations in the given CASes into OpenNLP {@link DocumentSample}s. Each
 * sample pairs the token texts of a sentence with the label of an annotation covered by that
 * sentence, falling back to {@code NO_CATEGORY} when the feature value is unset. Extraction
 * stops as soon as the configured training set size limit is reached.
 *
 * @param aCasses the CASes to extract training samples from
 * @return the extracted samples; possibly empty, never {@code null}
 */
private List<DocumentSample> extractSamples(List<CAS> aCasses)
{
    List<DocumentSample> samples = new ArrayList<>();
    casses: for (CAS cas : aCasses) {
        Type sentenceType = getType(cas, Sentence.class);
        Type tokenType = getType(cas, Token.class);

        // The annotation type and feature depend only on the CAS, so resolve them once per
        // CAS instead of redundantly once per sentence.
        Type annotationType = getType(cas, layerName);
        Feature feature = annotationType.getFeatureByBaseName(featureName);

        Map<AnnotationFS, List<AnnotationFS>> sentences = indexCovered(
                cas, sentenceType, tokenType);
        for (Entry<AnnotationFS, List<AnnotationFS>> e : sentences.entrySet()) {
            AnnotationFS sentence = e.getKey();
            Collection<AnnotationFS> tokens = e.getValue();
            String[] tokenTexts = tokens.stream()
                .map(AnnotationFS::getCoveredText)
                .toArray(String[]::new);
            
            for (AnnotationFS annotation : selectCovered(annotationType, sentence)) {
                // Honor the configured cap on the number of training samples.
                if (samples.size() >= traits.getTrainingSetSizeLimit()) {
                    break casses;
                }
                
                String label = annotation.getFeatureValueAsString(feature);
                // The argument can never be null thanks to the NO_CATEGORY fallback; the
                // guard below defends against DocumentSample normalizing the category away.
                DocumentSample nameSample = new DocumentSample(
                        label != null ? label : NO_CATEGORY, tokenTexts);
                if (nameSample.getCategory() != null) {
                    samples.add(nameSample);
                }
            }
        }
    }
    
    return samples;
}
 
Example #4
Source File: OpenNlpDoccatRecommender.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Trains an OpenNLP document categorizer model on the given samples.
 *
 * @param aSamples the training samples
 * @param aParameters the OpenNLP training parameters
 * @return the trained model
 * @throws RecommendationException if the OpenNLP trainer reports an I/O failure
 */
private DoccatModel train(List<DocumentSample> aSamples, TrainingParameters aParameters)
    throws RecommendationException
{
    try (DocumentSampleStream sampleStream = new DocumentSampleStream(aSamples)) {
        return DocumentCategorizerME.train("unknown", sampleStream, aParameters,
                new DoccatFactory());
    }
    catch (IOException e) {
        throw new RecommendationException(
                "Exception during training the OpenNLP Document Categorizer model.", e);
    }
}
 
Example #5
Source File: DocumentSampleStream.java    From inception with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a stream that serves the given samples in list order.
 *
 * @param aSamples the samples to stream over
 */
public DocumentSampleStream(List<DocumentSample> aSamples)
{
    this.samples = aSamples;
    this.iterator = this.samples.iterator();
}