opennlp.tools.doccat.DocumentSample Java Examples

The following examples show how to use opennlp.tools.doccat.DocumentSample. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also check out the related API usage in the sidebar.
Example #1
Source File: OpenNlpDoccatRecommender.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Trains an OpenNLP document categorizer model from the given CASes and publishes it to the
 * recommender context under {@code KEY_MODEL}. If there is not enough training data, the
 * method logs and returns without touching the context.
 *
 * @param aContext the recommender context that receives the trained model
 * @param aCasses the CASes from which training samples are extracted
 * @throws RecommendationException if the underlying OpenNLP training fails
 */
@Override
public void train(RecommenderContext aContext, List<CAS> aCasses)
    throws RecommendationException
{
    List<DocumentSample> docSamples = extractSamples(aCasses);
    
    // A categorizer cannot be meaningfully trained on fewer than two samples.
    if (docSamples.size() < 2) {
        LOG.info("Not enough training data: [{}] items", docSamples.size());
        return;
    }
    
    // The beam size controls how many results are returned at most. But even if the user
    // requests only few results, we always use at least the default beam size recommended by
    // OpenNLP.
    // NOTE(review): DEFAULT_BEAM_SIZE is borrowed from NameFinderME although this trains a
    // document categorizer — presumably intentional as a sensible floor; confirm.
    int beamSize = Math.max(maxRecommendations, NameFinderME.DEFAULT_BEAM_SIZE);

    TrainingParameters params = traits.getParameters();
    params.put(BeamSearch.BEAM_SIZE_PARAMETER, Integer.toString(beamSize));
    
    DoccatModel model = train(docSamples, params);
    
    aContext.put(KEY_MODEL, model);
}
 
Example #2
Source File: DocumentSampleStream.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the next sample from the backing iterator, or {@code null} once the stream is
 * exhausted (or when no iterator is available).
 */
@Override
public DocumentSample read()
{
    boolean hasMore = iterator != null && iterator.hasNext();
    return hasMore ? iterator.next() : null;
}
 
Example #3
Source File: OpenNlpDoccatRecommender.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Converts the annotations in the given CASes into OpenNLP {@link DocumentSample}s. Each
 * sample pairs the token texts of a sentence with the label of an annotation covered by that
 * sentence, falling back to {@code NO_CATEGORY} when the feature value is unset. Extraction
 * stops as soon as the configured training set size limit is reached.
 *
 * @param aCasses the CASes to extract training samples from
 * @return the extracted samples; possibly empty, never {@code null}
 */
private List<DocumentSample> extractSamples(List<CAS> aCasses)
{
    List<DocumentSample> samples = new ArrayList<>();
    casses: for (CAS cas : aCasses) {
        Type sentenceType = getType(cas, Sentence.class);
        Type tokenType = getType(cas, Token.class);

        // The annotation type and feature depend only on the CAS, so resolve them once per
        // CAS instead of redundantly once per sentence.
        Type annotationType = getType(cas, layerName);
        Feature feature = annotationType.getFeatureByBaseName(featureName);

        Map<AnnotationFS, List<AnnotationFS>> sentences = indexCovered(
                cas, sentenceType, tokenType);
        for (Entry<AnnotationFS, List<AnnotationFS>> e : sentences.entrySet()) {
            AnnotationFS sentence = e.getKey();
            Collection<AnnotationFS> tokens = e.getValue();
            String[] tokenTexts = tokens.stream()
                .map(AnnotationFS::getCoveredText)
                .toArray(String[]::new);
            
            for (AnnotationFS annotation : selectCovered(annotationType, sentence)) {
                // Honor the configured cap on the number of training samples.
                if (samples.size() >= traits.getTrainingSetSizeLimit()) {
                    break casses;
                }
                
                String label = annotation.getFeatureValueAsString(feature);
                // The argument can never be null thanks to the NO_CATEGORY fallback; the
                // guard below defends against DocumentSample normalizing the category away.
                DocumentSample nameSample = new DocumentSample(
                        label != null ? label : NO_CATEGORY, tokenTexts);
                if (nameSample.getCategory() != null) {
                    samples.add(nameSample);
                }
            }
        }
    }
    
    return samples;
}
 
Example #4
Source File: OpenNlpDoccatRecommender.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Trains an OpenNLP document categorizer model on the given samples.
 *
 * @param aSamples the training samples
 * @param aParameters the OpenNLP training parameters
 * @return the trained model
 * @throws RecommendationException if the OpenNLP trainer reports an I/O failure
 */
private DoccatModel train(List<DocumentSample> aSamples, TrainingParameters aParameters)
    throws RecommendationException
{
    try (DocumentSampleStream sampleStream = new DocumentSampleStream(aSamples)) {
        return DocumentCategorizerME.train("unknown", sampleStream, aParameters,
                new DoccatFactory());
    }
    catch (IOException e) {
        throw new RecommendationException(
                "Exception during training the OpenNLP Document Categorizer model.", e);
    }
}
 
Example #5
Source File: DocumentSampleStream.java    From inception with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a stream that serves the given samples in list order.
 *
 * @param aSamples the samples to stream over
 */
public DocumentSampleStream(List<DocumentSample> aSamples)
{
    this.samples = aSamples;
    this.iterator = this.samples.iterator();
}