package nlp.intent.toolkit; import opennlp.tools.doccat.DocumentSample; import opennlp.tools.tokenize.WhitespaceTokenizer; import opennlp.tools.util.ObjectStream; import java.io.IOException; import java.util.Vector; public class IntentDocumentSampleStream implements ObjectStream<DocumentSample> { String category; ObjectStream<String> stream; public IntentDocumentSampleStream(String category, ObjectStream<String> stream) { this.category = category; this.stream = stream; } public DocumentSample read() throws IOException { String sampleString = stream.read(); if (sampleString != null) { // Whitespace tokenize entire string String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sampleString); //remove entities Vector<String> vector = new Vector<String>(tokens.length); boolean skip = false; for (String token : tokens) { if (!token.startsWith("<")) { vector.add(token); } } tokens = new String[vector.size()]; vector.copyInto(tokens); DocumentSample sample; if (tokens.length > 0) { sample = new DocumentSample(category, tokens); } else { throw new IOException("Empty lines are not allowed!"); } return sample; } else { return null; } } public void reset() throws IOException, UnsupportedOperationException { stream.reset(); } public void close() throws IOException { stream.close(); } }