opennlp.tools.util.ObjectStream Java Examples

The following examples show how to use opennlp.tools.util.ObjectStream. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AbstractTaggerTrainer.java, from ixa-pipe-pos (Apache License 2.0), 6 votes
/**
 * Builds the trainer state from the supplied training parameters.
 *
 * <p>The language, the "TrainSet" and "TestSet" data sets and the two
 * dictionary cutoffs are read from {@code params} through {@code Flags}. The
 * training data set is opened twice: once for the training samples and once
 * more for the samples handed to {@code setDictSamples}.
 *
 * @param params
 *          the training parameters
 * @throws IOException
 *           if one of the calls made during construction reports an I/O
 *           failure
 */
public AbstractTaggerTrainer(final TrainingParameters params) throws IOException {
  this.lang = Flags.getLanguage(params);
  final String trainSet = Flags.getDataSet("TrainSet", params);
  final String testSet = Flags.getDataSet("TestSet", params);
  this.trainSamples = new MorphoSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(trainSet));
  this.testSamples = new MorphoSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(testSet));
  // A second, independent stream over the training data feeds the
  // dictionary samples.
  setDictSamples(new MorphoSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(trainSet)));
  this.dictCutOff = Flags.getAutoDictFeatures(params);
  this.ngramCutOff = Flags.getNgramDictFeatures(params);
}
 
Example #2
Source File: AbstractTaggerTrainer.java, from ixa-pipe-pos (Apache License 2.0), 6 votes
/**
 * Populates the tagger factory's tag dictionary from the given samples.
 *
 * <p>Nothing happens when {@code aDictCutOff} equals
 * {@code Flags.DEFAULT_DICT_CUTOFF}. Otherwise the factory's current tag
 * dictionary is used (an empty one is created and registered with the factory
 * when none is set) and filled through
 * {@code POSTaggerME.populatePOSDictionary}. Afterwards the
 * {@code dictSamples} field is reset.
 *
 * @param aDictSamples
 *          the samples used to populate the dictionary
 * @param aDictCutOff
 *          the cutoff passed on to the dictionary population
 * @throws IllegalArgumentException
 *           if the tag dictionary is not a {@code MutableTagDictionary}
 * @throws TerminateToolException
 *           if an {@code IOException} occurs while populating the dictionary
 *           or resetting the samples
 */
protected final void createAutomaticDictionary(
    final ObjectStream<POSSample> aDictSamples, final int aDictCutOff) {
  if (aDictCutOff == Flags.DEFAULT_DICT_CUTOFF) {
    return;
  }
  try {
    TagDictionary tagDict = getPosTaggerFactory().getTagDictionary();
    if (tagDict == null) {
      tagDict = getPosTaggerFactory().createEmptyTagDictionary();
      getPosTaggerFactory().setTagDictionary(tagDict);
    }
    if (!(tagDict instanceof MutableTagDictionary)) {
      throw new IllegalArgumentException("Can't extend a POSDictionary"
          + " that does not implement MutableTagDictionary.");
    }
    POSTaggerME.populatePOSDictionary(aDictSamples,
        (MutableTagDictionary) tagDict, aDictCutOff);
    // NOTE(review): this resets the dictSamples field, not the aDictSamples
    // argument that was just consumed; presumably callers always pass the
    // field itself. Confirm against the call sites.
    this.dictSamples.reset();
  } catch (final IOException ioe) {
    throw new TerminateToolException(-1,
        "IO error while creating/extending POS Dictionary: "
            + ioe.getMessage(), ioe);
  }
}
 
Example #3
Source File: AbstractTaggerTrainer.java, from ixa-pipe-pos (Apache License 2.0), 6 votes
/**
 * Builds an ngram dictionary from the given samples.
 *
 * <p>When {@code aNgramCutoff} equals {@code Flags.DEFAULT_DICT_CUTOFF} no
 * dictionary is built and {@code null} is returned. Otherwise progress is
 * reported on {@code System.err}, the dictionary is built through
 * {@code POSTaggerME.buildNGramDictionary} and the {@code dictSamples} field
 * is reset.
 *
 * @param aDictSamples
 *          the samples the dictionary is built from
 * @param aNgramCutoff
 *          the cutoff passed on to the dictionary builder
 * @return the ngram dictionary, or {@code null} when the cutoff is the
 *         default one
 * @throws TerminateToolException
 *           if an {@code IOException} occurs while building the dictionary
 *           or resetting the samples
 */
protected final Dictionary createNgramDictionary(
    final ObjectStream<POSSample> aDictSamples, final int aNgramCutoff) {
  if (aNgramCutoff == Flags.DEFAULT_DICT_CUTOFF) {
    return null;
  }
  System.err.print("Building ngram dictionary ... ");
  final Dictionary builtDict;
  try {
    builtDict = POSTaggerME.buildNGramDictionary(aDictSamples, aNgramCutoff);
    // NOTE(review): this resets the dictSamples field, not the aDictSamples
    // argument that was just consumed; presumably callers always pass the
    // field itself. Confirm against the call sites.
    this.dictSamples.reset();
  } catch (final IOException ioe) {
    throw new TerminateToolException(-1,
        "IO error while building NGram Dictionary: " + ioe.getMessage(), ioe);
  }
  System.err.println("done");
  return builtDict;
}
 
Example #4
Source File: LanguageDetectorAndTrainingDataUnitTest.java, from tutorials (MIT License), 6 votes
/**
 * Trains a language detector on the sample file and checks the languages
 * predicted for a fixed sentence.
 *
 * <p>The training streams are opened in a try-with-resources statement so the
 * underlying file is released once training has finished, and the line stream
 * is declared as {@code ObjectStream<String>} rather than as a raw type.
 */
@Test
public void givenLanguageDictionary_whenLanguageDetect_thenLanguageIsDetected() throws FileNotFoundException, IOException {
    InputStreamFactory dataIn = new MarkableFileInputStreamFactory(new File("src/main/resources/models/DoccatSample.txt"));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
    params.put(TrainingParameters.CUTOFF_PARAM, 5);
    params.put("DataIndexer", "TwoPass");
    params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");

    LanguageDetectorModel model;
    // Both streams are only needed while training; close them afterwards.
    try (ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
            LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream)) {
        model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory());
    }

    LanguageDetector ld = new LanguageDetectorME(model);
    Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");

    // NOTE(review): the expected confidences are exact doubles, so this
    // assertion depends on the training data and parameters staying unchanged.
    assertThat(Arrays.asList(languages)).extracting("lang", "confidence").contains(tuple("pob", 0.9999999950605625),
             tuple("ita", 4.939427661577956E-9), tuple("spa", 9.665954064665144E-15),
            tuple("fra", 8.250349924885834E-25));
}
 
Example #5
Source File: Chapter4.java, from Natural-Language-Processing-with-Java-Second-Edition (MIT License), 5 votes
/**
 * Trains a name finder model from "en-ner-person.train" and serializes it to
 * the file "modelFile".
 *
 * <p>Both the model output stream and the training input stream are managed
 * by the try-with-resources statement, so the training file handle is
 * released even when training or serialization fails. An {@code IOException}
 * is reported by printing its stack trace.
 */
private static void trainingOpenNLPNERModel() {
    try (OutputStream modelOutputStream = new BufferedOutputStream(
            new FileOutputStream(new File("modelFile")));
            FileInputStream trainingInput = new FileInputStream("en-ner-person.train")) {
        ObjectStream<String> lineStream = new PlainTextByLineStream(
                trainingInput, "UTF-8");
        ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);

        // NOTE(review): 100 and 5 are presumably the iteration count and the
        // cutoff; confirm against the NameFinderME.train signature in use.
        TokenNameFinderModel model = NameFinderME.train("en", "person", sampleStream,
                null, 100, 5);

        model.serialize(modelOutputStream);
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example #6
Source File: POSCrossValidator.java, from ixa-pipe-pos (Apache License 2.0), 5 votes
/**
 * Builds the cross validator state from the supplied training parameters.
 *
 * <p>The language, the "TrainSet" data set, the dictionary cutoff and the
 * number of folds are read from {@code params} through {@code Flags}; the
 * same parameters are then passed to {@code createPOSFactory} and
 * {@code getEvalListeners}.
 *
 * @param params
 *          the training parameters
 * @throws IOException
 *           if one of the calls made during construction reports an I/O
 *           failure
 */
public POSCrossValidator(final TrainingParameters params) throws IOException {
  this.lang = Flags.getLanguage(params);
  final String trainSet = Flags.getDataSet("TrainSet", params);
  this.trainSamples = new WordTagSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(trainSet));
  this.dictCutOff = Flags.getAutoDictFeatures(params);
  this.folds = Flags.getFolds(params);
  createPOSFactory(params);
  getEvalListeners(params);
}
 
Example #7
Source File: IntentDocumentSampleStream.java, from org.openhab.ui.habot (Eclipse Public License 1.0), 4 votes
/**
 * Creates a sample stream that keeps the given category and underlying stream.
 *
 * @param category the category stored with this stream
 * @param stream the stream of strings this sample stream wraps
 */
public IntentDocumentSampleStream(String category, ObjectStream<String> stream) {
    this.stream = stream;
    this.category = category;
}
 
Example #8
Source File: IntentDocumentSampleStream.java, from nlp-intent-toolkit (The Unlicense), 4 votes
/**
 * Creates a sample stream that keeps the given category and underlying stream.
 *
 * @param category the category stored with this stream
 * @param stream the stream of strings this sample stream wraps
 */
public IntentDocumentSampleStream(String category, ObjectStream<String> stream) {
    this.stream = stream;
    this.category = category;
}
 
Example #9
Source File: LemmaSampleSequenceStream.java, from ixa-pipe-pos (Apache License 2.0), 4 votes
/**
 * Creates a sequence stream over the given lemma samples.
 *
 * @param samples the stream of lemma samples kept by this instance
 * @param contextGenerator the context generator kept by this instance
 */
public LemmaSampleSequenceStream(ObjectStream<LemmaSample> samples,
    LemmatizerContextGenerator contextGenerator) {
  this.contextGenerator = contextGenerator;
  this.samples = samples;
}
 
Example #10
Source File: LemmaSampleStream.java, from ixa-pipe-pos (Apache License 2.0), 4 votes
/**
 * Creates a lemma sample stream on top of the given stream of strings.
 *
 * @param samples the underlying stream, handed unchanged to the superclass
 *          constructor
 */
public LemmaSampleStream(ObjectStream<String> samples) {
  super(samples);
}
 
Example #11
Source File: MorphoSampleStream.java, from ixa-pipe-pos (Apache License 2.0), 4 votes
/**
 * Creates a morphological sample stream on top of the given stream of strings.
 *
 * @param samples the underlying stream, handed unchanged to the superclass
 *          constructor
 */
public MorphoSampleStream(ObjectStream<String> samples) {
  super(samples);
}
 
Example #12
Source File: AbstractLemmatizerTrainer.java, from ixa-pipe-pos (Apache License 2.0), 3 votes
/**
 * Builds the lemmatizer trainer state from the supplied training parameters.
 *
 * <p>The language and the "TrainSet" and "TestSet" data sets are read from
 * {@code params} through {@code Flags}; each data set is wrapped in a
 * {@code LemmaSampleStream}.
 *
 * @param params
 *          the training parameters
 * @throws IOException
 *           if one of the calls made during construction reports an I/O
 *           failure
 */
public AbstractLemmatizerTrainer(final TrainingParameters params) throws IOException {
  this.lang = Flags.getLanguage(params);
  final String trainSet = Flags.getDataSet("TrainSet", params);
  final String testSet = Flags.getDataSet("TestSet", params);
  this.trainSamples = new LemmaSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(trainSet));
  this.testSamples = new LemmaSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(testSet));
}
 
Example #13
Source File: LemmaSampleEventStream.java, from ixa-pipe-pos (Apache License 2.0), 2 votes
/**
 * Creates a new event stream based on the specified data stream using the
 * specified context generator.
 *
 * @param d the data stream for this event stream; handed to the superclass
 *          constructor
 * @param cg the context generator stored for use by this event stream
 */
public LemmaSampleEventStream(ObjectStream<LemmaSample> d, LemmatizerContextGenerator cg) {
  super(d);
  this.contextGenerator = cg;
}