opennlp.tools.tokenize.TokenizerModel Java Examples

The following examples show how to use opennlp.tools.tokenize.TokenizerModel. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TokenSegmenter.java    From dexter with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a TokenSegmenter backed by the English OpenNLP tokenizer model
 * loaded from the classpath resource {@code /nlp/en-token.bin}.
 *
 * If the model cannot be read, the error is printed and the tokenizer field
 * is left uninitialized (original best-effort behavior preserved).
 */
public TokenSegmenter() {
	// try-with-resources closes the stream exactly once even on failure;
	// the original closed it twice (once in the try, again in finally)
	// and swallowed the second close's IOException silently.
	try (InputStream modelIn = getClass().getResourceAsStream("/nlp/en-token.bin")) {
		final TokenizerModel tokenModel = new TokenizerModel(modelIn);
		tokenizer = new TokenizerME(tokenModel);
	} catch (final IOException ioe) {
		// NOTE(review): swallowing leaves 'tokenizer' null; later use will NPE — confirm callers.
		ioe.printStackTrace();
	}
}
 
Example #2
Source File: ConcurrentTokenizer.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Initializes the current instance with the given context.
 *
 * Note: Do all initialization in this method, do not use the constructor.
 *
 * @param context UIMA context supplying the shared tokenizer-model resource
 * @throws ResourceInitializationException if the model resource cannot be accessed
 */
public void initialize(UimaContext context) throws ResourceInitializationException {

    super.initialize(context);

    final TokenizerModel tokenizerModel;
    try {
        final Object resource = context.getResourceObject(UimaUtil.MODEL_PARAMETER);
        tokenizerModel = ((TokenizerModelResource) resource).getModel();
    } catch (ResourceAccessException e) {
        // Surface resource-lookup failures as UIMA initialization errors.
        throw new ResourceInitializationException(e);
    }

    tokenizer = new TokenizerME(tokenizerModel);
}
 
Example #3
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 6 votes vote down vote up
/**
 * Demonstrates training and then using a custom tokenizer model: builds
 * "mymodel.bin" via createOpenNLPModel(), reloads it, and prints one token
 * per line for a sample paragraph.
 */
private static void trainingATokenizer() {
    createOpenNLPModel();
    // try-with-resources fixes the original's leaked FileInputStream.
    try (InputStream modelInputStream = new FileInputStream(new File(
            ".", "mymodel.bin"))) {
        paragraph = "A demonstration of how to train a tokenizer.";
        TokenizerModel model = new TokenizerModel(modelInputStream);
        Tokenizer tokenizer = new TokenizerME(model);
        for (String token : tokenizer.tokenize(paragraph)) {
            System.out.println(token);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example #4
Source File: ConcurrentTokenizer.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Initializes the current instance with the given context.
 *
 * Note: Do all initialization in this method, do not use the constructor.
 *
 * @param context UIMA context holding the shared model resource
 * @throws ResourceInitializationException when the resource lookup fails
 */
public void initialize(UimaContext context) throws ResourceInitializationException {

    super.initialize(context);

    TokenizerModelResource modelResource;
    try {
        modelResource =
                        (TokenizerModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
    } catch (ResourceAccessException e) {
        // Translate the checked resource error into the UIMA init exception.
        throw new ResourceInitializationException(e);
    }

    tokenizer = new TokenizerME(modelResource.getModel());
}
 
Example #5
Source File: ConcurrentTokenizer.java    From Canova with Apache License 2.0 6 votes vote down vote up
/**
 * Initializes the current instance with the given context.
 *
 * Note: Do all initialization in this method, do not use the constructor.
 *
 * @param context UIMA context that provides the tokenizer-model resource
 * @throws ResourceInitializationException if the model cannot be obtained
 */
public void initialize(UimaContext context)
    throws ResourceInitializationException {

  super.initialize(context);

  final TokenizerModel sharedModel;
  try {
    sharedModel = ((TokenizerModelResource) context
        .getResourceObject(UimaUtil.MODEL_PARAMETER)).getModel();
  } catch (ResourceAccessException e) {
    // Re-throw as the exception type UIMA expects from initialize().
    throw new ResourceInitializationException(e);
  }

  tokenizer = new TokenizerME(sharedModel);
}
 
Example #6
Source File: ConcurrentTokenizer.java    From DataVec with Apache License 2.0 6 votes vote down vote up
/**
 * Initializes the current instance with the given context.
 *
 * Note: Do all initialization in this method, do not use the constructor.
 *
 * @param context UIMA context carrying the shared tokenizer model
 * @throws ResourceInitializationException on resource-access failure
 */
public void initialize(UimaContext context) throws ResourceInitializationException {

    super.initialize(context);

    final TokenizerModel loadedModel;
    try {
        final TokenizerModelResource resource =
                        (TokenizerModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
        loadedModel = resource.getModel();
    } catch (ResourceAccessException e) {
        // Wrap the checked resource error per the UIMA initialize() contract.
        throw new ResourceInitializationException(e);
    }

    tokenizer = new TokenizerME(loadedModel);
}
 
Example #7
Source File: TokenizeME.java    From datafu with Apache License 2.0 6 votes vote down vote up
/**
 * Tokenizes the single string field of the input tuple.
 *
 * The tokenizer is built lazily from the cached model file on first use.
 *
 * @param input tuple whose only field is the text to tokenize
 * @return a bag with one single-field tuple per token, or null for null/empty text
 * @throws IOException if the tuple arity is wrong or the model cannot be read
 */
public DataBag exec(Tuple input) throws IOException
{
    if(input.size() != 1) {
        throw new IOException("Expected exactly one input field, got " + input.size());
    }

    String inputString = input.get(0).toString();
    // Bug fix: the original used '==' to compare against "", which tests
    // reference identity, not content; isEmpty() checks the actual value.
    if(inputString == null || inputString.isEmpty()) {
        return null;
    }
    DataBag outBag = bf.newDefaultBag();
    if(this.tokenizer == null) {
        String loadFile = CachedFile.getFileName(MODEL_FILE, this.modelPath);
        // try-with-resources closes the model stream (leaked in the original).
        try (InputStream buffer = new BufferedInputStream(new FileInputStream(loadFile))) {
            TokenizerModel model = new TokenizerModel(buffer);
            this.tokenizer = new TokenizerME(model);
        }
    }
    for(String token : this.tokenizer.tokenize(inputString)) {
        outBag.add(tf.newTuple(token));
    }
    return outBag;
}
 
Example #8
Source File: OpenNLP.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the four OpenNLP models (tokenizer, sentence detector, POS tagger,
 * chunker) from classpath resources and builds the corresponding engines.
 *
 * @param aContext the UIMA context (unused directly; required by the override)
 * @throws ResourceInitializationException if any model fails to load or any
 *         engine fails to construct
 */
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
  // Phase 1: load all serialized models; any failure aborts initialization.
  try {
    tokensModel.loadModel(TokenizerModel.class, getClass().getResourceAsStream("en_token.bin"));
    sentencesModel.loadModel(SentenceModel.class, getClass().getResourceAsStream("en_sent.bin"));
    posModel.loadModel(POSModel.class, getClass().getResourceAsStream("en_pos_maxent.bin"));
    chunkModel.loadModel(ChunkerModel.class, getClass().getResourceAsStream("en_chunker.bin"));
  } catch (BaleenException be) {
    getMonitor().error("Unable to load OpenNLP Language Models", be);
    throw new ResourceInitializationException(be);
  }

  // Phase 2: wrap the loaded models in their processing engines.
  try {
    sentenceDetector = new SentenceDetectorME((SentenceModel) sentencesModel.getModel());
    wordTokenizer = new TokenizerME((TokenizerModel) tokensModel.getModel());
    posTagger = new POSTaggerME((POSModel) posModel.getModel());
    phraseChunker = new ChunkerME((ChunkerModel) chunkModel.getModel());
  } catch (Exception e) {
    getMonitor().error("Unable to create OpenNLP taggers", e);
    throw new ResourceInitializationException(e);
  }
}
 
Example #9
Source File: SharedOpenNLPModelTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that a SharedOpenNLPModel loads a TokenizerModel, reports the
 * expected language, and ignores attempts to load a second, different model.
 */
@Test
public void testLoad() throws Exception {
  SharedOpenNLPModel sharedModel = new SharedOpenNLPModel();

  sharedModel.loadModel(TokenizerModel.class, OpenNLP.class.getResourceAsStream("en_token.bin"));

  BaseModel loaded = sharedModel.getModel();
  assertNotNull(loaded);
  assertTrue(loaded instanceof TokenizerModel);
  assertEquals("en", loaded.getLanguage());

  // Trying to load a different model shouldn't change the resource
  sharedModel.loadModel(SentenceModel.class, OpenNLP.class.getResourceAsStream("en_sent.bin"));
  assertEquals(loaded, sharedModel.getModel());

  sharedModel.doDestroy();
}
 
Example #10
Source File: OpenNLPTokenizerFactory.java    From jate with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Loads the OpenNLP models this tokenizer factory depends on: an optional
 * sentence detector, a required tokenizer, and an optional paragraph chunker
 * instantiated by reflection from its configured class name.
 *
 * @param loader resource loader used to open the model files
 * @throws IOException if the tokenizer model is missing or any model/class fails to load
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
    if(sentenceModelFile!=null) {
        sentenceOp = new SentenceDetectorME(new SentenceModel(
                loader.openResource(sentenceModelFile)));
    }

    if(tokenizerModelFile==null)
        // Fixed typo in the original message ("tokenizerModle").
        throw new IOException("Parameter 'tokenizerModelFile' is required, but is invalid:"+tokenizerModelFile);
    tokenizerOp = new TokenizerME(new TokenizerModel(
            loader.openResource(tokenizerModelFile)
    ));

    if(parChunkingClass!=null) {
        try {
            // Class<?> avoids the raw type; getDeclaredConstructor() replaces
            // the deprecated Class.newInstance() (same no-arg construction).
            Class<?> chunkerClass = Class.forName(parChunkingClass);
            paragraphChunker = (ParagraphChunker) chunkerClass.getDeclaredConstructor().newInstance();
        }catch (Exception e){
            throw new IOException(e);
        }
    }

}
 
Example #11
Source File: JM_Scorer.java    From uncc2014watsonsim with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Scores how well the parse structure of a passage matches the question,
 * counting child matches only when the passage contains the candidate answer.
 *
 * @param ca      candidate answer text
 * @param q       question text
 * @param passage passage text to score against the question
 * @param verbose unused flag (kept for interface compatibility)
 * @return accumulated child-match score, 0 if the passage lacks the answer
 * @throws InvalidFormatException if the parser model is malformed
 * @throws IOException if the parser model cannot be read
 */
public double scoreStructure(String ca, String q, String passage, boolean verbose) throws InvalidFormatException, IOException{
	// try-with-resources closes the model stream (the original leaked three
	// streams; the unused POS tagger and tokenizer locals are removed).
	Parser parser;
	try (InputStream parserStream = new FileInputStream(new File("en-parser.bin"))) {
		parser = ParserFactory.create(new ParserModel(parserStream));
	}
	double score = 0;
	
	Parse[] questionParse = ParserTool.parseLine(q, parser, 1);
	// Bug fix: the original parsed the question a second time here instead
	// of the passage, so passageParse never reflected the passage.
	Parse[] passageParse = ParserTool.parseLine(passage, parser, 1);
	
	if (passage.contains(ca)) {
		for (int i = 0; i < questionParse.length; i++) {
			score += matchChildren(questionParse[i], passageParse[i]);
		}
	}
	
	return score;
}
 
Example #12
Source File: OpenNLPToolsTokenizerWrapper.java    From mateplus with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Loads an OpenNLP tokenizer model from disk and wraps it.
 *
 * @param modelFile serialized TokenizerModel file
 * @return a wrapper around the loaded TokenizerME
 * @throws IOException if the model file cannot be read
 */
public static OpenNLPToolsTokenizerWrapper loadOpenNLPTokenizer(
		File modelFile) throws IOException {
	// try-with-resources closes the stream (leaked in the original);
	// FileInputStream(File) avoids the needless toString() round-trip.
	try (BufferedInputStream modelIn = new BufferedInputStream(
			new FileInputStream(modelFile))) {
		opennlp.tools.tokenize.Tokenizer tokenizer = new TokenizerME(
				new TokenizerModel(modelIn));
		return new OpenNLPToolsTokenizerWrapper(tokenizer);
	}
}
 
Example #13
Source File: TokenizerUnitTest.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Tokenizes a sample sentence with the English model and checks that the
 * expected tokens (including the final period) are produced.
 */
@Test
public void givenEnglishModel_whenTokenize_thenTokensAreDetected() throws Exception {
    // try-with-resources closes the model stream (leaked in the original).
    try (InputStream inputStream = getClass().getResourceAsStream("/models/en-token.bin")) {
        TokenizerModel model = new TokenizerModel(inputStream);
        TokenizerME tokenizer = new TokenizerME(model);
        String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
        assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
    }
}
 
Example #14
Source File: ValueParser.java    From TableDisentangler with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Creates a ValueParser backed by the English OpenNLP tokenizer model read
 * from "en-token.bin" in the working directory.
 *
 * On failure the error is printed and the tokenizer field stays
 * uninitialized (original best-effort behavior preserved).
 */
public ValueParser() {
	// try-with-resources closes the stream (leaked in the original).
	try (InputStream is = new FileInputStream("en-token.bin")) {
		TokenizerModel model = new TokenizerModel(is);
		tokenizer = new TokenizerME(model);
	} catch (Exception ex) {
		ex.printStackTrace();
	}
}
 
Example #15
Source File: OpenNlpModule.java    From SciGraph with Apache License 2.0 5 votes vote down vote up
/**
 * Provides a tokenizer built from the bundled English OpenNLP model.
 *
 * @return a TokenizerME loaded from "/opennlp/en-token.bin"
 * @throws IOException if the model resource cannot be read
 */
@CheckedProvides(TokenizerProvider.class)
Tokenizer getTokenizer() throws IOException {
  try (InputStream modelStream = getClass().getResourceAsStream("/opennlp/en-token.bin")) {
    return new TokenizerME(new TokenizerModel(modelStream));
  }
}
 
Example #16
Source File: BasicActions.java    From knowledge-extraction with Apache License 2.0 5 votes vote down vote up
/**
 * Tokenizes TEST_PHRASE with the English model and prints the result.
 *
 * @return the tokens, or an empty array if the model cannot be read
 */
public String[] testTokenizer(){
	String[] result = {};
	try (InputStream modelStream = BasicActions.class.getClassLoader()
			.getResourceAsStream(Consts.EN_TOKEN_MODEL)) {
		Tokenizer tokenizer = new TokenizerME(new TokenizerModel(modelStream));
		result = tokenizer.tokenize(TEST_PHRASE);
		System.out.println(Arrays.toString(result));
	} catch (IOException e) {
		e.printStackTrace();
	}
	return result;
}
 
Example #17
Source File: Chapter4.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Runs several OpenNLP NER models (person, location, organization) over the
 * shared sentences, collecting and printing each entity with its sentence
 * index and span.
 */
private static void usingMultipleNERModels() {
    // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin 
    // en-ner-organization.bin en-ner-time.bin
    // try-with-resources closes the tokenizer model stream (leaked originally).
    try (InputStream tokenStream = new FileInputStream(
            new File(getModelDir(), "en-token.bin"))) {
        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);

        String modelNames[] = {"en-ner-person.bin", "en-ner-location.bin",
            "en-ner-organization.bin"};
        ArrayList<String> list = new ArrayList<>();  // diamond fixes the raw ArrayList
        for (String name : modelNames) {
            TokenNameFinderModel entityModel;
            // Each NER model stream is closed as well (also leaked originally).
            try (InputStream entityStream = new FileInputStream(
                    new File(getModelDir(), name))) {
                entityModel = new TokenNameFinderModel(entityStream);
            }
            NameFinderME nameFinder = new NameFinderME(entityModel);
            for (int index = 0; index < sentences.length; index++) {
                String tokens[] = tokenizer.tokenize(sentences[index]);
                for (Span span : nameFinder.find(tokens)) {
                    list.add("Sentence: " + index
                            + " Span: " + span.toString() + " Entity: "
                            + tokens[span.getStart()]);
                }
            }
        }
        System.out.println("Multiple Entities");
        for (String element : list) {
            System.out.println(element);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
 
Example #18
Source File: OpenNLPOpsFactory.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a tokenizer op for the named model, or a no-op tokenizer when no
 * model name is given.
 *
 * @param modelName key into the tokenizerModels cache; may be null
 * @return an NLPTokenizerOp wrapping the cached model (or none)
 * @throws IOException declared for API symmetry with the other factory methods
 */
public static NLPTokenizerOp getTokenizer(String modelName) throws IOException {
  if (modelName == null) {
    return new NLPTokenizerOp();
  }
  // NOTE(review): a name absent from the cache yields a null model here —
  // confirm callers register models before requesting them.
  return new NLPTokenizerOp(tokenizerModels.get(modelName));
}
 
Example #19
Source File: OpenNLPAnnotator.java    From Stargraph with MIT License 5 votes vote down vote up
/**
 * Reads the tokenizer model for the given language from the models
 * directory (file name pattern: "&lt;lang&gt;-token.bin").
 *
 * @param language language whose model should be loaded
 * @return the deserialized TokenizerModel
 * @throws StarGraphException if the model file cannot be read
 */
private TokenizerModel readTokenizerModel(Language language) {
    logger.debug(marker, "Reading tokenizer model for {}", language);
    String fileName = String.format("%s-token.bin", language.name().toLowerCase());
    File modelFile = new File(modelsDir, fileName);
    try (InputStream modelStream = new FileInputStream(modelFile)) {
        return new TokenizerModel(modelStream);
    } catch (IOException e) {
        throw new StarGraphException("Can't read '" + modelFile + "'", e);
    }
}
 
Example #20
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Tokenizes the shared paragraph with the English model and prints one
 * token per line.
 */
private static void usingTheTokenizerMEClass() {
    // try-with-resources closes the model stream (leaked in the original).
    try (InputStream modelIn = new FileInputStream(new File(
            getModelDir(), "en-token.bin"))) {
        TokenizerModel model = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(model);
        for (String token : tokenizer.tokenize(paragraph)) {
            System.out.println(token);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example #21
Source File: Chapter4.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Runs several OpenNLP NER models (person, location, organization) over the
 * shared sentences, collecting and printing each entity with its sentence
 * index and span.
 */
private static void usingMultipleNERModels() {
    // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin 
    // en-ner-organization.bin en-ner-time.bin
    // try-with-resources closes the tokenizer model stream (leaked originally).
    try (InputStream tokenStream = new FileInputStream(
            new File(getModelDir(), "en-token.bin"))) {
        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);

        String modelNames[] = {"en-ner-person.bin", "en-ner-location.bin",
            "en-ner-organization.bin"};
        ArrayList<String> list = new ArrayList<>();  // diamond fixes the raw ArrayList
        for (String name : modelNames) {
            TokenNameFinderModel entityModel;
            // Each NER model stream is closed as well (also leaked originally).
            try (InputStream entityStream = new FileInputStream(
                    new File(getModelDir(), name))) {
                entityModel = new TokenNameFinderModel(entityStream);
            }
            NameFinderME nameFinder = new NameFinderME(entityModel);
            for (int index = 0; index < sentences.length; index++) {
                String tokens[] = tokenizer.tokenize(sentences[index]);
                for (Span span : nameFinder.find(tokens)) {
                    list.add("Sentence: " + index
                            + " Span: " + span.toString() + " Entity: "
                            + tokens[span.getStart()]);
                }
            }
        }
        System.out.println("Multiple Entities");
        for (String element : list) {
            System.out.println(element);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
 
Example #22
Source File: NLPTokenizerOp.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a tokenizer op backed by a maximum-entropy tokenizer built from
 * the given model.
 *
 * @param model the OpenNLP tokenizer model to wrap
 */
public NLPTokenizerOp(TokenizerModel model) {
  tokenizer = new TokenizerME(model);
}
 
Example #23
Source File: OpenNLPAnnotator.java    From Stargraph with MIT License 4 votes vote down vote up
/**
 * Returns the cached tokenizer model for the language, loading and caching
 * it on first request via readTokenizerModel.
 *
 * @param language the language whose model is needed
 * @return the (possibly freshly loaded) TokenizerModel
 */
private TokenizerModel getTokenizerModel(Language language) {
    return tokenizerModels.computeIfAbsent(language, this::readTokenizerModel);
}
 
Example #24
Source File: NERDemo.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 4 votes vote down vote up
/**
 * Demonstrates OpenNLP person-name finding: tokenizes one sentence, then
 * several sentences, with en-token.bin, locating PERSON entities via
 * en-ner-person.bin and printing each span and its first token.
 */
public static void main(String args[]){
    String sentences[] = {"Joe was the last person to see Fred. ", 
        "He saw him in Boston at McKenzie's pub at 3:00 where he " 
        + " paid $2.45 for an ale. ", 
        "Joe wanted to go to Vermont for the day to visit a cousin who " 
        + "works at IBM, but Sally and he had to look for Fred"}; 
    String sentence = "He was the last person to see Fred."; 
    // try-with-resources closes both model streams (leaked in the original).
    try (InputStream tokenStream = new FileInputStream(new File(getResourcePath() + "en-token.bin"));
         InputStream modelStream = new FileInputStream(new File(getResourcePath() + "en-ner-person.bin")))
    {
        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        TokenNameFinderModel entityModel = new TokenNameFinderModel(modelStream);
        NameFinderME nameFinder = new NameFinderME(entityModel);

        String tokens1[] = tokenizer.tokenize(sentence);
        for (Span span : nameFinder.find(tokens1)) {
            System.out.println("Span: " + span.toString());
            System.out.println("Entity: " + tokens1[span.getStart()]);
        }

        System.out.println("---------- Multiple Sentences -----------");
        for (String sentence1 : sentences) {
            String tokens[] = tokenizer.tokenize(sentence1);
            for (Span span : nameFinder.find(tokens)) {
                System.out.println("Span: " + span.toString());
                System.out.println("Entity: " + tokens[span.getStart()]);
            }
            System.out.println();
        }
    }
    catch(Exception e){
        System.out.println(e);
    }
}