package org.apache.lucene.analysis.jate;

import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.concurrent.Exchanger;

/**
 * Factory that builds {@link OpenNLPTokenizer} instances backed by OpenNLP models.
 *
 * <p>Recognised arguments:
 * <ul>
 *   <li>{@code sentenceModel} - resource name of the sentence detector model (optional; when
 *       absent the sentence detector handed to the tokenizer is {@code null});</li>
 *   <li>{@code tokenizerModel} - resource name of the tokenizer model (required, checked in
 *       {@link #inform(ResourceLoader)});</li>
 *   <li>{@code paragraphChunker-class} - fully qualified class name of a
 *       {@link ParagraphChunker} implementation with a no-argument constructor (optional).</li>
 * </ul>
 *
 * <p>The models are only loaded in {@link #inform(ResourceLoader)}; until that method has run,
 * {@link #create(AttributeFactory)} passes {@code null} operators to the tokenizer.
 */
public class OpenNLPTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {

    private SentenceDetector sentenceOp = null;
    private final String sentenceModelFile;
    private opennlp.tools.tokenize.Tokenizer tokenizerOp = null;
    private final String tokenizerModelFile;
    private final String parChunkingClass;
    private ParagraphChunker paragraphChunker;

    /**
     * Creates a new OpenNLPTokenizerFactory.
     *
     * @param args factory arguments; {@code sentenceModel}, {@code tokenizerModel} and
     *             {@code paragraphChunker-class} are read from it (entries are not removed)
     */
    public OpenNLPTokenizerFactory(Map<String, String> args) {
        super(args);
        sentenceModelFile = args.get("sentenceModel");
        tokenizerModelFile = args.get("tokenizerModel");
        parChunkingClass = args.get("paragraphChunker-class");
    }

    /**
     * Builds a tokenizer using the operators prepared by {@link #inform(ResourceLoader)}.
     *
     * @param factory attribute factory forwarded to the tokenizer
     * @return a new {@link OpenNLPTokenizer}; the paragraph chunker is supplied only when one
     *         was configured
     */
    @Override
    public Tokenizer create(AttributeFactory factory) {
        if (paragraphChunker == null) {
            return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp);
        }
        return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp, paragraphChunker);
    }

    /**
     * Loads the configured models through {@code loader} and instantiates the optional
     * paragraph chunker.
     *
     * @param loader used to open the model resources
     * @throws IOException if {@code tokenizerModel} was not supplied, if a model cannot be
     *                     read, or if the paragraph chunker class cannot be instantiated
     */
    @Override
    public void inform(ResourceLoader loader) throws IOException {
        // Validate the mandatory parameter before doing any I/O.
        if (tokenizerModelFile == null) {
            throw new IOException(
                    "Parameter 'tokenizerModel' is required, but is invalid:" + tokenizerModelFile);
        }

        if (sentenceModelFile != null) {
            // Close the stream ourselves once the model has been parsed.
            try (InputStream in = loader.openResource(sentenceModelFile)) {
                sentenceOp = new SentenceDetectorME(new SentenceModel(in));
            }
        }

        try (InputStream in = loader.openResource(tokenizerModelFile)) {
            tokenizerOp = new TokenizerME(new TokenizerModel(in));
        }

        if (parChunkingClass != null) {
            paragraphChunker = instantiateParagraphChunker(parChunkingClass);
        }
    }

    /** Reflectively creates the {@link ParagraphChunker} named by {@code className}. */
    private static ParagraphChunker instantiateParagraphChunker(String className)
            throws IOException {
        try {
            return Class.forName(className)
                    .asSubclass(ParagraphChunker.class)
                    .getDeclaredConstructor()
                    .newInstance();
        } catch (ReflectiveOperationException | RuntimeException e) {
            // Every failure is surfaced as an IOException, keeping the original cause.
            throw new IOException(
                    "Cannot instantiate paragraph chunker class '" + className + "'", e);
        }
    }
}