Java Code Examples for org.apache.lucene.analysis.custom.CustomAnalyzer

The following examples show how to use org.apache.lucene.analysis.custom.CustomAnalyzer. They are extracted from open source projects; where known, each example notes its original project, source file, and license.
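All of the snippets below follow the same basic pattern: obtain a CustomAnalyzer.Builder, configure a tokenizer plus optional char filters and token filters (by factory class or by SPI name), call build(), then analyze text. A minimal, self-contained sketch of that pattern (not drawn from any of the projects below; the field name "body" and the sample text are illustrative):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CustomAnalyzerDemo {
    public static void main(String[] args) throws IOException {
        // Tokenize on word boundaries, then lowercase each token.
        CustomAnalyzer analyzer = CustomAnalyzer.builder()
                .withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class)
                .build();
        // Standard TokenStream consumer workflow: reset, incrementToken, end, close.
        try (TokenStream ts = analyzer.tokenStream("body", "Apache Lucene")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // prints "apache", then "lucene"
            }
            ts.end();
        }
    }
}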
Example 1
Source Project: Quelea   Source File: SongSearchIndex.java   License: GNU General Public License v3.0
/**
 * Create a new empty search index.
 */
public SongSearchIndex() {
    songs = new HashMap<>();
    try {
        analyzer = CustomAnalyzer.builder()
                .withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class)
                .addTokenFilter(ASCIIFoldingFilterFactory.class)
                .build();
        index = new MMapDirectory(Files.createTempDirectory("quelea-mmap-song").toAbsolutePath());
    }
    catch(IOException ex) {
        LOGGER.log(Level.SEVERE, "Couldn't create song search index");
        throw new RuntimeException("Couldn't create song search index", ex);
    }
}
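The analyzer and MMapDirectory created above would typically be wired into an IndexWriter next. A minimal sketch of that step (the field name and text are invented for illustration; imports from org.apache.lucene.index and org.apache.lucene.document are omitted, as in the surrounding examples):

try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) {
    Document doc = new Document();
    doc.add(new TextField("lyrics", "Amazing grace, how sweet the sound", Field.Store.YES));
    writer.addDocument(doc); // analyzed with the CustomAnalyzer built above
}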
 
Example 2
Source Project: lucene-solr   Source File: AnalysisImplTest.java   License: Apache License 2.0
@Test
public void testAnalyze_custom() {
  AnalysisImpl analysis = new AnalysisImpl();
  Map<String, String> tkParams = new HashMap<>();
  tkParams.put("maxTokenLen", "128");
  CustomAnalyzerConfig.Builder builder = new CustomAnalyzerConfig.Builder(
      "keyword", tkParams)
      .addTokenFilterConfig("lowercase", Collections.emptyMap());
  CustomAnalyzer analyzer = (CustomAnalyzer) analysis.buildCustomAnalyzer(builder.build());
  assertEquals("org.apache.lucene.analysis.custom.CustomAnalyzer", analyzer.getClass().getName());
  assertEquals("org.apache.lucene.analysis.core.KeywordTokenizerFactory", analyzer.getTokenizerFactory().getClass().getName());
  assertEquals("org.apache.lucene.analysis.core.LowerCaseFilterFactory", analyzer.getTokenFilterFactories().get(0).getClass().getName());

  String text = "Apache Lucene";
  List<Analysis.Token> tokens = analysis.analyze(text);
  assertNotNull(tokens);
}
 
Example 3
Source Project: lucene-solr   Source File: TestOpenNLPPOSFilterFactory.java   License: Apache License 2.0
public void testPOS() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .build();
  assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
      SENTENCES_posTags, null, null, true);

  analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
      .build();
  assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
      null, null, null, true, toPayloads(SENTENCES_posTags));
}
 
Example 4
Source Project: lucene-solr   Source File: NestPathField.java   License: Apache License 2.0
@Override
public void setArgs(IndexSchema schema, Map<String, String> args) {
  args.putIfAbsent("stored", "false");
  args.putIfAbsent("omitTermFreqAndPositions", "true");
  args.putIfAbsent("omitNorms", "true");
  args.putIfAbsent("maxCharsForDocValues", "-1");
  super.setArgs(schema, args);

  // CustomAnalyzer is easy to use
  CustomAnalyzer customAnalyzer;
  try {
    customAnalyzer = CustomAnalyzer.builder(schema.getResourceLoader())
        .withDefaultMatchVersion(schema.getDefaultLuceneMatchVersion())
        .withTokenizer(KeywordTokenizerFactory.class)
        .addTokenFilter(PatternReplaceFilterFactory.class,
            "pattern", "#\\d*",
            "replace", "all")
        .build();
  } catch (IOException e) {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);//impossible?
  }
  // Solr HTTP Schema APIs don't know about CustomAnalyzer so use TokenizerChain instead
  setIndexAnalyzer(new TokenizerChain(customAnalyzer));
  // leave queryAnalyzer as literal
}
 
Example 5
Source Project: HongsCORE   Source File: DemoTest.java   License: MIT License
public static void main(String[] args) throws IOException {
    Analyzer az = CustomAnalyzer.builder()
        //.withTokenizer("Standard")
        .withTokenizer("Name")
        .addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
        //.addTokenFilter("ICUTransform", "id", "Han-Latin;NFD;[[:NonspacingMark:][:Space:]] Remove")
        //.addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
        .build();

    StringReader      sr = new StringReader(args[0]);
    TokenStream       ts = az.tokenStream  ("" , sr);
    OffsetAttribute   oa = ts.addAttribute (OffsetAttribute.class);
    CharTermAttribute ta = ts.addAttribute (CharTermAttribute.class);

    try {
        ts.reset(); // Resets this stream to the beginning. (Required)
        while (ts.incrementToken()) {
            System.out.println(ta.toString() + "|" + ta.length()
                    + "[" + oa.startOffset() + "," + oa.endOffset() + "]");
        }
        ts.end(  ); // Perform end-of-stream operations, e.g. set the final offset.
    } finally {
        ts.close(); // Release resources associated with this stream.
    }

}
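Since TokenStream implements Closeable, the reset/incrementToken/end/close contract demonstrated above is often written with try-with-resources instead of an explicit finally block. A sketch reusing the analyzer az from this example:

    try (TokenStream ts = az.tokenStream("", new StringReader(args[0]))) {
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                     // required before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(ta.toString());
        }
        ts.end();                       // still required; close() alone is not enough
    }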
 
Example 6
Source Project: Quelea   Source File: BibleSearchIndex.java   License: GNU General Public License v3.0
/**
 * Create a new empty search index.
 */
public BibleSearchIndex() {
    chapters = new HashMap<>();
    try {
        analyzer = CustomAnalyzer.builder()
                .withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class)
                .addTokenFilter(ASCIIFoldingFilterFactory.class)
                .build();
        index = new MMapDirectory(Files.createTempDirectory("quelea-mmap-bible").toAbsolutePath());
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, "Couldn't create bible search index");
        throw new RuntimeException("Couldn't create bible search index", ex);
    }
}
 
Example 7
Source Project: airsonic   Source File: AnalyzerFactory.java   License: GNU General Public License v3.0
private Builder createDefaultAnalyzerBuilder() throws IOException {
    Builder builder = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class)
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);
    addTokenFilterForUnderscoreRemovalAroundToken(builder);
    return builder;
}
 
Example 8
Source Project: airsonic   Source File: AnalyzerFactory.java   License: GNU General Public License v3.0
private Builder createArtistAnalyzerBuilder() throws IOException {
    Builder builder = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class)
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);
    addTokenFilterForUnderscoreRemovalAroundToken(builder);
    return builder;
}
 
Example 9
Source Project: lucene-solr   Source File: AnalyzerPaneProvider.java   License: Apache License 2.0
@Override
public void setAnalyzer(Analyzer analyzer) {
  analyzerNameLbl.setText(analyzer.getClass().getName());

  if (analyzer instanceof CustomAnalyzer) {
    CustomAnalyzer customAnalyzer = (CustomAnalyzer) analyzer;

    DefaultListModel<String> charFilterListModel = new DefaultListModel<>();
    customAnalyzer.getCharFilterFactories().stream()
        .map(f -> f.getClass().getSimpleName())
        .forEach(charFilterListModel::addElement);
    charFilterList.setModel(charFilterListModel);

    tokenizerTF.setText(customAnalyzer.getTokenizerFactory().getClass().getSimpleName());

    DefaultListModel<String> tokenFilterListModel = new DefaultListModel<>();
    customAnalyzer.getTokenFilterFactories().stream()
        .map(f -> f.getClass().getSimpleName())
        .forEach(tokenFilterListModel::addElement);
    tokenFilterList.setModel(tokenFilterListModel);

    charFilterList.setBackground(Color.white);
    tokenizerTF.setBackground(Color.white);
    tokenFilterList.setBackground(Color.white);
  } else {
    charFilterList.setModel(new DefaultListModel<>());
    tokenizerTF.setText("");
    tokenFilterList.setModel(new DefaultListModel<>());

    charFilterList.setBackground(Color.lightGray);
    tokenizerTF.setBackground(Color.lightGray);
    tokenFilterList.setBackground(Color.lightGray);
  }
}
 
Example 10
Source Project: lucene-solr   Source File: AnalysisPanelProvider.java   License: Apache License 2.0
void showAnalysisChainDialog() {
  if (getCurrentAnalyzer() instanceof CustomAnalyzer) {
    CustomAnalyzer analyzer = (CustomAnalyzer) getCurrentAnalyzer();
    new DialogOpener<>(analysisChainDialogFactory).open("Analysis chain", 600, 320,
        (factory) -> {
          factory.setAnalyzer(analyzer);
        });
  }
}
 
Example 11
Source Project: lucene-solr   Source File: AnalysisImpl.java   License: Apache License 2.0
@Override
public Analyzer buildCustomAnalyzer(CustomAnalyzerConfig config) {
  Objects.requireNonNull(config);
  try {
    // create builder
    CustomAnalyzer.Builder builder = config.getConfigDir()
        .map(path -> CustomAnalyzer.builder(FileSystems.getDefault().getPath(path)))
        .orElse(CustomAnalyzer.builder());

    // set tokenizer
    builder.withTokenizer(config.getTokenizerConfig().getName(), config.getTokenizerConfig().getParams());

    // add char filters
    for (CustomAnalyzerConfig.ComponentConfig cfConf : config.getCharFilterConfigs()) {
      builder.addCharFilter(cfConf.getName(), cfConf.getParams());
    }

    // add token filters
    for (CustomAnalyzerConfig.ComponentConfig tfConf : config.getTokenFilterConfigs()) {
      builder.addTokenFilter(tfConf.getName(), tfConf.getParams());
    }

    // build analyzer
    this.analyzer = builder.build();
    return analyzer;
  } catch (Exception e) {
    throw new LukeException("Failed to build custom analyzer.", e);
  }
}
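The component names resolved by this builder ("keyword", "htmlstrip", "lowercase", and so on in the tests below) are SPI names registered by the factory classes. If you need to discover which names are available at runtime, the factories expose lookups; a sketch (in Lucene 8.x these classes live in org.apache.lucene.analysis.util; newer releases moved them to org.apache.lucene.analysis):

System.out.println(TokenizerFactory.availableTokenizers());     // e.g. [keyword, standard, whitespace, ...]
System.out.println(CharFilterFactory.availableCharFilters());   // e.g. [htmlstrip, mapping, ...]
System.out.println(TokenFilterFactory.availableTokenFilters()); // e.g. [lowercase, stop, ...]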
 
Example 12
Source Project: lucene-solr   Source File: AnalysisImplTest.java   License: Apache License 2.0
@Test
public void testAnalyzer_custom_with_confdir() throws Exception {
  Path confDir = createTempDir("conf");
  Path stopFile = Files.createFile(Paths.get(confDir.toString(), "stop.txt"));
  Files.write(stopFile, "of\nthe\nby\nfor\n".getBytes(StandardCharsets.UTF_8));

  AnalysisImpl analysis = new AnalysisImpl();
  Map<String, String> tkParams = new HashMap<>();
  tkParams.put("maxTokenLen", "128");
  Map<String, String> tfParams = new HashMap<>();
  tfParams.put("ignoreCase", "true");
  tfParams.put("words", "stop.txt");
  tfParams.put("format", "wordset");
  CustomAnalyzerConfig.Builder builder = new CustomAnalyzerConfig.Builder(
      "whitespace", tkParams)
      .configDir(confDir.toString())
      .addTokenFilterConfig("lowercase", Collections.emptyMap())
      .addTokenFilterConfig("stop", tfParams);
  CustomAnalyzer analyzer = (CustomAnalyzer) analysis.buildCustomAnalyzer(builder.build());
  assertEquals("org.apache.lucene.analysis.custom.CustomAnalyzer", analyzer.getClass().getName());
  assertEquals("org.apache.lucene.analysis.core.WhitespaceTokenizerFactory", analyzer.getTokenizerFactory().getClass().getName());
  assertEquals("org.apache.lucene.analysis.core.LowerCaseFilterFactory", analyzer.getTokenFilterFactories().get(0).getClass().getName());
  assertEquals("org.apache.lucene.analysis.core.StopFilterFactory", analyzer.getTokenFilterFactories().get(1).getClass().getName());

  String text = "Government of the People, by the People, for the People";
  List<Analysis.Token> tokens = analysis.analyze(text);
  assertNotNull(tokens);
}
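Outside of Luke's CustomAnalyzerConfig wrapper, the analyzer this test builds could be created directly with CustomAnalyzer's own config-dir support, which resolves resource files such as stop.txt relative to the given directory. A sketch (the path is illustrative):

  CustomAnalyzer analyzer = CustomAnalyzer.builder(Paths.get("/path/to/conf"))
      .withTokenizer("whitespace", "maxTokenLen", "128")
      .addTokenFilter("lowercase")
      .addTokenFilter("stop", "ignoreCase", "true", "words", "stop.txt", "format", "wordset")
      .build();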
 
Example 13
Source Project: lucene-solr   Source File: AnalysisImplTest.java   License: Apache License 2.0
@Test
public void testAnalyzeStepByStep_custom() {
  AnalysisImpl analysis = new AnalysisImpl();
  Map<String, String> tkParams = new HashMap<>();
  tkParams.put("maxTokenLen", "128");
  CustomAnalyzerConfig.Builder builder = new CustomAnalyzerConfig.Builder("keyword", tkParams)
      .addTokenFilterConfig("lowercase", Collections.emptyMap())
      .addCharFilterConfig("htmlstrip", Collections.emptyMap());
  CustomAnalyzer analyzer = (CustomAnalyzer) analysis.buildCustomAnalyzer(builder.build());
  assertEquals("org.apache.lucene.analysis.custom.CustomAnalyzer", analyzer.getClass().getName());
  assertEquals("org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory",
      analyzer.getCharFilterFactories().get(0).getClass().getName());
  assertEquals("org.apache.lucene.analysis.core.KeywordTokenizerFactory",
      analyzer.getTokenizerFactory().getClass().getName());
  assertEquals("org.apache.lucene.analysis.core.LowerCaseFilterFactory",
      analyzer.getTokenFilterFactories().get(0).getClass().getName());

  String text = "Apache Lucene";
  Analysis.StepByStepResult result = analysis.analyzeStepByStep(text);
  assertNotNull(result);
  assertNotNull(result.getCharfilteredTexts());
  assertEquals(1, result.getCharfilteredTexts().size());
  assertEquals("htmlStrip", result.getCharfilteredTexts().get(0).getName());

  assertNotNull(result.getNamedTokens());
  assertEquals(2, result.getNamedTokens().size());
  //FIXME check each namedTokensList
  assertEquals("keyword", result.getNamedTokens().get(0).getName());
  assertEquals("lowercase", result.getNamedTokens().get(1).getName());
}
 
Example 14
public void testBasic() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
      .build();
  assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
      SENTENCES_chunks, null, null, true);
}
 
Example 15
public void testPayloads() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
      .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
      .build();
  assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
      null, null, null, true, toPayloads(SENTENCES_chunks));
}
 
Example 16
Source Project: lucene-solr   Source File: TestOpenNLPPOSFilterFactory.java   License: Apache License 2.0
public void testBasic() throws IOException {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .build();
  assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
}
 
Example 17
Source Project: lucene-solr   Source File: TestOpenNLPPOSFilterFactory.java   License: Apache License 2.0
public void testNoBreak() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .build();
  assertAnalyzesTo(analyzer, NO_BREAK, NO_BREAK_terms, NO_BREAK_startOffsets, NO_BREAK_endOffsets,
      null, null, null, true);
}
 
Example 18
Source Project: lucene-solr   Source File: TestOpenNLPTokenizerFactory.java   License: Apache License 2.0
@Test
public void testTokenizer() throws IOException {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin", "tokenizerModel", "en-test-tokenizer.bin")
      .build();
  assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
  assertAnalyzesTo(analyzer, SENTENCE1, SENTENCE1_punc);
}
 
Example 19
Source Project: lucene-solr   Source File: TestOpenNLPTokenizerFactory.java   License: Apache License 2.0
@Test
public void testTokenizerNoSentenceDetector() throws IOException {
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
        .withTokenizer("opennlp", "tokenizerModel", "en-test-tokenizer.bin")
        .build();
  });
  assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'sentenceModel'"));
}
 
Example 20
Source Project: lucene-solr   Source File: TestOpenNLPTokenizerFactory.java   License: Apache License 2.0
@Test
public void testTokenizerNoTokenizer() throws IOException {
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
        .withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin")
        .build();
  });
  assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'tokenizerModel'"));
}
 
Example 21
public void test1SentenceDictionaryOnly() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
      .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
      .build();
  assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_dict_punc, null, null,
      SENTENCE_posTags, null, null, true);
}
 
Example 22
public void test2SentencesDictionaryOnly() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
      .build();
  assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_punc, null, null,
      SENTENCES_posTags, null, null, true);
}
 
Example 23
public void test1SentenceMaxEntOnly() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
      .build();
  assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_maxent_punc, null, null,
      SENTENCE_posTags, null, null, true);
}
 
Example 24
public void test2SentencesMaxEntOnly() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .addTokenFilter("OpenNLPLemmatizer", "lemmatizerModel", lemmatizerModelFile)
      .build();
  assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_maxent_punc, null, null,
      SENTENCES_posTags, null, null, true);
}
 
Example 25
public void test1SentenceDictionaryAndMaxEnt() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
      .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict", "lemmatizerModel", lemmatizerModelFile)
      .build();
  assertAnalyzesTo(analyzer, SENTENCE_both, SENTENCE_both_punc, null, null,
      SENTENCE_both_posTags, null, null, true);
}
 
Example 26
public void test2SentencesDictionaryAndMaxEnt() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
      .build();
  assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_punc, null, null,
      SENTENCES_both_posTags, null, null, true);
}
 
Example 27
public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .addTokenFilter(KeywordRepeatFilterFactory.class)
      .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
      .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
      .build();
  assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_keep_orig_punc, null, null,
      SENTENCES_keep_orig_posTags, null, null, true);
}
 
Example 28
public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception {
  CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
      .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
      .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
      .addTokenFilter(KeywordRepeatFilterFactory.class)
      .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
      .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
      .build();
  assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_max_ent_keep_orig_punc, null, null,
      SENTENCES_keep_orig_posTags, null, null, true);
}