Java Code Examples for org.apache.lucene.analysis.util.TokenFilterFactory

The following examples show how to use org.apache.lucene.analysis.util.TokenFilterFactory. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: lucene-solr   Source File: AnalysisImpl.java    License: Apache License 2.0 6 votes vote down vote up
@Override
public void addExternalJars(List<String> jarFiles) {
  // Validate each jar path and collect its URL form for the classloader.
  List<URL> jarUrls = new ArrayList<>();

  for (String jarPath : jarFiles) {
    Path path = FileSystems.getDefault().getPath(jarPath);
    if (!Files.exists(path) || !jarPath.endsWith(".jar")) {
      throw new LukeException(String.format(Locale.ENGLISH, "Invalid jar file path: %s", jarPath));
    }
    try {
      jarUrls.add(path.toUri().toURL());
    } catch (IOException e) {
      throw new LukeException(e.getMessage(), e);
    }
  }

  // reload available tokenizers, charfilters, and tokenfilters
  URLClassLoader classLoader = new URLClassLoader(
      jarUrls.toArray(new URL[0]), this.getClass().getClassLoader());
  CharFilterFactory.reloadCharFilters(classLoader);
  TokenizerFactory.reloadTokenizers(classLoader);
  TokenFilterFactory.reloadTokenFilters(classLoader);
}
 
Example 2
Source Project: lucene-solr   Source File: TestSynonymFilterFactory.java    License: Apache License 2.0 6 votes vote down vote up
/** Test that specifying both an analyzer and a tokenizerFactory is rejected */
public void testAnalyzer() throws Exception {
  final String analyzer = CJKAnalyzer.class.getName();
  final String tokenizerFactory = PatternTokenizerFactory.class.getName();

  // Supplying only an analyzer works fine.
  TokenFilterFactory factory = tokenFilterFactory("Synonym",
      "synonyms", "synonyms2.txt",
      "analyzer", analyzer);
  assertNotNull(factory);

  // Supplying a tokenizerFactory in addition must fail with a clear message.
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    tokenFilterFactory("Synonym",
        "synonyms", "synonyms.txt",
        "analyzer", analyzer,
        "tokenizerFactory", tokenizerFactory);
  });
  assertTrue(expected.getMessage().contains("Analyzer and TokenizerFactory can't be specified both"));
}
 
Example 3
Source Project: lucene-solr   Source File: TestCustomAnalyzer.java    License: Apache License 2.0 6 votes vote down vote up
public void testWhitespaceFactoryWithFolding() throws Exception {
  // Build the chain from factory classes (as opposed to SPI names).
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
      .addTokenFilter(LowerCaseFilterFactory.class)
      .build();

  // Verify chain structure: whitespace tokenizer, no char filters, two token filters.
  assertSame(WhitespaceTokenizerFactory.class, analyzer.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), analyzer.getCharFilterFactories());
  List<TokenFilterFactory> filters = analyzer.getTokenFilterFactories();
  assertEquals(2, filters.size());
  assertSame(ASCIIFoldingFilterFactory.class, filters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, filters.get(1).getClass());
  assertEquals(0, analyzer.getPositionIncrementGap("dummy"));
  assertEquals(1, analyzer.getOffsetGap("dummy"));
  assertSame(Version.LATEST, analyzer.getVersion());

  // ASCII input: just lowercased. Accented input: folded token plus the
  // preserved original at the same position (increment 0).
  assertAnalyzesTo(analyzer, "foo bar FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[] { 1, 1, 1, 1 });
  assertAnalyzesTo(analyzer, "föó bär FÖÖ BAR",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[] { 1, 0, 1, 0, 1, 0, 1 });
  analyzer.close();
}
 
Example 4
Source Project: lucene-solr   Source File: TestCustomAnalyzer.java    License: Apache License 2.0 6 votes vote down vote up
public void testWhitespaceWithFolding() throws Exception {
  // Build the same chain as the factory-class variant, but via SPI names.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer("whitespace")
      .addTokenFilter("asciifolding", "preserveOriginal", "true")
      .addTokenFilter("lowercase")
      .build();

  // SPI names must resolve to the expected factory classes.
  assertSame(WhitespaceTokenizerFactory.class, analyzer.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), analyzer.getCharFilterFactories());
  List<TokenFilterFactory> filters = analyzer.getTokenFilterFactories();
  assertEquals(2, filters.size());
  assertSame(ASCIIFoldingFilterFactory.class, filters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, filters.get(1).getClass());
  assertEquals(0, analyzer.getPositionIncrementGap("dummy"));
  assertEquals(1, analyzer.getOffsetGap("dummy"));
  assertSame(Version.LATEST, analyzer.getVersion());

  // Folding keeps originals of accented tokens at position increment 0.
  assertAnalyzesTo(analyzer, "foo bar FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[] { 1, 1, 1, 1 });
  assertAnalyzesTo(analyzer, "föó bär FÖÖ BAR",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[] { 1, 0, 1, 0, 1, 0, 1 });
  analyzer.close();
}
 
Example 5
Source Project: lucene-solr   Source File: TestCustomAnalyzer.java    License: Apache License 2.0 6 votes vote down vote up
public void testStopWordsFromClasspath() throws Exception {
  // The "words" parameter is resolved as a classpath resource here.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter("stop",
          "ignoreCase", "true",
          "words", "org/apache/lucene/analysis/custom/teststop.txt",
          "format", "wordset")
      .build();

  // Chain: whitespace tokenizer, no char filters, a single stop filter.
  assertSame(WhitespaceTokenizerFactory.class, analyzer.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), analyzer.getCharFilterFactories());
  List<TokenFilterFactory> filters = analyzer.getTokenFilterFactories();
  assertEquals(1, filters.size());
  assertSame(StopFilterFactory.class, filters.get(0).getClass());
  assertEquals(0, analyzer.getPositionIncrementGap("dummy"));
  assertEquals(1, analyzer.getOffsetGap("dummy"));
  assertSame(Version.LATEST, analyzer.getVersion());

  // Every input token is a stop word, so analysis yields no tokens at all.
  assertAnalyzesTo(analyzer, "foo Foo Bar", new String[0]);
  analyzer.close();
}
 
Example 6
Source Project: lucene-solr   Source File: TestAsciiFoldingFilterFactory.java    License: Apache License 2.0 6 votes vote down vote up
public void testMultiTermAnalysis() throws IOException {
  // Default factory: accents are folded on both create() and normalize() paths.
  TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
  TokenStream ts = factory.create(new CannedTokenStream(new Token("Été", 0, 3)));
  assertTokenStreamContents(ts, new String[] { "Ete" });

  ts = factory.normalize(new CannedTokenStream(new Token("Été", 0, 3)));
  assertTokenStreamContents(ts, new String[] { "Ete" });

  // With preserveOriginal=true, create() emits the folded token plus the
  // original, while normalize() still emits only the folded form.
  factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
  ts = factory.create(new CannedTokenStream(new Token("Été", 0, 3)));
  assertTokenStreamContents(ts, new String[] { "Ete", "Été" });

  ts = factory.normalize(new CannedTokenStream(new Token("Été", 0, 3)));
  assertTokenStreamContents(ts, new String[] { "Ete" });
}
 
Example 7
Source Project: lucene-solr   Source File: SolrQueryParserBase.java    License: Apache License 2.0 6 votes vote down vote up
protected ReversedWildcardFilterFactory getReversedWildcardFilterFactory(FieldType fieldType) {
  if (leadingWildcards == null) leadingWildcards = new HashMap<>();
  // A cached null is meaningful: it records "no such factory" for this field
  // type, so containsKey distinguishes a negative cache entry from a miss.
  ReversedWildcardFilterFactory cached = leadingWildcards.get(fieldType);
  if (cached != null || leadingWildcards.containsKey(fieldType)) {
    return cached;
  }

  ReversedWildcardFilterFactory found = null;
  Analyzer indexAnalyzer = fieldType.getIndexAnalyzer();
  if (indexAnalyzer instanceof TokenizerChain) {
    // examine the indexing analysis chain if it supports leading wildcards
    TokenizerChain chain = (TokenizerChain) indexAnalyzer;
    for (TokenFilterFactory filterFactory : chain.getTokenFilterFactories()) {
      if (filterFactory instanceof ReversedWildcardFilterFactory) {
        found = (ReversedWildcardFilterFactory) filterFactory;
        break;
      }
    }
  }

  // Cache the result (possibly null) for subsequent lookups.
  leadingWildcards.put(fieldType, found);
  return found;
}
 
Example 8
/**
 * Helper method, public for testing purposes only.
 * <p>
 * Given an analyzer, inspects it to determine if:
 * <ul>
 *  <li>it is a {@link TokenizerChain}</li>
 *  <li>it contains exactly one instance of {@link ShingleFilterFactory}</li>
 * </ul>
 * <p>
 * If these conditions are met, then this method returns the <code>maxShingleSize</code>
 * in effect for this analyzer, otherwise returns -1.
 * </p>
 *
 * @param analyzer An analyzer to inspect
 * @return <code>maxShingleSize</code> if available
 * @lucene.internal
 */
public static int getMaxShingleSize(Analyzer analyzer) {
  if (!(analyzer instanceof TokenizerChain)) {
    return -1;
  }

  final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
  if (factories.length == 0) {
    return -1;
  }
  int result = -1;
  for (TokenFilterFactory tff : factories) {
    if (tff instanceof ShingleFilterFactory) {
      if (result > 0) {
        // more than one shingle factory in our analyzer, which is weird, so make no assumptions...
        return -1;
      }
      // would be nice if there was an easy way to just ask a factory for the effective value
      // of an argument...
      final Map<String,String> args = tff.getOriginalArgs();
      result = args.containsKey("maxShingleSize")
        ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
    }
  }
  return result;
}
 
Example 9
Source Project: lucene-solr   Source File: PayloadUtils.java    License: Apache License 2.0 6 votes vote down vote up
public static String getPayloadEncoder(FieldType fieldType) {
  // TODO: support custom payload encoding fields too somehow - maybe someone has a custom component that encodes payloads as floats
  Analyzer indexAnalyzer = fieldType.getIndexAnalyzer();
  if (!(indexAnalyzer instanceof TokenizerChain)) {
    return null;
  }
  // examine the indexing analysis chain for DelimitedPayloadTokenFilterFactory or NumericPayloadTokenFilterFactory
  for (TokenFilterFactory factory : ((TokenizerChain) indexAnalyzer).getTokenFilterFactories()) {
    if (factory instanceof DelimitedPayloadTokenFilterFactory) {
      // may be null if the factory was configured without an encoder attribute
      return factory.getOriginalArgs().get(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR);
    }
    if (factory instanceof NumericPayloadTokenFilterFactory) {
      // encodes using `PayloadHelper.encodeFloat(payload)`
      return "float";
    }
  }
  return null;
}
 
Example 10
Source Project: lucene-solr   Source File: CustomAnalyzerStrField.java    License: Apache License 2.0 6 votes vote down vote up
public CustomAnalyzerStrField() {
  Random rand = LuceneTestCase.random();

  // two arg constructor
  Analyzer twoArg = new TokenizerChain
    (new KeywordTokenizerFactory(new HashMap<>()),
     rand.nextBoolean() ? null : new TokenFilterFactory[0]);

  // three arg constructor
  Analyzer threeArg = new TokenizerChain
    (rand.nextBoolean() ? null : new CharFilterFactory[0],
     new KeywordTokenizerFactory(new HashMap<>()),
     rand.nextBoolean() ? null : new TokenFilterFactory[0]);

  // Randomly decide which analyzer serves indexing and which serves querying.
  if (rand.nextBoolean()) {
    indexAnalyzer = twoArg;
    queryAnalyzer = threeArg;
  } else {
    queryAnalyzer = twoArg;
    indexAnalyzer = threeArg;
  }
}
 
Example 11
Source Project: lucene-solr   Source File: ResourceLoaderTest.java    License: Apache License 2.0 6 votes vote down vote up
public void testCacheWrongType() throws Exception {
  clearCache();

  SolrResourceLoader resourceLoader = new SolrResourceLoader();
  @SuppressWarnings({"rawtypes"})
  Class[] params = { Map.class };
  Map<String,String> args = Map.of("minGramSize", "1", "maxGramSize", "2");
  final String className = "solr.NGramTokenizerFactory";

  // We could fail here since the class name and expected type don't match,
  // but instead we try to infer what the user actually meant.
  TokenFilterFactory tff = resourceLoader.newInstance(className, TokenFilterFactory.class, new String[0], params, new Object[]{new HashMap<>(args)});
  assertNotNull("Did not load TokenFilter when asking for corresponding Tokenizer", tff);

  // This should work, but won't if the earlier call succeeded and corrupted the cache.
  TokenizerFactory tf = resourceLoader.newInstance(className, TokenizerFactory.class, new String[0], params, new Object[]{new HashMap<>(args)});
  assertNotNull("Did not load Tokenizer after bad call earlier", tf);
  resourceLoader.close();
}
 
Example 12
Source Project: Elasticsearch   Source File: PluginsService.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 */
static void reloadLuceneSPI(ClassLoader loader) {
    // do NOT change the order of these method calls!
    // Each reload* call re-runs Lucene's ServiceLoader-based SPI discovery
    // against the supplied classloader.

    // Codecs: the per-format reloads run before Codec.reloadCodecs so the
    // codec layer sees the refreshed formats.
    PostingsFormat.reloadPostingsFormats(loader);
    DocValuesFormat.reloadDocValuesFormats(loader);
    Codec.reloadCodecs(loader);
    // Analysis: refresh char filters, token filters, and tokenizers.
    CharFilterFactory.reloadCharFilters(loader);
    TokenFilterFactory.reloadTokenFilters(loader);
    TokenizerFactory.reloadTokenizers(loader);
}
 
Example 13
/**
 * Adds the multi-term variant of {@code current} to the appropriate component
 * list; non-{@link MultiTermAwareComponent} inputs are silently ignored.
 */
public void add(Object current) {
  if (!(current instanceof MultiTermAwareComponent)) {
    return;
  }
  AbstractAnalysisFactory newComponent = ((MultiTermAwareComponent) current).getMultiTermComponent();
  if (newComponent instanceof TokenFilterFactory) {
    // Lists are created lazily; most analyzers have few multi-term components.
    if (filters == null) {
      filters = new ArrayList<>(2);
    }
    filters.add((TokenFilterFactory) newComponent);
  } else if (newComponent instanceof TokenizerFactory) {
    tokenizer = (TokenizerFactory) newComponent;
  } else if (newComponent instanceof CharFilterFactory) {
    if (charFilters == null) {
      charFilters = new ArrayList<>(1);
    }
    charFilters.add((CharFilterFactory) newComponent);
  } else {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
  }
}
 
Example 14
Source Project: lucene-solr   Source File: AnalyzerFactoryTask.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * This method looks up a class with its fully qualified name (FQN), or a short-name
 * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis."
 * as the package prefix (e.g. "standard.ClassicTokenizerFactory" -&gt;
 * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
 *
 * If className contains a period, the class is first looked up as-is, assuming that it
 * is an FQN.  If this fails, lookup is retried after prepending the Lucene analysis
 * package prefix to the class name.
 *
 * If className does not contain a period, the analysis SPI *Factory.lookupClass()
 * methods are used to find the class.
 *
 * @param className The name or the short name of the class.
 * @param expectedType The superclass className is expected to extend
 * @return the loaded class.
 * @throws ClassNotFoundException if lookup fails
 */
public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
    throws ClassNotFoundException {
  if (className.contains(".")) {
    try {
      // First, try className == FQN
      return Class.forName(className).asSubclass(expectedType);
    } catch (ClassNotFoundException e) {
      try {
        // Second, retry lookup after prepending the Lucene analysis package prefix
        return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
      } catch (ClassNotFoundException e1) {
        // Fix: preserve the underlying failures instead of discarding them —
        // the prefixed-lookup failure becomes the cause, the FQN failure is
        // attached as a suppressed exception for diagnosis.
        ClassNotFoundException failure = new ClassNotFoundException("Can't find class '" + className
                                         + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'", e1);
        failure.addSuppressed(e);
        throw failure;
      }
    }
  }
  // No dot - use analysis SPI lookup
  final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
  if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
    return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
    return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
    return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  }

  throw new ClassNotFoundException("Can't find class '" + className + "'");
}
 
Example 15
Source Project: lucene-solr   Source File: AnalyzerFactory.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds a factory describing a full analysis chain. The tokenizer is the
 * only mandatory component; char/token filter lists are stored as given.
 */
public AnalyzerFactory(List<CharFilterFactory> charFilterFactories,
                       TokenizerFactory tokenizerFactory,
                       List<TokenFilterFactory> tokenFilterFactories) {
  assert null != tokenizerFactory;
  this.charFilterFactories = charFilterFactories;
  this.tokenizerFactory = tokenizerFactory;
  this.tokenFilterFactories = tokenFilterFactories;
}
 
Example 16
Source Project: lucene-solr   Source File: AnalyzerFactory.java    License: Apache License 2.0 5 votes vote down vote up
/** Renders the full chain as {@code AnalyzerFactory(name:…, …, tokenizer, filters…)}. */
@Override
public String toString() {
  StringBuilder sb = new StringBuilder("AnalyzerFactory(");
  // Optional scalar attributes come first, each with a trailing separator.
  if (name != null) {
    sb.append("name:").append(name).append(", ");
  }
  if (positionIncrementGap != null) {
    sb.append("positionIncrementGap:").append(positionIncrementGap).append(", ");
  }
  if (offsetGap != null) {
    sb.append("offsetGap:").append(offsetGap).append(", ");
  }
  // Char filters precede the tokenizer; token filters follow it.
  for (CharFilterFactory charFilterFactory : charFilterFactories) {
    sb.append(charFilterFactory).append(", ");
  }
  sb.append(tokenizerFactory);
  for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
    sb.append(", ").append(tokenFilterFactory);
  }
  return sb.append(')').toString();
}
 
Example 17
Source Project: lucene-solr   Source File: TestFactories.java    License: Apache License 2.0 5 votes vote down vote up
/** Exercises every analysis factory registered via SPI. */
public void test() throws IOException {
  for (String name : TokenizerFactory.availableTokenizers()) {
    doTestTokenizer(name);
  }

  for (String name : TokenFilterFactory.availableTokenFilters()) {
    doTestTokenFilter(name);
  }

  for (String name : CharFilterFactory.availableCharFilters()) {
    doTestCharFilter(name);
  }
}
 
Example 18
Source Project: lucene-solr   Source File: TestFactories.java    License: Apache License 2.0 5 votes vote down vote up
private void doTestTokenFilter(String tokenfilter) throws IOException {
  Class<? extends TokenFilterFactory> factoryClazz = TokenFilterFactory.lookupClass(tokenfilter);
  TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz);
  if (factory == null) {
    // could not be instantiated with the generated args; nothing more to check
    return;
  }
  // we managed to fully create an instance. check a few more things:
  if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
    // beast it just a little, it shouldnt throw exceptions:
    // (it should have thrown them in initialize)
    Analyzer a = new FactoryAnalyzer(assertingTokenizer, factory, null);
    checkRandomData(random(), a, 3, 20, false, false);
    a.close();
  }
}
 
Example 19
Source Project: lucene-solr   Source File: CustomAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
// Package-private: instances are built via CustomAnalyzer.Builder.
// Wires the analysis chain verbatim; posIncGap/offsetGap may be null,
// in which case the Analyzer defaults apply.
CustomAnalyzer(Version defaultMatchVersion, CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] tokenFilters, Integer posIncGap, Integer offsetGap) {
  this.charFilters = charFilters;
  this.tokenizer = tokenizer;
  this.tokenFilters = tokenFilters;
  this.posIncGap = posIncGap;
  this.offsetGap = offsetGap;
  // Only override the analyzer version when one was explicitly requested.
  if (defaultMatchVersion != null) {
    setVersion(defaultMatchVersion);
  }
}
 
Example 20
Source Project: lucene-solr   Source File: CustomAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
/** Builds the token stream: tokenizer first, then each filter in order. */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = tokenizer.create(attributeFactory(fieldName));
  TokenStream sink = source;
  for (final TokenFilterFactory filterFactory : tokenFilters) {
    sink = filterFactory.create(sink);
  }
  return new TokenStreamComponents(source, sink);
}
 
Example 21
Source Project: lucene-solr   Source File: CustomAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
/** Applies each token filter's normalization pass to {@code in}, in chain order. */
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  TokenStream ts = in;
  for (final TokenFilterFactory filterFactory : tokenFilters) {
    ts = filterFactory.normalize(ts);
  }
  return ts;
}
 
Example 22
Source Project: lucene-solr   Source File: CustomAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
/** Adds the given token filter.
 * @param factory class that is used to create the token filter.
 * @param params the map of parameters to be passed to factory. The map must be modifiable.
 */
public Builder addTokenFilter(Class<? extends TokenFilterFactory> factory, Map<String,String> params) throws IOException {
  // Fix: the message previously read "TokenFilter name may not be null" —
  // copy-pasted from the String-keyed overload — but this overload takes a
  // factory class, not a name.
  Objects.requireNonNull(factory, "TokenFilter factory may not be null");
  tokenFilters.add(applyResourceLoader(newFactoryClassInstance(factory, applyDefaultParams(params))));
  componentsAdded = true;
  return this;
}
 
Example 23
Source Project: lucene-solr   Source File: CustomAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
/** Adds the given token filter.
 * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}.
 *  The list of possible names can be looked up with {@link TokenFilterFactory#availableTokenFilters()}.
 * @param params the map of parameters to be passed to factory. The map must be modifiable.
 */
public Builder addTokenFilter(String name, Map<String,String> params) throws IOException {
  Objects.requireNonNull(name, "TokenFilter name may not be null");
  // Resolve via SPI, then give the factory a chance to load its resources.
  TokenFilterFactory factory = TokenFilterFactory.forName(name, applyDefaultParams(params));
  tokenFilters.add(applyResourceLoader(factory));
  componentsAdded = true;
  return this;
}
 
Example 24
Source Project: lucene-solr   Source File: CustomAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
/** Builds the analyzer. */
public CustomAnalyzer build() {
  // A tokenizer is the one mandatory component of the chain.
  if (tokenizer.get() == null) {
    throw new IllegalStateException("You have to set at least a tokenizer.");
  }
  return new CustomAnalyzer(
      defaultMatchVersion.get(),
      charFilters.toArray(new CharFilterFactory[0]),
      tokenizer.get(),
      tokenFilters.toArray(new TokenFilterFactory[0]),
      posIncGap.get(),
      offsetGap.get());
}
 
Example 25
Source Project: lucene-solr   Source File: ProtectedTermFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
private void populateInnerFilters(LinkedHashMap<String, Map<String, String>> wrappedFilterArgs) {
  final List<TokenFilterFactory> innerFilters = new ArrayList<>();
  wrappedFilterArgs.forEach((filterName, filterArgs) -> {
    // Keys are formatted as SPIname[-id]; strip the "-id" suffix, if any,
    // before the SPI lookup.
    final int idSuffixPos = filterName.indexOf(FILTER_NAME_ID_SEPARATOR);
    final String spiName = (idSuffixPos == -1) ? filterName : filterName.substring(0, idSuffixPos);
    innerFilters.add(TokenFilterFactory.forName(spiName, filterArgs));
  });
  setInnerFilters(innerFilters);
}
 
Example 26
Source Project: lucene-solr   Source File: ConditionalTokenFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/** Wraps {@code input} with the inner filters, applied conditionally. */
@Override
public TokenStream create(TokenStream input) {
  // With no inner filters there is nothing to wrap.
  if (innerFilters == null || innerFilters.isEmpty()) {
    return input;
  }
  Function<TokenStream, TokenStream> innerStream = ts -> {
    for (TokenFilterFactory innerFactory : innerFilters) {
      ts = innerFactory.create(ts);
    }
    return ts;
  };
  return create(input, innerStream);
}
 
Example 27
Source Project: lucene-solr   Source File: ConditionalTokenFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/** Forwards the resource loader to loader-aware inner filters, then to subclasses. */
@Override
public final void inform(ResourceLoader loader) throws IOException {
  if (innerFilters != null) {
    for (TokenFilterFactory innerFactory : innerFilters) {
      if (innerFactory instanceof ResourceLoaderAware) {
        ((ResourceLoaderAware) innerFactory).inform(loader);
      }
    }
    // Let the concrete subclass load its own resources last.
    doInform(loader);
  }
}
 
Example 28
Source Project: lucene-solr   Source File: TestSynonymFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/** checks for synonyms of "GB" in synonyms.txt */
private void checkSolrSynonyms(TokenFilterFactory factory) throws Exception {
  TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("GB")));
  assertTrue(ts instanceof SynonymFilter);
  // All synonyms are stacked at the same position (increment 0).
  assertTokenStreamContents(ts,
      new String[] { "GB", "gib", "gigabyte", "gigabytes" },
      new int[] { 1, 0, 0, 0 });
}
 
Example 29
Source Project: lucene-solr   Source File: TestSynonymFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/** checks for synonyms of "second" in synonyms-wordnet.txt */
private void checkWordnetSynonyms(TokenFilterFactory factory) throws Exception {
  TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("second")));
  assertTrue(ts instanceof SynonymFilter);
  // All synonyms are stacked at the same position (increment 0).
  assertTokenStreamContents(ts,
      new String[] { "second", "2nd", "two" },
      new int[] { 1, 0, 0 });
}
 
Example 30
Source Project: lucene-solr   Source File: TestSynonymFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/** Test that we can parse TokenizerFactory's arguments */
public void testTokenizerFactoryArguments() throws Exception {
  final String clazz = PatternTokenizerFactory.class.getName();

  // simple arg form
  TokenFilterFactory factory = tokenFilterFactory("Synonym", 
      "synonyms", "synonyms.txt", 
      "tokenizerFactory", clazz,
      "pattern", "(.*)",
      "group", "0");
  assertNotNull(factory);

  // "tokenizerFactory."-prefixed arg form
  factory = tokenFilterFactory("Synonym", 
      "synonyms", "synonyms.txt", 
      "tokenizerFactory", clazz,
      "tokenizerFactory.pattern", "(.*)",
      "tokenizerFactory.group", "0");
  assertNotNull(factory);

  // sanity check that sub-PatternTokenizerFactory fails w/o pattern
  expectThrows(Exception.class, () -> {
    tokenFilterFactory("Synonym", 
        "synonyms", "synonyms.txt", 
        "tokenizerFactory", clazz);
  });

  // sanity check that sub-PatternTokenizerFactory fails on unexpected args
  expectThrows(Exception.class, () -> {
    tokenFilterFactory("Synonym", 
        "synonyms", "synonyms.txt", 
        "tokenizerFactory", clazz,
        "tokenizerFactory.pattern", "(.*)",
        "tokenizerFactory.bogusbogusbogus", "bogus",
        "tokenizerFactory.group", "0");
  });
}