org.apache.lucene.analysis.ngram.NGramTokenizer Java Examples

The following examples show how to use org.apache.lucene.analysis.ngram.NGramTokenizer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Utils.java    From fuzzy-matcher with Apache License 2.0 6 votes vote down vote up
/**
 * Splits {@code value} into a stream of n-grams of length {@code size}.
 * A value no longer than {@code size} is returned whole as a single element.
 *
 * @param value the string to tokenize; assumed non-null — TODO confirm callers never pass null
 * @param size  the n-gram length (used as both min and max gram)
 * @return a stream of n-gram tokens, or a single-element stream containing {@code value}
 *         when {@code value.length() <= size}
 * @throws MatchException if tokenization fails with an I/O error
 */
public static Stream<String> getNGrams(String value, int size) {
    Stream.Builder<String> stringStream = Stream.builder();
    if (value.length() <= size) {
        stringStream.add(value);
    } else {
        // try-with-resources guarantees the tokenizer is closed even when reset()/
        // incrementToken() throws — the original only closed on the success path.
        try (NGramTokenizer nGramTokenizer = new NGramTokenizer(size, size)) {
            CharTermAttribute charTermAttribute = nGramTokenizer.addAttribute(CharTermAttribute.class);
            nGramTokenizer.setReader(new StringReader(value));
            nGramTokenizer.reset();
            while (nGramTokenizer.incrementToken()) {
                stringStream.add(charTermAttribute.toString());
            }
            // TokenStream contract: end() after the stream is exhausted, before close()
            nGramTokenizer.end();
        } catch (IOException io) {
            throw new MatchException("Failure in creating tokens : ", io);
        }
    }
    return stringStream.build();
}
 
Example #2
Source File: NGramTokenizationStrategy.java    From datawave with Apache License 2.0 6 votes vote down vote up
/**
 * Increments the tokenizer and returns the next n-gram in the stream, or null at some
 * termination state, such as EOS. The parent strategy is consulted first; the tokenizer
 * itself is only pulled when the parent produced nothing.
 *
 * @param tokenizer
 *            The tokenizer responsible for generating the next available n-gram
 * @return the next n-gram in the stream, or null at some termination state, such as EOS
 */
protected String increment(final NGramTokenizer tokenizer) throws TokenizationException {
    final String parentNgram = super.increment(tokenizer);
    if (null != parentNgram) {
        return parentNgram;
    }
    
    try {
        if ((null == tokenizer) || !tokenizer.incrementToken()) {
            return null;
        }
        
        final CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
        if (null == termAttribute) {
            return null;
        }
        
        final String nextNgram = termAttribute.toString();
        // Shrink the attribute's internal buffer after copying the token out
        termAttribute.resizeBuffer(0);
        return nextNgram;
    } catch (final IOException e) {
        throw new TokenizationException("Could not get next n-gram from NGramTokenizer", e);
    }
}
 
Example #3
Source File: TestICUNormalizer2CharFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testTokenStream2() throws IOException {
  // Input mixes square forms ('㌰', '㈱', '㌘'), symbols ('℃', '№'), a digit, and katakana
  // with a combining voiced-sound mark — all of which NFKC_Casefold normalizes.
  // NOTE(review): the original per-character comment was garbled in extraction ('<<'
  // artifacts); confirm the exact character list against the upstream Lucene test.
  String input = "㌰゙5℃№㈱㌘ザゾ";

  // Normalize with the "nfkc_cf" (NFKC_Casefold) normalizer while reading the input
  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  // 1-grams: each emitted token is a single normalized code point
  Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), 1, 1);
  tokenStream.setReader(reader);

  // Offsets refer to the ORIGINAL (pre-normalization) input; several tokens share
  // offsets because one input character expands to multiple normalized characters.
  assertTokenStreamContents(tokenStream,
    new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
    new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
    new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
    input.length()
  );
}
 
Example #4
Source File: TestConditionalTokenFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Runs random data through an n-gram tokenizer followed by a randomly-skipping filter
 * chain, with ValidatingTokenFilter stages checking stream consistency at each step.
 */
public void testConsistentOffsets() throws IOException {

    long seed = random().nextLong();
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new NGramTokenizer();
        // Validate after every stage so a consistency violation is pinned to its stage
        TokenStream chain = new ValidatingTokenFilter(new KeywordRepeatFilter(tokenizer), "stage 0");
        chain = new ValidatingTokenFilter(chain, "stage 1");
        chain = new RandomSkippingFilter(chain, seed, in -> new TypeTokenFilter(in, Collections.singleton("word")));
        chain = new ValidatingTokenFilter(chain, "last stage");
        return new TokenStreamComponents(tokenizer, chain);
      }
    };

    checkRandomData(random(), analyzer, 1);

  }
 
Example #5
Source File: CommonAnalysisPlugin.java    From crate with Apache License 2.0 6 votes vote down vote up
/**
 * Registers the tokenizers this plugin pre-configures. A {@code null} third argument
 * means the tokenizer has no accompanying multi-term token filter.
 */
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
    final List<PreConfiguredTokenizer> result = new ArrayList<>();
    result.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
    result.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
    result.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
    result.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
    result.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
    result.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
    result.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
    result.add(PreConfiguredTokenizer.singleton("edge_ngram",
        () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
    result.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
    result.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
    // "lowercase" pairs the tokenizer with a matching lower-case token filter factory
    result.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new, () -> new TokenFilterFactory() {
        @Override
        public String name() {
            return "lowercase";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new LowerCaseFilter(tokenStream);
        }
    }));

    // Temporary shim for aliases. TODO deprecate after they are moved
    result.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));

    return result;
}
 
Example #6
Source File: WeightedValuePruningStrategy.java    From datawave with Apache License 2.0 5 votes vote down vote up
/**
 * Increments the tokenizer until the configured budget is spent: once the number of
 * calls exceeds {@code maxIncrementCount}, every subsequent call yields null instead
 * of delegating to the parent strategy.
 *
 * @param tokenizer
 *            The tokenizer responsible for generating the next available n-gram
 * @return the next n-gram in the stream, or null once the increment budget is exhausted
 */
@Override
protected String increment(final NGramTokenizer tokenizer) throws TokenizationException {
    // Every call counts against the budget, even those made after it is exhausted
    this.incrementCount++;
    return (this.incrementCount <= this.maxIncrementCount) ? super.increment(tokenizer) : null;
}
 
Example #7
Source File: AbstractNGramTokenizationStrategy.java    From datawave with Apache License 2.0 5 votes vote down vote up
/**
 * Increments the tokenizer and returns the next n-gram in the stream, or null if no
 * n-gram was generated. This base implementation is delegation-only: it forwards to
 * the configured source strategy when one exists and otherwise produces nothing.
 *
 * @param tokenizer
 *            The tokenizer responsible for generating the next available n-gram
 * @return the next n-gram in the stream, or null if no n-gram was generated
 */
protected String increment(final NGramTokenizer tokenizer) throws TokenizationException {
    final AbstractNGramTokenizationStrategy delegate = this.getSourceStrategy();
    return (null != delegate) ? delegate.increment(tokenizer) : null;
}
 
Example #8
Source File: NGramLuceneQuery.java    From onedev with MIT License 5 votes vote down vote up
/**
 * Builds a phrase query matching {@code fieldValue} as the consecutive sequence of its
 * {@code gramSize}-length n-grams in {@code fieldName}.
 *
 * @param fieldName  the indexed field to query
 * @param fieldValue the literal value to match; must be at least {@code gramSize} characters
 * @param gramSize   the n-gram length (used as both min and max gram)
 * @return the assembled phrase query
 * @throws IllegalArgumentException if {@code fieldValue} is shorter than {@code gramSize}
 * @throws RuntimeException wrapping any I/O failure from the tokenizer
 */
private static PhraseQuery build(String fieldName, String fieldValue, int gramSize) {
	Preconditions.checkArgument(fieldValue.length()>=gramSize);
	PhraseQuery.Builder builder = new PhraseQuery.Builder();
	// NOTE(review): toLowerCase() is locale-sensitive; if the index side lower-cases with a
	// fixed locale (e.g. Locale.ROOT), this should match it — confirm against the analyzer.
	try (NGramTokenizer tokenizer = new NGramTokenizer(gramSize, gramSize)) {
		tokenizer.setReader(new StringReader(fieldValue.toLowerCase()));
		tokenizer.reset();
		while (tokenizer.incrementToken()) { 
			builder.add(new Term(fieldName, 
					tokenizer.getAttribute(CharTermAttribute.class).toString()));
		}
		// TokenStream contract: end() must be called after the stream is exhausted;
		// the original skipped it (close() alone does not finalize stream state).
		tokenizer.end();
	} catch (IOException e) {
		throw new RuntimeException(e);
	}
	return builder.build();
}
 
Example #9
Source File: EdgeNGramTokenizerFactory.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an edge-n-gram tokenizer factory from index settings: reads
 * {@code min_gram}/{@code max_gram}, the {@code side} option, the {@code token_chars}
 * restriction, and the index-creation version.
 *
 * NOTE(review): gram-size defaults are borrowed from NGramTokenizer rather than an
 * edge-specific constant — presumably intentional; confirm.
 */
public EdgeNGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    // "side" selects which edge grams are taken from, via the legacy Lucene 4.3 enum
    this.side = Lucene43EdgeNGramTokenizer.Side.getSide(settings.get("side", Lucene43EdgeNGramTokenizer.DEFAULT_SIDE.getLabel()));
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
 
Example #10
Source File: NGramTokenizerFactory.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an n-gram tokenizer factory from index settings: reads
 * {@code min_gram}/{@code max_gram} (falling back to NGramTokenizer defaults), the
 * {@code token_chars} restriction, and the index-creation version.
 */
NGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    // Index-creation version; presumably drives version-dependent behavior elsewhere — confirm
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
 
Example #11
Source File: NGramTokenizerFactory.java    From crate with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an n-gram tokenizer factory from index settings. Reads {@code min_gram},
 * {@code max_gram} and {@code token_chars}, and logs a deprecation warning when the
 * gram-size spread exceeds the index's configured maximum n-gram diff.
 *
 * @param indexSettings the per-index settings (supplies the max allowed n-gram diff)
 * @param environment   the node environment (unused here; required by the factory contract)
 * @param name          the tokenizer name
 * @param settings      the tokenizer's own settings
 */
NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    int ngramDiff = maxGram - minGram;
    if (ngramDiff > maxAllowedNgramDiff) {
        // Fixed: the two concatenated literals previously ran together as "Tokenizer,expected"
        deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
            + " expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
    }
    this.matcher = parseTokenChars(settings.getAsList("token_chars"));
}
 
Example #12
Source File: NGramTokenizerFactory.java    From crate with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the configured n-gram tokenizer. When a token-character matcher was configured,
 * the returned tokenizer only treats matcher-approved characters as token characters;
 * otherwise the tokenizer's default character handling applies.
 */
@Override
public Tokenizer create() {
    if (matcher != null) {
        // Anonymous subclass routes the token-character decision through the matcher
        return new NGramTokenizer(minGram, maxGram) {
            @Override
            protected boolean isTokenChar(int chr) {
                return matcher.isTokenChar(chr);
            }
        };
    }
    return new NGramTokenizer(minGram, maxGram);
}
 
Example #13
Source File: NGramAnalyzer.java    From onedev with MIT License 4 votes vote down vote up
/**
 * Assembles this analyzer's chain: n-gram tokenization followed by lower-casing.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
	Tokenizer tokenizer = new NGramTokenizer(minGram, maxGram);
	return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
 
Example #14
Source File: EdgeNGramTokenizerFactory.java    From crate with Apache License 2.0 4 votes vote down vote up
/**
 * Creates an edge-n-gram tokenizer factory from index settings: reads
 * {@code min_gram}/{@code max_gram} and the {@code token_chars} restriction.
 *
 * NOTE(review): gram-size defaults come from NGramTokenizer rather than an
 * edge-specific constant — presumably intentional; confirm.
 */
EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsList("token_chars"));
}