Java Code Examples for org.apache.lucene.analysis.ngram.NGramTokenizer

The following examples show how to use org.apache.lucene.analysis.ngram.NGramTokenizer. They are extracted from open source projects; each example is labeled with its original project, source file, and license.
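
Most of the examples share the same TokenStream lifecycle: attach a reader, call reset(), loop over incrementToken() while reading a CharTermAttribute, then call end() and close(). The following minimal, self-contained sketch shows that lifecycle in isolation; the input string and gram sizes are illustrative only, and the expected output assumes Lucene 4.4 or later, where grams are emitted in position order:

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NGramTokenizerDemo {
    public static void main(String[] args) throws IOException {
        List<String> grams = new ArrayList<>();
        // Emit all 2-grams and 3-grams of the input.
        try (NGramTokenizer tokenizer = new NGramTokenizer(2, 3)) {
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.setReader(new StringReader("lucene"));
            tokenizer.reset();                 // required before the first incrementToken()
            while (tokenizer.incrementToken()) {
                grams.add(term.toString());    // copy the term; the attribute instance is reused
            }
            tokenizer.end();                   // finalizes end-of-stream state
        }
        System.out.println(grams);             // [lu, luc, uc, uce, ce, cen, en, ene, ne]
    }
}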
Example 1
Source Project: fuzzy-matcher   Source File: Utils.java    License: Apache License 2.0
public static Stream<String> getNGrams(String value, int size) {
    Stream.Builder<String> stringStream = Stream.builder();
    if (value.length() <= size) {
        // Input shorter than the gram size: return it as a single token.
        stringStream.add(value);
    } else {
        // Fixed-size grams: min and max gram length are both 'size'.
        // try-with-resources ensures the tokenizer is closed even if tokenization fails.
        try (NGramTokenizer nGramTokenizer = new NGramTokenizer(size, size)) {
            CharTermAttribute charTermAttribute = nGramTokenizer.addAttribute(CharTermAttribute.class);
            nGramTokenizer.setReader(new StringReader(value));
            nGramTokenizer.reset();
            while (nGramTokenizer.incrementToken()) {
                stringStream.add(charTermAttribute.toString());
            }
            nGramTokenizer.end();
        } catch (IOException io) {
            throw new MatchException("Failure in creating tokens : ", io);
        }
    }
    return stringStream.build();
}
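
For instance, getNGrams("tomato", 3) yields the fixed-size grams "tom", "oma", "mat", "ato", while getNGrams("to", 3) takes the short-input branch and yields "to" unchanged.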
 
Example 2
Source Project: datawave   Source File: NGramTokenizationStrategy.java    License: Apache License 2.0
/**
 * Increments the tokenizer and returns the next n-gram in the stream, or null at some termination state, such as EOS.
 *
 * @param tokenizer
 *            The tokenizer responsible for generating the next available n-gram
 * @return the next n-gram in the stream, or null at some termination state, such as EOS
 */
protected String increment(final NGramTokenizer tokenizer) throws TokenizationException {
    String ngram = super.increment(tokenizer);
    if (null == ngram) {
        try {
            if ((null != tokenizer) && tokenizer.incrementToken()) {
                final CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
                if (null != charTermAttribute) {
                    ngram = charTermAttribute.toString();
                    charTermAttribute.resizeBuffer(0);
                } else {
                    ngram = null;
                }
            } else {
                ngram = null;
            }
        } catch (final IOException e) {
            throw new TokenizationException("Could not get next n-gram from NGramTokenizer", e);
        }
    }
    
    return ngram;
}
 
Example 3
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0
public void testTokenStream2() throws IOException {
  // decomposed input: '㌰' + '゙' (combining dakuten), '5', '℃', '№', '㈱', '㌘', 'サ' + '゙', 'ソ' + '゙'
  String input = "㌰゙5℃№㈱㌘ザゾ";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), 1, 1);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
    new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
    new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
    new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
    input.length()
  );
}
 
Example 4
Source Project: lucene-solr   Source File: TestConditionalTokenFilter.java    License: Apache License 2.0
public void testConsistentOffsets() throws IOException {

    long seed = random().nextLong();
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new NGramTokenizer();
        TokenStream sink = new ValidatingTokenFilter(new KeywordRepeatFilter(source), "stage 0");
        sink = new ValidatingTokenFilter(sink, "stage 1");
        sink = new RandomSkippingFilter(sink, seed, in -> new TypeTokenFilter(in, Collections.singleton("word")));
        sink = new ValidatingTokenFilter(sink, "last stage");
        return new TokenStreamComponents(source, sink);
      }
    };

    checkRandomData(random(), analyzer, 1);

  }
 
Example 5
Source Project: datawave   Source File: WeightedValuePruningStrategy.java    License: Apache License 2.0
@Override
protected String increment(final NGramTokenizer tokenizer) throws TokenizationException {
    this.incrementCount++;
    final String ngram;
    if (this.incrementCount <= this.maxIncrementCount) {
        ngram = super.increment(tokenizer);
    } else {
        ngram = null;
    }
    
    return ngram;
}
 
Example 6
/**
 * Increments the tokenizer and returns the next n-gram in the stream, or null if no n-gram was generated.
 * 
 * @param tokenizer
 *            The tokenizer responsible for generating the next available n-gram
 * @return the next n-gram in the stream, or null if no n-gram was generated
 */
protected String increment(final NGramTokenizer tokenizer) throws TokenizationException {
    final AbstractNGramTokenizationStrategy source = this.getSourceStrategy();
    final String ngram;
    if (null != source) {
        ngram = source.increment(tokenizer);
    } else {
        ngram = null;
    }
    return ngram;
}
 
Example 7
Source Project: onedev   Source File: NGramLuceneQuery.java    License: MIT License
private static PhraseQuery build(String fieldName, String fieldValue, int gramSize) {
	Preconditions.checkArgument(fieldValue.length()>=gramSize);
	PhraseQuery.Builder builder = new PhraseQuery.Builder();
	try (NGramTokenizer tokenizer = new NGramTokenizer(gramSize, gramSize)) {
		tokenizer.setReader(new StringReader(fieldValue.toLowerCase()));
		tokenizer.reset();
		while (tokenizer.incrementToken()) { 
			builder.add(new Term(fieldName, 
					tokenizer.getAttribute(CharTermAttribute.class).toString()));
		}
	} catch (IOException e) {
		throw new RuntimeException(e);
	}
	return builder.build();
}
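
A hedged sketch of how such a query might be executed from within the same class (build is private); the Directory instance, the field name "content", and the gram size 3 are assumptions, and the field must have been indexed with the same gram size:

try (IndexReader reader = DirectoryReader.open(directory)) {  // 'directory' is a pre-existing index (hypothetical)
    IndexSearcher searcher = new IndexSearcher(reader);
    PhraseQuery query = build("content", "needle", 3);        // the grams of "needle" as a phrase
    TopDocs hits = searcher.search(query, 10);
    System.out.println("total hits: " + hits.totalHits);
}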
 
Example 8
Source Project: Elasticsearch   Source File: EdgeNGramTokenizerFactory.java    License: Apache License 2.0
public EdgeNGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.side = Lucene43EdgeNGramTokenizer.Side.getSide(settings.get("side", Lucene43EdgeNGramTokenizer.DEFAULT_SIDE.getLabel()));
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
 
Example 9
Source Project: Elasticsearch   Source File: NGramTokenizerFactory.java    License: Apache License 2.0
NGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
 
Example 10
Source Project: crate   Source File: CommonAnalysisPlugin.java    License: Apache License 2.0
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
    List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
    tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
        () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new, () -> new TokenFilterFactory() {
        @Override
        public String name() {
            return "lowercase";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new LowerCaseFilter(tokenStream);
        }
    }));

    // Temporary shim for aliases. TODO deprecate after they are moved
    tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));

    return tokenizers;
}
 
Example 11
Source Project: crate   Source File: NGramTokenizerFactory.java    License: Apache License 2.0
NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    int ngramDiff = maxGram - minGram;
    if (ngramDiff > maxAllowedNgramDiff) {
        deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
            + " expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
    }
    this.matcher = parseTokenChars(settings.getAsList("token_chars"));
}
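
For reference, the settings this factory reads could be built like this (a sketch; Settings.builder() and putList are the standard Elasticsearch settings API, but the surrounding factory wiring is omitted):

Settings settings = Settings.builder()
    .put("min_gram", 2)
    .put("max_gram", 3)                        // keep max_gram - min_gram within index.max_ngram_diff
    .putList("token_chars", "letter", "digit") // restrict grams to letter and digit characters
    .build();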
 
Example 12
Source Project: crate   Source File: NGramTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create() {
    if (matcher == null) {
        return new NGramTokenizer(minGram, maxGram);
    } else {
        return new NGramTokenizer(minGram, maxGram) {
            @Override
            protected boolean isTokenChar(int chr) {
                return matcher.isTokenChar(chr);
            }
        };
    }
}
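
The isTokenChar override is the supported hook for restricting which characters may appear in a gram; characters that fail the test act as token boundaries. A standalone sketch (not from the crate source) that keeps only letters and digits:

Tokenizer letterDigitGrams = new NGramTokenizer(2, 3) {
    @Override
    protected boolean isTokenChar(int chr) {
        return Character.isLetterOrDigit(chr);  // anything else splits the input into separate runs
    }
};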
 
Example 13
Source Project: onedev   Source File: NGramAnalyzer.java    License: MIT License
@Override
protected TokenStreamComponents createComponents(String fieldName) {
	Tokenizer src = new NGramTokenizer(minGram, maxGram);
	TokenStream stream = new LowerCaseFilter(src);
	return new TokenStreamComponents(src, stream);
}
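
A hedged usage sketch, assuming NGramAnalyzer exposes a (minGram, maxGram) constructor matching the fields used above; the field name "blob" is arbitrary:

try (Analyzer analyzer = new NGramAnalyzer(2, 3);
     TokenStream stream = analyzer.tokenStream("blob", "Hello")) {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        System.out.println(term);   // he, hel, el, ell, ll, llo, lo
    }
    stream.end();
}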
 
Example 14
Source Project: crate   Source File: EdgeNGramTokenizerFactory.java    License: Apache License 2.0
EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsList("token_chars"));
}