Java Code Examples for org.apache.lucene.util.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY

The following examples show how to use org.apache.lucene.util.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY. You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: QueryParserImpl.java    From AdSearch_Endpoints with Apache License 2.0 6 votes vote down vote up
@Override
  public List<String> parseQuery(String queryStr) {
    /**
     * Tokenizes {@code queryStr} with Lucene's StandardTokenizer and removes
     * English stop words. Returns the surviving terms in order of appearance.
     * Note: despite the original comment, no stemming is applied here.
     */
    List<String> tokens = new ArrayList<String>();
    AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    Tokenizer tokenizer = new StandardTokenizer(factory);
    tokenizer.setReader(new StringReader(queryStr));
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    // try-with-resources guarantees the stream (and wrapped tokenizer) is
    // closed even if incrementToken() throws — the original leaked on error.
    try (TokenStream tokenStream = new StopFilter(tokenizer, stopWords)) {
      // Filters share the tokenizer's attribute source, so the term attribute
      // can be obtained from the outermost stream.
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        tokens.add(charTermAttribute.toString());
      }
      tokenStream.end();
    } catch (IOException e) {
      // Tokenization is purely in-memory (StringReader), so this is not
      // expected; degrade gracefully and return what was collected so far.
      e.printStackTrace();
    }
    return tokens;
  }
 
Example 2
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
public void testHugeTerm2() throws Exception {
    // One unbroken run of 40960 'a' characters: the tokenizer must emit it
    // as ten 4096-character tokens instead of overflowing its buffer.
    char[] run = new char[40960];
    Arrays.fill(run, 'a');
    String input = new String(run);
    IcuTokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
            new DefaultIcuTokenizerConfig(false, true));
    tokenizer.setReader(new StringReader(input));
    char[] chunk = new char[4096];
    Arrays.fill(chunk, 'a');
    String expectedToken = new String(chunk);
    String[] expected = new String[10];
    Arrays.fill(expected, expectedToken);
    assertTokenStreamContents(tokenizer, expected);
}
 
Example 3
Source File: BaseTokenStreamTestCase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Returns a randomly chosen AttributeFactory implementation. */
public static AttributeFactory newAttributeFactory(Random random) {
  // Same three candidates and the same single nextInt(3) draw as before,
  // expressed as a lookup table instead of a switch.
  final AttributeFactory[] candidates = {
      TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY,
      Token.TOKEN_ATTRIBUTE_FACTORY,
      AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
  };
  return candidates[random.nextInt(candidates.length)];
}
 
Example 4
Source File: Test2BTerms.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public MyTokenStream(Random random, int tokensPerDoc) {
  // Wrap the default factory so MyAttributeFactory controls attribute creation.
  super(new MyAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY));
  this.tokensPerDoc = tokensPerDoc;
  addAttribute(TermToBytesRefAttribute.class);
  // Every generated term uses the same fixed byte length.
  bytes.length = TOKEN_LEN;
  this.random = random;
  // NOTE(review): nextSave appears to be a token countdown (500k-1M) until the
  // next saved state — confirm against the stream's incrementToken logic.
  nextSave = TestUtil.nextInt(random, 500000, 1000000);
}
 
Example 5
Source File: CJKBigramFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
@Before
public void up() {
    analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // ICU segmentation feeding a CJK bigram filter; the trailing
            // StopFilter uses an empty set, so it passes every token through.
            Tokenizer icu = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenStream bigrams = new CJKBigramFilter(icu);
            TokenStream filtered = new StopFilter(bigrams, CharArraySet.EMPTY_SET);
            return new TokenStreamComponents(icu, filtered);
        }
    };
}
 
Example 6
Source File: MyanmarSyllableTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
private static Analyzer createAnalyzer() {
    // Analyzer whose only component is an ICU tokenizer built with the
    // default attribute factory and a (false, false) configuration.
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer icu = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, false));
            return new TokenStreamComponents(icu);
        }
    };
}
 
Example 7
Source File: IcuTokenizerCJKTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
private static Analyzer create() {
    // Single-component analyzer: ICU tokenizer with a (true, true) config.
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            IcuTokenizer tokenizer = new IcuTokenizer(
                    AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(true, true));
            return new TokenStreamComponents(tokenizer);
        }
    };
}
 
Example 8
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
private static Analyzer createAnalyzer() {
    // ICU tokenizer followed by NFKC case-folding normalization
    // (the "nfkc_cf" instance of Normalizer2, COMPOSE mode).
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer icu = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenFilter normalized = new IcuNormalizerFilter(icu,
                    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            return new TokenStreamComponents(icu, normalized);
        }
    };
}
 
Example 9
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
public void testHugeDoc() throws Exception {
    // 4094 leading spaces before the real content: the tokenizer must skip
    // the whitespace and still find both trailing tokens.
    char[] pad = new char[4094];
    Arrays.fill(pad, ' ');
    String input = new String(pad) + "testing 1234";
    IcuTokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
            new DefaultIcuTokenizerConfig(false, true));
    tokenizer.setReader(new StringReader(input));
    assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
 
Example 10
Source File: OpenKoreanTextTokenizer.java    From elasticsearch-analysis-openkoreantext with Apache License 2.0 4 votes vote down vote up
// Constructs the tokenizer with Lucene's default attribute factory.
public OpenKoreanTextTokenizer() {
    super(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
}
 
Example 11
Source File: PreAnalyzedField.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public PreAnalyzedTokenizer(PreAnalyzedParser parser) {
  // Use the plain default factory (no packed attribute impls): this tokenizer
  // exists for (de)serialization, so we avoid the extra bloat of packing.
  super(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
  this.parser = parser;
}
 
Example 12
Source File: TestConcatenatingTokenStream.java    From lucene-solr with Apache License 2.0 3 votes vote down vote up
public void testInconsistentAttributeFactories() throws IOException {
    // Two tokenizers deliberately built from different attribute factories.
    final MockTokenizer withDefaultFactory =
        new MockTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true);
    final MockTokenizer withTokenFactory =
        new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true);
    // Concatenating streams with mismatched factories must be rejected.
    expectThrows(IllegalArgumentException.class,
        () -> new ConcatenatingTokenStream(withDefaultFactory, withTokenFactory));
  }
 
Example 13
Source File: LegacyNumericTokenStream.java    From lucene-solr with Apache License 2.0 2 votes vote down vote up
/**
 * Creates a token stream for numeric values using the default <code>precisionStep</code>
 * {@link org.apache.solr.legacy.LegacyNumericUtils#PRECISION_STEP_DEFAULT} (16). The stream is not yet initialized;
 * before using it, set a value with one of the various set<em>???</em>Value() methods.
 */
public LegacyNumericTokenStream() {
  // Delegate to the main constructor with the default attribute factory.
  this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, LegacyNumericUtils.PRECISION_STEP_DEFAULT);
}
 
Example 14
Source File: LegacyNumericTokenStream.java    From lucene-solr with Apache License 2.0 2 votes vote down vote up
/**
 * Creates a token stream for numeric values with the specified
 * <code>precisionStep</code>. The stream is not yet initialized,
 * before using set a value using the various set<em>???</em>Value() methods.
 */
public LegacyNumericTokenStream(final int precisionStep) {
  this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, precisionStep);
}