Java Code Examples for org.apache.lucene.util.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY

The following examples show how to use org.apache.lucene.util.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY. You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: QueryParserImpl.java    From AdSearch_Endpoints with Apache License 2.0 6 votes vote down vote up
@Override
  public List<String> parseQuery(String queryStr) {
    /**
     * Tokenizes {@code queryStr} with Lucene's StandardTokenizer and removes
     * English stop words. Returns the surviving terms in order of appearance.
     * Note: despite the original comment, no stemming is applied here.
     */
    List<String> tokens = new ArrayList<String>();
    AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    Tokenizer tokenizer = new StandardTokenizer(factory);
    tokenizer.setReader(new StringReader(queryStr));
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    // try-with-resources guarantees the stream (and wrapped tokenizer) is
    // closed even if incrementToken() throws — the original leaked on error.
    try (TokenStream tokenStream = new StopFilter(tokenizer, stopWords)) {
      // Filters share the tokenizer's attribute source, so the term attribute
      // can be obtained from the outermost stream.
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        tokens.add(charTermAttribute.toString());
      }
      tokenStream.end();
    } catch (IOException e) {
      // Tokenization is purely in-memory (StringReader), so this is not
      // expected; degrade gracefully and return what was collected so far.
      e.printStackTrace();
    }
    return tokens;
  }
 
Example 2
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
public void testHugeTerm2() throws Exception {
    // One unbroken run of 40960 'a' characters: the tokenizer must emit it
    // as ten 4096-character tokens instead of overflowing its buffer.
    char[] run = new char[40960];
    Arrays.fill(run, 'a');
    String input = new String(run);
    IcuTokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
            new DefaultIcuTokenizerConfig(false, true));
    tokenizer.setReader(new StringReader(input));
    char[] chunk = new char[4096];
    Arrays.fill(chunk, 'a');
    String expectedToken = new String(chunk);
    String[] expected = new String[10];
    Arrays.fill(expected, expectedToken);
    assertTokenStreamContents(tokenizer, expected);
}
 
Example 3
Source File: BaseTokenStreamTestCase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Returns a randomly chosen AttributeFactory implementation. */
public static AttributeFactory newAttributeFactory(Random random) {
  // Same three candidates and the same single nextInt(3) draw as before,
  // expressed as a lookup table instead of a switch.
  final AttributeFactory[] candidates = {
      TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY,
      Token.TOKEN_ATTRIBUTE_FACTORY,
      AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
  };
  return candidates[random.nextInt(candidates.length)];
}
 
Example 4
Source File: Test2BTerms.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public MyTokenStream(Random random, int tokensPerDoc) {
  // Wrap the default factory so MyAttributeFactory controls attribute creation.
  super(new MyAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY));
  this.tokensPerDoc = tokensPerDoc;
  addAttribute(TermToBytesRefAttribute.class);
  // Every generated term uses the same fixed byte length.
  bytes.length = TOKEN_LEN;
  this.random = random;
  // NOTE(review): nextSave appears to be a token countdown (500k-1M) until the
  // next saved state — confirm against the stream's incrementToken logic.
  nextSave = TestUtil.nextInt(random, 500000, 1000000);
}
 
Example 5
Source File: CJKBigramFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
@Before
public void up() {
    analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // ICU segmentation feeding a CJK bigram filter; the trailing
            // StopFilter uses an empty set, so it passes every token through.
            Tokenizer icu = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenStream bigrams = new CJKBigramFilter(icu);
            TokenStream filtered = new StopFilter(bigrams, CharArraySet.EMPTY_SET);
            return new TokenStreamComponents(icu, filtered);
        }
    };
}
 
Example 6
Source File: MyanmarSyllableTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
private static Analyzer createAnalyzer() {
    // Analyzer whose only component is an ICU tokenizer built with the
    // default attribute factory and a (false, false) configuration.
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer icu = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, false));
            return new TokenStreamComponents(icu);
        }
    };
}
 
Example 7
Source File: IcuTokenizerCJKTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
private static Analyzer create() {
    // Single-component analyzer: ICU tokenizer with a (true, true) config.
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            IcuTokenizer tokenizer = new IcuTokenizer(
                    AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(true, true));
            return new TokenStreamComponents(tokenizer);
        }
    };
}
 
Example 8
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
private static Analyzer createAnalyzer() {
    // ICU tokenizer followed by NFKC case-folding normalization
    // (the "nfkc_cf" instance of Normalizer2, COMPOSE mode).
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer icu = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenFilter normalized = new IcuNormalizerFilter(icu,
                    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            return new TokenStreamComponents(icu, normalized);
        }
    };
}
 
Example 9
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
public void testHugeDoc() throws Exception {
    // 4094 leading spaces before the real content: the tokenizer must skip
    // the whitespace and still find both trailing tokens.
    char[] pad = new char[4094];
    Arrays.fill(pad, ' ');
    String input = new String(pad) + "testing 1234";
    IcuTokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
            new DefaultIcuTokenizerConfig(false, true));
    tokenizer.setReader(new StringReader(input));
    assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
 
Example 10
Source File: OpenKoreanTextTokenizer.java    From elasticsearch-analysis-openkoreantext with Apache License 2.0 4 votes vote down vote up
// Constructs the tokenizer with Lucene's default attribute factory.
public OpenKoreanTextTokenizer() {
    super(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
}
 
Example 11
Source File: PreAnalyzedField.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public PreAnalyzedTokenizer(PreAnalyzedParser parser) {
  // Use the plain default factory (no packed attribute impls): this tokenizer
  // exists for (de)serialization, so we avoid the extra bloat of packing.
  super(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
  this.parser = parser;
}
 
Example 12
Source File: TestConcatenatingTokenStream.java    From lucene-solr with Apache License 2.0 3 votes vote down vote up
public void testInconsistentAttributeFactories() throws IOException {
    // Two tokenizers deliberately built from different attribute factories.
    final MockTokenizer withDefaultFactory =
        new MockTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true);
    final MockTokenizer withTokenFactory =
        new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true);
    // Concatenating streams with mismatched factories must be rejected.
    expectThrows(IllegalArgumentException.class,
        () -> new ConcatenatingTokenStream(withDefaultFactory, withTokenFactory));
  }
 
Example 13
Source File: LegacyNumericTokenStream.java    From lucene-solr with Apache License 2.0 2 votes vote down vote up
/**
 * Creates a token stream for numeric values using the default <code>precisionStep</code>
 * {@link org.apache.solr.legacy.LegacyNumericUtils#PRECISION_STEP_DEFAULT} (16). The stream is not yet initialized;
 * before using it, set a value with one of the various set<em>???</em>Value() methods.
 */
public LegacyNumericTokenStream() {
  // Delegate to the main constructor with the default attribute factory.
  this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, LegacyNumericUtils.PRECISION_STEP_DEFAULT);
}
 
Example 14
Source File: LegacyNumericTokenStream.java    From lucene-solr with Apache License 2.0 2 votes vote down vote up
/**
 * Creates a token stream for numeric values with the specified
 * <code>precisionStep</code>. The stream is not yet initialized,
 * before using set a value using the various set<em>???</em>Value() methods.
 */
public LegacyNumericTokenStream(final int precisionStep) {
  this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, precisionStep);
}