org.apache.lucene.analysis.tokenattributes.CharTermAttribute Java Examples

The following examples show how to use org.apache.lucene.analysis.tokenattributes.CharTermAttribute. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestSnowball.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Advances a {@link SnowballFilter} wrapped around a {@code TestTokenStream} by one
 * token and checks the term text, offsets, type, position increment, flags and payload
 * visible on the filter afterwards.
 */
public void testFilterTokens() throws Exception {
  final SnowballFilter stemmer = new SnowballFilter(new TestTokenStream(), "English");

  // Grab every attribute up front; the instances are shared with the wrapped stream.
  final CharTermAttribute term = stemmer.getAttribute(CharTermAttribute.class);
  final OffsetAttribute offsets = stemmer.getAttribute(OffsetAttribute.class);
  final TypeAttribute type = stemmer.getAttribute(TypeAttribute.class);
  final PayloadAttribute payload = stemmer.getAttribute(PayloadAttribute.class);
  final PositionIncrementAttribute posInc = stemmer.getAttribute(PositionIncrementAttribute.class);
  final FlagsAttribute flags = stemmer.getAttribute(FlagsAttribute.class);

  stemmer.incrementToken();

  assertEquals("accent", term.toString());
  assertEquals(2, offsets.startOffset());
  assertEquals(7, offsets.endOffset());
  assertEquals("wrd", type.type());
  assertEquals(3, posInc.getPositionIncrement());
  assertEquals(77, flags.getFlags());
  final BytesRef expectedPayload = new BytesRef(new byte[] {0, 1, 2, 3});
  assertEquals(expectedPayload, payload.getPayload());
}
 
Example #2
Source File: XmlInterpolationTest.java    From SolrTextTagger with Apache License 2.0 6 votes vote down vote up
/**
 * Runs {@code docText} through an {@link HTMLStripCharFilter} followed by a
 * {@link WhitespaceTokenizer} and returns the resulting terms in order.
 *
 * @param docText the text to analyze
 * @return the emitted terms
 * @throws RuntimeException wrapping any {@link IOException} raised while tokenizing
 */
private String[] analyzeReturnTokens(String docText) {
  final Reader stripped = new HTMLStripCharFilter(
      new StringReader(docText), Collections.singleton("unescaped"));
  final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  final List<String> tokens = new ArrayList<>();

  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    boolean more = tokenizer.incrementToken();
    while (more) {
      tokens.add(term.toString());
      more = tokenizer.incrementToken();
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(tokenizer);
  }

  final String[] asArray = new String[tokens.size()];
  return tokens.toArray(asArray);
}
 
Example #3
Source File: TestAnalyzers.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that LowerCaseFilter lowercases correctly when the term buffer still
 * holds a trailing surrogate left over from an earlier term while the current
 * term ends with the corresponding leading surrogate.
 */
public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
  // Exercises how the term buffer limit is applied with supplementary characters.
  final WhitespaceTokenizer source = new WhitespaceTokenizer();
  source.setReader(new StringReader("BogustermBogusterm\udc16"));
  final LowerCaseFilter lowerCaser = new LowerCaseFilter(source);
  assertTokenStreamContents(lowerCaser, new String[] {"bogustermbogusterm\udc16"});
  lowerCaser.reset();

  // Second pass: a shorter term, ending in a high surrogate, reuses the same buffer.
  final String upperInput = "BogustermBoguster\ud801";
  final String lowerExpected = "bogustermboguster\ud801";
  source.setReader(new StringReader(upperInput));
  assertTokenStreamContents(lowerCaser, new String[] {lowerExpected});
  assertTrue(lowerCaser.hasAttribute(CharTermAttribute.class));

  final char[] buffer = lowerCaser.getAttribute(CharTermAttribute.class).buffer();
  final int lastIndex = lowerExpected.length() - 1;
  assertEquals('\ud801', buffer[lastIndex]);
}
 
Example #4
Source File: AnalysisUtils.java    From stratio-cassandra with Apache License 2.0 6 votes vote down vote up
/**
 * Analyzes {@code value} with {@code analyzer} and returns the text of every emitted token.
 *
 * @param field    the field name handed to {@code analyzer.tokenStream}
 * @param value    the text to analyze
 * @param analyzer the analyzer that produces the token stream
 * @return the term texts, in the order the stream produced them
 * @throws RuntimeException wrapping any {@link IOException} raised while tokenizing
 */
public static List<String> analyzeAsTokens(String field, String value, Analyzer analyzer) {
    List<String> result = new ArrayList<>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, new StringReader(value));
        // Resolve the attribute once; the same instance is refilled for every token.
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(term.toString());
        }
        // The consumer workflow requires end() after the last token and before close().
        stream.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return result;
}
 
Example #5
Source File: HanLpQueryAnalyzerTestCase.java    From jstarcraft-nlp with Apache License 2.0 6 votes vote down vote up
/**
 * Tokenizes a sample sentence with a tokenizer built by {@code HanLpTokenizerFactory}
 * (Traditional Chinese mode and normalization enabled) and prints, for every token,
 * its offsets, position increment, term text and type.
 */
@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    String text = "會辦台星保證最低價的原因?";

    // try-with-resources: the tokenizer is closed even when tokenization fails.
    try (Tokenizer tokenizer = factory.create()) {
        tokenizer.setReader(new StringReader(text));
        tokenizer.reset();

        // Attribute instances are reused for every token, so look them up once.
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // distance (position increment)
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);

        while (tokenizer.incrementToken()) {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
        // Finish the consumer workflow before the implicit close().
        tokenizer.end();
    }
}
 
Example #6
Source File: HanLPAnalyzerTest.java    From hanlp-lucene-plugin with Apache License 2.0 6 votes vote down vote up
/**
 * Prints every character of a sample sentence next to its index, then analyzes the
 * sentence with {@code HanLPAnalyzer} and prints each token's offsets, position
 * increment, term text and type.
 */
public void testCreateComponents() throws Exception
{
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i)
    {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    // Analyzer and stream are both Closeable; close them even when analysis fails.
    try (Analyzer analyzer = new HanLPAnalyzer();
         TokenStream tokenStream = analyzer.tokenStream("field", text))
    {
        tokenStream.reset();

        // Attribute instances are reused for every token, so look them up once.
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // distance (position increment)
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);

        while (tokenStream.incrementToken())
        {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
        // Finish the consumer workflow before the implicit close().
        tokenStream.end();
    }
}
 
Example #7
Source File: LuceneAnalyzerTest.java    From jstarcraft-nlp with Apache License 2.0 6 votes vote down vote up
/**
 * Analyzes a sample title with {@code MynlpAnalyzer} and prints one line per token:
 * term text, start offset and position increment, separated by tabs.
 */
@Test
public void test2() throws Exception {
    MynlpAnalyzer analyzer = new MynlpAnalyzer(Lexers.core().filterReader(true, true));

    // Single-threaded accumulation, so the unsynchronized StringBuilder is sufficient.
    StringBuilder sb = new StringBuilder();

    try (TokenStream tokenStream = analyzer.tokenStream("title", "俞正声主持召开全国政协第五十三次主席会议")) {
        tokenStream.reset();

        // Attribute instances are reused for every token, so look them up once.
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = tokenStream.getAttribute(PositionIncrementAttribute.class);

        while (tokenStream.incrementToken()) {
            sb.append(termAtt);
            sb.append("\t");
            sb.append(offsetAtt.startOffset());
            sb.append("\t");
            sb.append(posIncAtt.getPositionIncrement());
            sb.append("\n");
        }
        // Finish the consumer workflow before the implicit close().
        tokenStream.end();
    } finally {
        // Release the analyzer even when tokenization throws.
        analyzer.close();
    }
    System.out.println(sb.toString());
}
 
Example #8
Source File: Utils.java    From fuzzy-matcher with Apache License 2.0 6 votes vote down vote up
/**
 * Splits {@code value} into character n-grams of exactly {@code size} characters.
 *
 * <p>When {@code value} is not longer than {@code size}, the stream contains
 * {@code value} itself as its only element.
 *
 * @param value the text to split
 * @param size  the n-gram length, used as both the minimum and maximum gram size
 * @return a stream of the produced n-grams
 * @throws MatchException if the tokenizer fails with an {@link IOException}
 */
public static Stream<String> getNGrams(String value, int size) {
    Stream.Builder<String> stringStream = Stream.builder();
    if (value.length() <= size) {
        stringStream.add(value);
    } else {
        // try-with-resources: the tokenizer is closed even when tokenization fails part-way.
        try (NGramTokenizer nGramTokenizer = new NGramTokenizer(size, size)) {
            CharTermAttribute charTermAttribute = nGramTokenizer.addAttribute(CharTermAttribute.class);
            nGramTokenizer.setReader(new StringReader(value));
            nGramTokenizer.reset();
            while (nGramTokenizer.incrementToken()) {
                stringStream.add(charTermAttribute.toString());
            }
            nGramTokenizer.end();
        } catch (IOException io) {
            throw new MatchException("Failure in creating tokens : ", io);
        }
    }
    return stringStream.build();
}
 
Example #9
Source File: VietnameseAnalysisTest.java    From elasticsearch-analysis-vietnamese with Apache License 2.0 6 votes vote down vote up
/**
 * Analyzes a Vietnamese sentence with the {@code vi_analyzer} index analyzer and checks
 * both the text and the start offset of every emitted token, and that no further
 * token follows.
 */
public void testTokenOffset() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer");
    assertNotNull(analyzer);

    String[] expected = new String[]{"phụ tùng", "xe", "mazda", "bán", "tải", "7", "chỗ", "ống", "dẫn", "gió", "tới", "két", "làm", "mát", "khí", "nạp", "cao su", "lưu hóa", "mới", "100%", "phục vụ", "bhbd", "ms", "1", "d0", "013246", "a"};
    int[] expectedOffset = new int[]{0, 9, 12, 18, 22, 31, 33, 38, 42, 46, 50, 54, 58, 62, 66, 70, 75, 82, 91, 95, 101, 109, 115, 118, 119, 121, 127};

    // try-with-resources: the stream is closed even when an assertion fails.
    try (TokenStream ts = analyzer.analyzer().tokenStream("test", "Phụ tùng xe Mazda bán tải dưới 7 chỗ: ống dẫn gió tới két làm mát khí nạp- cao su lưu hóa, mới 100%, phục vụ BHBD. Ms:1D0013246A")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
        ts.reset();

        for (int i = 0; i < expected.length; i++) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term.toString(), equalTo(expected[i]));
            // A matcher (rather than assertTrue on ==) reports both values on failure.
            assertThat(offset.startOffset(), equalTo(expectedOffset[i]));
        }
        assertThat(ts.incrementToken(), equalTo(false));
        ts.end();
    }
}
 
Example #10
Source File: LuceneUtils.java    From SciGraph with Apache License 2.0 6 votes vote down vote up
/**
 * Tokenizes {@code term} with {@code analyzer} and returns the token texts.
 *
 * <p>An {@link IOException} during analysis is printed to standard error and the
 * tokens collected so far are returned.
 *
 * @param analyzer the analyzer that produces the token stream
 * @param term     the text to tokenize
 * @return the token texts, in emission order
 */
public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
  List<String> ret = Lists.newArrayList();

  // try-with-resources: the stream is closed even when tokenization fails part-way.
  try (TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()))) {
    CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      ret.add(token.toString());
    }
    // The consumer workflow requires end() after the last token and before close().
    stream.end();
  } catch (IOException e) {
    // Best effort by design: report the failure and return what was collected.
    e.printStackTrace();
  }
  return ret;
}
 
Example #11
Source File: ShingleAnalyzerWrapperTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a boolean query with one SHOULD term clause per token that the analyzer
 * emits for "test sentence", runs it, and compares the hit order against the
 * expected ranks.
 */
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
  final BooleanQuery.Builder builder = new BooleanQuery.Builder();

  try (TokenStream tokens = analyzer.tokenStream("content", "test sentence")) {
    final CharTermAttribute term = tokens.addAttribute(CharTermAttribute.class);

    tokens.reset();
    for (boolean more = tokens.incrementToken(); more; more = tokens.incrementToken()) {
      final Term contentTerm = new Term("content", term.toString());
      builder.add(new TermQuery(contentTerm), BooleanClause.Occur.SHOULD);
    }
    tokens.end();
  }

  final int[] expectedRanks = new int[] { 1, 2, 0 };
  final ScoreDoc[] found = searcher.search(builder.build(), 1000).scoreDocs;
  compareRanks(found, expectedRanks);
}
 
Example #12
Source File: AnalyzerFactoryTestCase.java    From airsonic with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Analyzes {@code str} for {@code field} with the factory's analyzer and returns the
 * term texts, with a leading {@code term=} prefix removed from each.
 *
 * <p>An {@link IOException} is logged and the terms collected so far are returned.
 *
 * @param field the field name handed to the analyzer
 * @param str   the text to analyze
 * @return the term texts, in emission order
 */
private List<String> toTermString(String field, String str) {
    List<String> result = new ArrayList<>();
    // try-with-resources: the stream is closed even when tokenization fails part-way.
    try (TokenStream stream = analyzerFactory.getAnalyzer().tokenStream(field,
            new StringReader(str))) {
        // Resolve the attribute once; the same instance is refilled for every token.
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(term.toString()
                    .replaceAll("^term\\=", ""));
        }
        // The consumer workflow requires end() after the last token and before close().
        stream.end();
    } catch (IOException e) {
        LoggerFactory.getLogger(AnalyzerFactoryTestCase.class)
                .error("Error during Token processing.", e);
    }
    return result;
}
 
Example #13
Source File: SpellingQueryConverter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Analyzes {@code text} and appends one {@link Token} per emitted token to {@code result}.
 *
 * <p>Each token copies the term text, type, payload and position increment from the
 * stream; its offsets are shifted by {@code offset} and its flags are set to
 * {@code flagsAttValue}.
 *
 * @param result        the collection that receives the tokens
 * @param text          the text to analyze
 * @param offset        the amount added to every start and end offset
 * @param flagsAttValue the flags value stored on every token
 * @throws IOException if the token stream fails
 */
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  // try-with-resources: the stream is closed even when an exception escapes mid-loop.
  try (TokenStream stream = analyzer.tokenStream("", text)) {
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      Token token = new Token();
      token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      token.setOffset(offset + offsetAtt.startOffset(),
                      offset + offsetAtt.endOffset());
      token.setFlags(flagsAttValue); //overwriting any flags already set...
      token.setType(typeAtt.type());
      token.setPayload(payloadAtt.getPayload());
      token.setPositionIncrement(posIncAtt.getPositionIncrement());
      result.add(token);
    }
    stream.end();
  }
}
 
Example #14
Source File: URLTokenizerTest.java    From elasticsearch-analysis-url with Apache License 2.0 6 votes vote down vote up
/**
 * Tokenizes {@code TEST_HTTPS_URL} with a default {@code URLTokenizer} and checks that
 * several expected tokens are among the output, then checks the offsets of the
 * "https" token for a second tokenizer.
 */
@Test
public void testAll() throws IOException {
    URLTokenizer urlTokenizer = new URLTokenizer();
    urlTokenizer.setReader(new StringReader(TEST_HTTPS_URL));
    final CharTermAttribute term = urlTokenizer.getAttribute(CharTermAttribute.class);
    urlTokenizer.reset();
    urlTokenizer.clearAttributes();

    final List<String> emitted = new ArrayList<>();
    for (boolean more = urlTokenizer.incrementToken(); more; more = urlTokenizer.incrementToken()) {
        emitted.add(term.toString());
    }

    final String[] mustContain = {
        "https",
        "foo.bar.com",
        "www.foo.bar.com:9200",
        "https://www.foo.bar.com",
    };
    for (String expected : mustContain) {
        assertThat(emitted, hasItem(equalTo(expected)));
    }

    urlTokenizer = createTokenizer("https://foo.com", null);
    assertThat(urlTokenizer, hasTokenAtOffset("https", 0, 5));
}
 
Example #15
Source File: DelimitedTermFrequencyTokenFilterTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds whitespace-separated {@code term|frequency} text through
 * {@code DelimitedTermFrequencyTokenFilter} and checks every term together with its
 * expected frequency; terms without a {@code |n} suffix are expected to report 1.
 */
public void testTermFrequency() throws Exception {
  final String input = "The quick|40 red|4 fox|06 jumped|1 over the lazy|2 brown|123 dogs|1024";
  // Parallel tables: expectedFreqs[i] belongs to expectedTerms[i].
  final String[] expectedTerms = {
      "The", "quick", "red", "fox", "jumped", "over", "the", "lazy", "brown", "dogs"};
  final int[] expectedFreqs = {1, 40, 4, 6, 1, 1, 1, 2, 123, 1024};

  final DelimitedTermFrequencyTokenFilter tfFilter =
      new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(input));
  final CharTermAttribute term = tfFilter.getAttribute(CharTermAttribute.class);
  final TermFrequencyAttribute freq = tfFilter.getAttribute(TermFrequencyAttribute.class);

  tfFilter.reset();
  for (int i = 0; i < expectedTerms.length; i++) {
    assertTermEquals(expectedTerms[i], tfFilter, term, freq, expectedFreqs[i]);
  }
  assertFalse(tfFilter.incrementToken());
  tfFilter.end();
  tfFilter.close();
}
 
Example #16
Source File: StandardPreProcessorIterator.java    From Indra with MIT License 6 votes vote down vote up
/**
 * Prepares the token stream for {@code text}: optionally lowercases it, applies the
 * configured transformers, hands the result to the tokenizer and resets the stream.
 *
 * @param text the raw input
 * @throws IndraRuntimeException if resetting the token stream fails; the message
 *         quotes at most the first 30 characters of {@code text}
 */
private void initialize(String text) {
    String prepared = text;
    if (metadata.applyLowercase) {
        prepared = text.toLowerCase();
    }

    if (!transformers.isEmpty()) {
        // Transformers edit the buffer in place, one after another.
        final StringBuilder buffer = new StringBuilder(prepared);
        transformers.forEach(transformer -> transformer.transform(buffer));
        prepared = buffer.toString();
    }

    tokenizer.setReader(new StringReader(prepared));
    this.cattr = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
    } catch (IOException e) {
        final int previewLength = Math.min(30, text.length());
        final String initialPart = text.substring(0, previewLength);
        throw new IndraRuntimeException(String.format("Error parsing the input starting with '%s'...", initialPart), e);
    }
}
 
Example #17
Source File: TestJapaneseTokenizer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Randomized check that supplementary characters are never split: every term
 * produced for a random unicode string must be a valid UTF-16 sequence.
 */
public void testSurrogates2() throws IOException {
  final int rounds = atLeast(500);
  int round = 0;
  while (round < rounds) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + round);
    }
    final String randomText = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream tokens = analyzer.tokenStream("foo", randomText)) {
      final CharTermAttribute term = tokens.addAttribute(CharTermAttribute.class);
      tokens.reset();
      for (boolean more = tokens.incrementToken(); more; more = tokens.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(term));
      }
      tokens.end();
    }
    round++;
  }
}
 
Example #18
Source File: ElasticsearchSearchQueryBase.java    From vertexium with Apache License 2.0 6 votes vote down vote up
/**
 * Splits {@code value} into terms using the configured analyzer, trimming each term
 * and dropping the ones that are empty after trimming.
 *
 * @param value the text to tokenize
 * @return the non-empty, trimmed terms in emission order
 * @throws VertexiumException if the token stream fails with an {@link IOException}
 */
private String[] splitStringIntoTerms(String value) {
    List<String> results = new ArrayList<>();
    try (TokenStream tokens = analyzer.tokenStream("", value)) {
        CharTermAttribute term = tokens.getAttribute(CharTermAttribute.class);
        tokens.reset();
        while (tokens.incrementToken()) {
            String t = term.toString().trim();
            if (t.length() > 0) {
                results.add(t);
            }
        }
        // The consumer workflow requires end() after the last token and before close().
        tokens.end();
    } catch (IOException e) {
        throw new VertexiumException("Could not tokenize string: " + value, e);
    }
    return results.toArray(new String[results.size()]);
}
 
Example #19
Source File: DelimitedPayloadTokenFilterTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds whitespace-separated {@code term|int} text through
 * {@code DelimitedPayloadTokenFilter} with an {@code IntegerEncoder} and checks every
 * term together with its expected payload; terms without a {@code |n} suffix are
 * expected to carry a {@code null} payload.
 */
public void testIntEncoding() throws Exception {
  final String input = "The quick|1 red|2 fox|3 jumped over the lazy|5 brown|99 dogs|83";
  // Parallel tables: expectedPayloads[i] belongs to expectedTerms[i].
  final String[] expectedTerms = {
      "The", "quick", "red", "fox", "jumped", "over", "the", "lazy", "brown", "dogs"};
  final byte[][] expectedPayloads = {
      null,
      PayloadHelper.encodeInt(1),
      PayloadHelper.encodeInt(2),
      PayloadHelper.encodeInt(3),
      null,
      null,
      null,
      PayloadHelper.encodeInt(5),
      PayloadHelper.encodeInt(99),
      PayloadHelper.encodeInt(83),
  };

  final DelimitedPayloadTokenFilter payloadFilter =
      new DelimitedPayloadTokenFilter(whitespaceMockTokenizer(input), '|', new IntegerEncoder());
  final CharTermAttribute term = payloadFilter.getAttribute(CharTermAttribute.class);
  final PayloadAttribute payload = payloadFilter.getAttribute(PayloadAttribute.class);

  payloadFilter.reset();
  for (int i = 0; i < expectedTerms.length; i++) {
    assertTermEquals(expectedTerms[i], payloadFilter, term, payload, expectedPayloads[i]);
  }
  assertFalse(payloadFilter.incrementToken());
  payloadFilter.end();
  payloadFilter.close();
}
 
Example #20
Source File: HanLPAnalyzerTest.java    From hanlp-lucene-plugin with Apache License 2.0 6 votes vote down vote up
/**
 * Tokenizes a sample sentence with a tokenizer built by {@code HanLPTokenizerFactory}
 * (Traditional Chinese mode and normalization enabled) and prints, for every token,
 * its offsets, position increment, term text and type.
 */
public void testIssue() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    String text = "會辦台星保證最低價的原因?";

    // try-with-resources: the tokenizer is closed even when tokenization fails.
    try (Tokenizer tokenizer = factory.create())
    {
        tokenizer.setReader(new StringReader(text));
        tokenizer.reset();

        // Attribute instances are reused for every token, so look them up once.
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // distance (position increment)
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);

        while (tokenizer.incrementToken())
        {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
        // Finish the consumer workflow before the implicit close().
        tokenizer.end();
    }
}
 
Example #21
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Drains {@code in}, joins its terms with single spaces, closes the stream and
 * returns the joined text.
 *
 * <p>TODO: rewrite tests not to use string comparison.
 */
private static String tsToString(TokenStream in) throws IOException {
  final CharTermAttribute term = in.addAttribute(CharTermAttribute.class);
  final StringBuilder joined = new StringBuilder();

  // Extra safety: wipe the attributes and plant a bogus term before the first
  // token, to enforce that no state is preserved.
  in.clearAttributes();
  term.setEmpty().append("bogusTerm");
  in.reset();

  for (boolean more = in.incrementToken(); more; more = in.incrementToken()) {
    if (joined.length() != 0) {
      joined.append(' ');
    }
    final String text = term.toString();
    joined.append(text);

    // Plant the bogus term again before asking for the next token.
    in.clearAttributes();
    term.setEmpty().append("bogusTerm");
  }

  in.close();
  return joined.toString();
}
 
Example #22
Source File: StringUtils.java    From uncc2014watsonsim with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Splits the given string into tokens using the shared analyzer.
 *
 * <p>An {@link IOException} during analysis is printed to standard error and the
 * tokens collected so far are returned.
 *
 * @param text the text to tokenize
 * @return the token texts, in emission order
 */
public static List<String> tokenize(String text) {
	List<String> tokens = new ArrayList<>();
	
	try (TokenStream tokenStream = analyzer.tokenStream("text", text)) {
		CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class);
		
		// On the fence whether it is better to error here or not. Suggestions?
		tokenStream.reset();
	
		while (tokenStream.incrementToken()) {
			tokens.add(token.toString());
		}
		// The consumer workflow requires end() after the last token and before close().
		tokenStream.end();
	} catch (IOException e) {
		// If we can't trim it, so what?
		e.printStackTrace();
	}
	return tokens;
}
 
Example #23
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Feeds "some new york" through an {@code AutoPhrasingTokenFilter} whose phrase set
 * contains "new york city" and checks that the first three tokens come out as the
 * individual words "some", "new" and "york".
 */
public void testIncompletePhrase() throws Exception {
    final CharArraySet phrases = new CharArraySet(Arrays.asList(
            "big apple", "new york city", "property tax", "three word phrase"), false);

    final WhitespaceTokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("some new york"));

    final AutoPhrasingTokenFilter phraser = new AutoPhrasingTokenFilter(source, phrases, false);
    phraser.setReplaceWhitespaceWith('_');
    final CharTermAttribute term = phraser.addAttribute(CharTermAttribute.class);
    phraser.reset();

    final String[] expectedTokens = {"some", "new", "york"};
    for (String expected : expectedTokens) {
        assertTrue(phraser.incrementToken());
        assertEquals(expected, term.toString());
    }
}
 
Example #24
Source File: IndexTimeSynonymTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds components around a tokenizer that replays the enclosing {@code tokens}
 * array, copying one entry into the shared attribute per {@code incrementToken()}.
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer replayTokenizer = new Tokenizer(Token.TOKEN_ATTRIBUTE_FACTORY) {
    final AttributeImpl target = (AttributeImpl) addAttribute(CharTermAttribute.class);
    int cursor = 0;

    @Override
    public boolean incrementToken() {
      if (cursor < tokens.length) {
        clearAttributes();
        // Advance first, then copy, so the cursor moves even if copyTo throws.
        final int current = cursor++;
        tokens[current].copyTo(target);
        return true;
      }
      return false;
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      cursor = 0;
    }
  };
  return new TokenStreamComponents(replayTokenizer);
}
 
Example #25
Source File: DexterAnalyzer.java    From dexter with Apache License 2.0 6 votes vote down vote up
/**
 * Demo entry point: analyzes a sample string with {@code DexterAnalyzer} and prints
 * every term followed by its start and end offsets.
 *
 * @param args unused
 * @throws IOException if the token stream fails
 */
public static void main(String[] args) throws IOException {
	String str = "<body>perchééééééééé";
	// Analyzer and stream are both Closeable; close them even when analysis fails.
	try (Analyzer anal = new DexterAnalyzer();
			TokenStream ts = anal.tokenStream("content", new StringReader(str))) {
		OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
		CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
		ts.reset();
		while (ts.incrementToken()) {
			// toString() already yields exactly termAtt.length() characters,
			// so no substring is needed.
			System.out.println(termAtt.toString());
			System.out
					.println("token start offset: " + offsetAtt.startOffset());
			System.out.println("  token end offset: " + offsetAtt.endOffset());
		}
		// Finish the consumer workflow before the implicit close().
		ts.end();
	}
}
 
Example #26
Source File: QueryFactory.java    From airsonic-advanced with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Query generation expression extracted from
 * {@link org.airsonic.player.service.SearchService#searchByName( String, String, int, int, List, Class)}.
 *
 * <p>Every token of {@code name} except the last becomes a {@code SHOULD} term clause;
 * the last token becomes a {@code SHOULD} wildcard clause with {@code ASTERISK} appended.
 *
 * @param fieldName {@link FieldNames}
 * @param name the text to analyze into query tokens
 * @return Query
 * @throws IOException When the token stream fails
 */
public Query searchByName(String fieldName, String name) throws IOException {

    BooleanQuery.Builder mainQuery = new BooleanQuery.Builder();

    Analyzer analyzer = analyzerFactory.getQueryAnalyzer();

    try (TokenStream stream = analyzer.tokenStream(fieldName, name)) {
        // Resolve the attribute once; the same instance is refilled for every token.
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();

        /*
         *  XXX 3.x -> 8.x :
         * In order to support wildcards,
         * QueryParser has been replaced by the following process.
         */

        // Remember whether the stream produced anything, so incrementToken() is never
        // called again after it has already reported exhaustion.
        boolean hasToken = stream.incrementToken();
        String token = termAtt.toString();

        /* Wildcards apply only to tail tokens **/
        while (hasToken && stream.incrementToken()) {
            mainQuery.add(new TermQuery(new Term(fieldName, token)), Occur.SHOULD);
            token = termAtt.toString();
        }

        // NOTE(review): when the analyzer emits no token at all, `token` is whatever the
        // term attribute holds after reset() (presumably empty), so the clause becomes a
        // bare ASTERISK wildcard. Kept as-is for compatibility; confirm this is intended.
        WildcardQuery wildcardQuery = new WildcardQuery(new Term(fieldName, token.concat(ASTERISK)));
        mainQuery.add(wildcardQuery, Occur.SHOULD);

        // The consumer workflow requires end() after the last token and before close().
        stream.end();
    }

    return mainQuery.build();
}
 
Example #27
Source File: MMSegTokenizer.java    From mmseg4j-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tokenizer around the given segmenter and registers the term, offset
 * and type attributes it fills in.
 *
 * @param seg the segmenter stored for later use
 */
public MMSegTokenizer(Seg seg) {
	this.seg = seg;

	// Registration order is kept: term, offset, type.
	this.termAtt = addAttribute(CharTermAttribute.class);
	this.offsetAtt = addAttribute(OffsetAttribute.class);
	this.typeAtt = addAttribute(TypeAttribute.class);
}
 
Example #28
Source File: TestToken.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Demo entry point: analyzes a sample sentence with an {@code AnsjAnalyzer} of type
 * {@code base_ansj} and prints, per token, the term text, start offset, position
 * increment and type, separated by tabs.
 *
 * @param args unused
 */
public static void main(String[] args) {

//        SynonymsLibrary.put(SynonymsLibrary.DEFAULT, "../../library/synonyms.dic");
//
//        DicLibrary.insert(DicLibrary.DEFAULT, "清华", "n", 2000);
//        DicLibrary.insert(DicLibrary.DEFAULT, "大学", "n", 2000);

    Map<String, String> map = new HashMap<String, String>();

    map.put("type", "base_ansj");
//        map.put(SynonymsLibrary.DEFAULT, SynonymsLibrary.DEFAULT);

    String content = "我爱北京天安门天安门上太阳升我美丽的清华大学";

    // Analyzer and stream are both Closeable; close them even when analysis fails.
    try (Analyzer ca = new AnsjAnalyzer(map);
         TokenStream tokenStream = ca.tokenStream(content, new StringReader(content))) {

        // reset() is mandatory before the first incrementToken() call.
        tokenStream.reset();

        // Attribute instances are reused for every token, so look them up once.
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = tokenStream.getAttribute(PositionIncrementAttribute.class);
        TypeAttribute typeAtt = tokenStream.getAttribute(TypeAttribute.class);

        while (tokenStream.incrementToken()) {

            System.out.print(termAtt);
            System.out.print("\t");
            System.out.print(offsetAtt.startOffset());
            System.out.print("\t");
            System.out.print(posIncAtt.getPositionIncrement());
            System.out.print("\t");
            System.out.println(typeAtt.type());

        }
        // Finish the consumer workflow before the implicit close().
        tokenStream.end();
    } catch (IOException e) {
        // Demo code: report the failure and fall through to exit.
        e.printStackTrace();
    }
}
 
Example #29
Source File: Test2BTerms.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Supplies {@code MyTermAttributeImpl} for {@code TermToBytesRefAttribute}, rejects
 * {@code CharTermAttribute} and anything assignable to it, and hands every other
 * attribute class to the delegate factory.
 *
 * @throws IllegalArgumentException if {@code attClass} is assignable to {@code CharTermAttribute}
 */
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
  if (attClass == TermToBytesRefAttribute.class) {
    return new MyTermAttributeImpl();
  }

  final boolean isCharTerm = CharTermAttribute.class.isAssignableFrom(attClass);
  if (isCharTerm) {
    throw new IllegalArgumentException("no");
  }

  return delegate.createAttributeInstance(attClass);
}
 
Example #30
Source File: TestJapaneseNumberFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Analyzes everything read from {@code reader} and writes each term to
 * {@code writer}, one per line.
 *
 * <p>Both {@code reader} and {@code writer} are closed before this method returns,
 * whether or not analysis succeeds.
 *
 * @param analyzer the analyzer that produces the token stream
 * @param reader   the source text; closed by this method
 * @param writer   the destination for the terms; closed by this method
 * @throws IOException if reading, analyzing or writing fails
 */
public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
  // Re-declaring the parameters as resources lets try-with-resources close them,
  // together with the stream, on every exit path.
  try (Reader in = reader;
       Writer out = writer;
       TokenStream stream = analyzer.tokenStream("dummy", in)) {
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
      out.write(termAttr.toString());
      out.write("\n");
    }

    // The consumer workflow requires end() after the last token and before close().
    stream.end();
  }
}