Java Code Examples for org.apache.lucene.analysis.tokenattributes.CharTermAttribute

The following examples show how to use org.apache.lucene.analysis.tokenattributes.CharTermAttribute, the token-stream attribute that exposes the text of the current token as a CharSequence. The examples are extracted from open source projects; the originating project and source file are noted above each example where known.
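As a primer, nearly all of the examples below share one consumption pattern: obtain the attribute from the stream, then drive the stream through its reset / incrementToken / end / close contract. Here is a minimal sketch of that pattern; the StandardAnalyzer, field name, and input text are placeholder assumptions, and imports (from org.apache.lucene.analysis and java.util) are omitted to match the style of the examples below.

public static List<String> collectTerms() throws IOException {
    // Minimal sketch, assuming a StandardAnalyzer and a placeholder field name.
    Analyzer myAnalyzer = new StandardAnalyzer();
    List<String> terms = new ArrayList<>();
    try (TokenStream stream = myAnalyzer.tokenStream("body", "Some input text")) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                        // required before the first incrementToken()
        while (stream.incrementToken()) {
            terms.add(termAtt.toString());     // snapshot the current token text
        }
        stream.end();                          // records the final offset state
    }                                          // try-with-resources closes the stream
    return terms;
}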
Example 1
Source Project: lucene-solr   Source File: TestSnowball.java    License: Apache License 2.0
public void testFilterTokens() throws Exception {
  SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
  TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
  FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
  
  filter.incrementToken();

  assertEquals("accent", termAtt.toString());
  assertEquals(2, offsetAtt.startOffset());
  assertEquals(7, offsetAtt.endOffset());
  assertEquals("wrd", typeAtt.type());
  assertEquals(3, posIncAtt.getPositionIncrement());
  assertEquals(77, flagsAtt.getFlags());
  assertEquals(new BytesRef(new byte[]{0,1,2,3}), payloadAtt.getPayload());
}
 
Example 2
Source Project: SolrTextTagger   Source File: XmlInterpolationTest.java    License: Apache License 2.0
private String[] analyzeReturnTokens(String docText) {
  List<String> result = new ArrayList<>();

  Reader filter = new HTMLStripCharFilter(new StringReader(docText),
          Collections.singleton("unescaped"));
  WhitespaceTokenizer ts = new WhitespaceTokenizer();
  final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
  try {
    ts.setReader(filter);
    ts.reset();
    while (ts.incrementToken()) {
      result.add(termAttribute.toString());
    }
    ts.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ts);
  }
  return result.toArray(new String[result.size()]);
}
 
Example 3
Source Project: lucene-solr   Source File: TestAnalyzers.java    License: Apache License 2.0
/**
 * Test that LowercaseFilter handles the lowercasing correctly if the term
 * buffer has a trailing surrogate character leftover and the current term in
 * the buffer ends with a corresponding leading surrogate.
 */
public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
  // test if the limit of the termbuffer is correctly used with supplementary
  // chars
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("BogustermBogusterm\udc16"));
  LowerCaseFilter filter = new LowerCaseFilter(tokenizer);
  assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
  filter.reset();
  String highSurEndingUpper = "BogustermBoguster\ud801";
  String highSurEndingLower = "bogustermboguster\ud801";
  tokenizer.setReader(new StringReader(highSurEndingUpper));
  assertTokenStreamContents(filter, new String[] {highSurEndingLower});
  assertTrue(filter.hasAttribute(CharTermAttribute.class));
  char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
  int length = highSurEndingLower.length();
  assertEquals('\ud801', termBuffer[length - 1]);
}
 
Example 4
Source Project: stratio-cassandra   Source File: AnalysisUtils.java    License: Apache License 2.0
public static List<String> analyzeAsTokens(String field, String value, Analyzer analyzer) {
    List<String> result = new ArrayList<>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, new StringReader(value));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return result;
}
 
Example 5
Source Project: jstarcraft-nlp   Source File: HanLpQueryAnalyzerTestCase.java    License: Apache License 2.0
@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 6
Source Project: hanlp-lucene-plugin   Source File: HanLPAnalyzerTest.java    License: Apache License 2.0
public void testCreateComponents() throws Exception
{
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i)
    {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken())
    {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 7
Source Project: jstarcraft-nlp   Source File: LuceneAnalyzerTest.java    License: Apache License 2.0
@Test
public void test2() throws Exception {
    MynlpAnalyzer analyzer = new MynlpAnalyzer(Lexers.core().filterReader(true, true));

    TokenStream tokenStream = analyzer.tokenStream("title", "俞正声主持召开全国政协第五十三次主席会议");
    tokenStream.reset();

    StringBuffer sb = new StringBuffer();

    while (tokenStream.incrementToken()) {
        sb.append(tokenStream.getAttribute(CharTermAttribute.class));
        sb.append("\t");
        sb.append(tokenStream.getAttribute(OffsetAttribute.class).startOffset());
        sb.append("\t");
        sb.append(tokenStream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
        sb.append("\n");
    }
    System.out.println(sb.toString());
    analyzer.close();
}
 
Example 8
Source Project: fuzzy-matcher   Source File: Utils.java    License: Apache License 2.0
public static Stream<String> getNGrams(String value, int size) {
    Stream.Builder<String> stringStream = Stream.builder();
    if (value.length() <= size) {
        stringStream.add(value);
    } else {
        NGramTokenizer nGramTokenizer = new NGramTokenizer(size, size);
        CharTermAttribute charTermAttribute = nGramTokenizer.addAttribute(CharTermAttribute.class);
        nGramTokenizer.setReader(new StringReader(value));
        try {
            nGramTokenizer.reset();
            while (nGramTokenizer.incrementToken()) {
                stringStream.add(charTermAttribute.toString());
            }
            nGramTokenizer.end();
            nGramTokenizer.close();
        } catch (IOException io) {
            throw new MatchException("Failure in creating tokens : ", io);
        }
    }
    return stringStream.build();
}
 
Example 9
public void testTokenOffset() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer");
    assertNotNull(analyzer);

    TokenStream ts = analyzer.analyzer().tokenStream("test", "Phụ tùng xe Mazda bán tải dưới 7 chỗ: ống dẫn gió tới két làm mát khí nạp- cao su lưu hóa, mới 100%, phục vụ BHBD. Ms:1D0013246A");
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
    ts.reset();
    String[] expected = new String[]{"phụ tùng", "xe", "mazda", "bán", "tải", "7", "chỗ", "ống", "dẫn", "gió", "tới", "két", "làm", "mát", "khí", "nạp", "cao su", "lưu hóa", "mới", "100%", "phục vụ", "bhbd", "ms", "1", "d0", "013246", "a"};
    int[] expectedOffset = new int[]{0, 9, 12, 18, 22, 31, 33, 38, 42, 46, 50, 54, 58, 62, 66, 70, 75, 82, 91, 95, 101, 109, 115, 118, 119, 121, 127};

    for (int i = 0; i < expected.length; i++) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term.toString(), equalTo(expected[i]));
        assertTrue(offset.startOffset() == expectedOffset[i]);
    }
    assertThat(ts.incrementToken(), equalTo(false));
}
 
Example 10
Source Project: SciGraph   Source File: LuceneUtils.java    License: Apache License 2.0
public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
  List<String> ret = Lists.newArrayList();

  try {
    TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()));
    CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      ret.add(token.toString());
    }
    stream.end();   // the TokenStream contract calls for end() before close()
    stream.close();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return ret;
}
 
Example 11
Source Project: lucene-solr   Source File: ShingleAnalyzerWrapperTest.java    License: Apache License 2.0
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
  BooleanQuery.Builder q = new BooleanQuery.Builder();

  try (TokenStream ts = analyzer.tokenStream("content", "test sentence")) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  
    ts.reset();
    while (ts.incrementToken()) {
      String termText =  termAtt.toString();
      q.add(new TermQuery(new Term("content", termText)),
          BooleanClause.Occur.SHOULD);
    }
    ts.end();
  }

  ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;
  int[] ranks = new int[] { 1, 2, 0 };
  compareRanks(hits, ranks);
}
 
Example 12
private List<String> toTermString(String field, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzerFactory.getAnalyzer().tokenStream(field,
                new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString()
                    .replaceAll("^term\\=", ""));
        }
        stream.close();
    } catch (IOException e) {
        LoggerFactory.getLogger(AnalyzerFactoryTestCase.class)
                .error("Error during Token processing.", e);
    }
    return result;
}
 
Example 13
Source Project: lucene-solr   Source File: SpellingQueryConverter.java    License: Apache License 2.0
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {      
    Token token = new Token();
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setOffset(offset + offsetAtt.startOffset(), 
                    offset + offsetAtt.endOffset());
    token.setFlags(flagsAttValue); //overwriting any flags already set...
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    token.setPositionIncrement(posIncAtt.getPositionIncrement());
    result.add(token);
  }
  stream.end();
  stream.close();
}
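Example 13 above reads the term through termAtt.buffer() and termAtt.length() rather than toString(). buffer() exposes the attribute's reusable char[] (only the first length() characters belong to the current token), which avoids allocating a String per token. Below is a minimal sketch of that zero-copy pattern; the method name and the letter-counting logic are illustrative assumptions, not from any project above.

private static long countLetters(TokenStream stream) throws IOException {
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    long letters = 0;
    stream.reset();
    while (stream.incrementToken()) {
        // The buffer is reused and overwritten on every incrementToken() call.
        char[] buf = termAtt.buffer();
        for (int i = 0; i < termAtt.length(); i++) {
            if (Character.isLetter(buf[i])) {
                letters++;                 // read directly, no per-token String
            }
        }
    }
    stream.end();
    stream.close();
    return letters;
}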
 
Example 14
@Test
public void testAll() throws IOException {
    URLTokenizer tokenizer = new URLTokenizer();
    tokenizer.setReader(new StringReader(TEST_HTTPS_URL));
    CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
    tokenizer.reset();
    tokenizer.clearAttributes();
    List<String> tokens = new ArrayList<>();
    while(tokenizer.incrementToken()){
        tokens.add(termAttribute.toString());
    }

    assertThat(tokens, hasItem(equalTo("https")));
    assertThat(tokens, hasItem(equalTo("foo.bar.com")));
    assertThat(tokens, hasItem(equalTo("www.foo.bar.com:9200")));
    assertThat(tokens, hasItem(equalTo("https://www.foo.bar.com")));

    tokenizer = createTokenizer("https://foo.com", null);
    assertThat(tokenizer, hasTokenAtOffset("https", 0, 5));
}
 
Example 15
public void testTermFrequency() throws Exception {
  String test = "The quick|40 red|4 fox|06 jumped|1 over the lazy|2 brown|123 dogs|1024";
  DelimitedTermFrequencyTokenFilter filter =
      new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
  filter.reset();
  assertTermEquals("The", filter, termAtt, tfAtt, 1);
  assertTermEquals("quick", filter, termAtt, tfAtt, 40);
  assertTermEquals("red", filter, termAtt, tfAtt, 4);
  assertTermEquals("fox", filter, termAtt, tfAtt, 6);
  assertTermEquals("jumped", filter, termAtt, tfAtt, 1);
  assertTermEquals("over", filter, termAtt, tfAtt, 1);
  assertTermEquals("the", filter, termAtt, tfAtt, 1);
  assertTermEquals("lazy", filter, termAtt, tfAtt, 2);
  assertTermEquals("brown", filter, termAtt, tfAtt, 123);
  assertTermEquals("dogs", filter, termAtt, tfAtt, 1024);
  assertFalse(filter.incrementToken());
  filter.end();
  filter.close();
}
 
Example 16
Source Project: Indra   Source File: StandardPreProcessorIterator.java    License: MIT License
private void initialize(String text) {
    String content = metadata.applyLowercase ? text.toLowerCase() : text;

    if (!transformers.isEmpty()) {
        StringBuilder sbContent = new StringBuilder(content);
        transformers.forEach(t -> t.transform(sbContent));
        content = sbContent.toString();
    }

    StringReader reader = new StringReader(content);

    tokenizer.setReader(reader);
    this.cattr = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
    } catch (IOException e) {
        String initialPart = text.substring(0, Math.min(30, text.length()));
        throw new IndraRuntimeException(String.format("Error parsing the input starting with '%s'...", initialPart), e);
    }
}
 
Example 17
Source Project: lucene-solr   Source File: TestJapaneseTokenizer.java    License: Apache License 2.0
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
  int numIterations = atLeast(500);
  for (int i = 0; i < numIterations; i++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + i);
    }
    String s = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream ts = analyzer.tokenStream("foo", s)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
    }
  }
}
 
Example 18
Source Project: vertexium   Source File: ElasticsearchSearchQueryBase.java    License: Apache License 2.0
private String[] splitStringIntoTerms(String value) {
    try {
        List<String> results = new ArrayList<>();
        try (TokenStream tokens = analyzer.tokenStream("", value)) {
            CharTermAttribute term = tokens.getAttribute(CharTermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                String t = term.toString().trim();
                if (t.length() > 0) {
                    results.add(t);
                }
            }
        }
        return results.toArray(new String[results.size()]);
    } catch (IOException e) {
        throw new VertexiumException("Could not tokenize string: " + value, e);
    }
}
 
Example 19
public void testIntEncoding() throws Exception {
  String test = "The quick|1 red|2 fox|3 jumped over the lazy|5 brown|99 dogs|83";
  DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(whitespaceMockTokenizer(test), '|', new IntegerEncoder());
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
  filter.reset();
  assertTermEquals("The", filter, termAtt, payAtt, null);
  assertTermEquals("quick", filter, termAtt, payAtt, PayloadHelper.encodeInt(1));
  assertTermEquals("red", filter, termAtt, payAtt, PayloadHelper.encodeInt(2));
  assertTermEquals("fox", filter, termAtt, payAtt, PayloadHelper.encodeInt(3));
  assertTermEquals("jumped", filter, termAtt, payAtt, null);
  assertTermEquals("over", filter, termAtt, payAtt, null);
  assertTermEquals("the", filter, termAtt, payAtt, null);
  assertTermEquals("lazy", filter, termAtt, payAtt, PayloadHelper.encodeInt(5));
  assertTermEquals("brown", filter, termAtt, payAtt, PayloadHelper.encodeInt(99));
  assertTermEquals("dogs", filter, termAtt, payAtt, PayloadHelper.encodeInt(83));
  assertFalse(filter.incrementToken());
  filter.end();
  filter.close();
}
 
Example 20
Source Project: hanlp-lucene-plugin   Source File: HanLPAnalyzerTest.java    License: Apache License 2.0
public void testIssue() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken())
    {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 21
Source Project: lucene-solr   Source File: TestSimplePatternTokenizer.java    License: Apache License 2.0
/** 
 * TODO: rewrite tests not to use string comparison.
 */
private static String tsToString(TokenStream in) throws IOException {
  StringBuilder out = new StringBuilder();
  CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
  // extra safety: clear the attributes and assign bogus values to verify
  // that no state is preserved between calls
  in.clearAttributes();
  termAtt.setEmpty().append("bogusTerm");
  in.reset();
  while (in.incrementToken()) {
    if (out.length() > 0) {
      out.append(' ');
    }
    out.append(termAtt.toString());
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
  }

  in.close();
  return out.toString();
}
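Example 21 also exercises the write side of the interface: setEmpty() and append() replace the term text in place, and copyBuffer() (seen in Example 13) does the same from a char[]. Custom token filters rewrite terms the same way. A hedged sketch of such a filter follows; the class name is made up for illustration, and the per-char uppercasing deliberately ignores surrogate pairs to keep the sketch short.

final class UpperCasingFilter extends TokenFilter { // illustrative name, not a Lucene class
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    UpperCasingFilter(TokenStream in) {
        super(in);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        char[] buf = termAtt.buffer();
        for (int i = 0; i < termAtt.length(); i++) {
            buf[i] = Character.toUpperCase(buf[i]); // rewrite the term buffer in place
        }
        return true;
    }
}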
 
Example 22
/** splits the given string into tokens */
public static List<String> tokenize(String text) {
	List<String> tokens = new ArrayList<>();
	
	try (TokenStream tokenStream = analyzer.tokenStream("text", text)) {
		//TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(text));
		//tokenStream = new org.apache.lucene.analysis.core.StopFilter(Version.LUCENE_46, tokenStream, EnglishAnalyzer.getDefaultStopSet());
		CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class);
		
		// On the fence whether it is better to error here or not. Suggestions?
		tokenStream.reset();
	
		while (tokenStream.incrementToken()) {
			tokens.add(token.toString());
		}
	} catch (IOException e) {
		// If we can't trim it, so what?
		e.printStackTrace();
	}
	return tokens;
}
 
Example 23
public void testIncompletePhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "big apple", "new york city", "property tax", "three word phrase"), false);

    final String input = "some new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("some", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("york", term.toString());
}
 
Example 24
Source Project: lucene-solr   Source File: IndexTimeSynonymTest.java    License: Apache License 2.0
@Override
public TokenStreamComponents createComponents(String fieldName) {
  Tokenizer ts = new Tokenizer(Token.TOKEN_ATTRIBUTE_FACTORY) {
    final AttributeImpl reusableToken = (AttributeImpl) addAttribute(CharTermAttribute.class);
    int p = 0;
    
    @Override
    public boolean incrementToken() {
      if( p >= tokens.length ) return false;
      clearAttributes();
      tokens[p++].copyTo(reusableToken);
      return true;
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      this.p = 0;
    }
  };
  return new TokenStreamComponents(ts);
}
 
Example 25
Source Project: dexter   Source File: DexterAnalyzer.java    License: Apache License 2.0
public static void main(String[] args) throws IOException {
	String str = "<body>perchééééééééé";
	Analyzer anal = new DexterAnalyzer();
	TokenStream ts = anal.tokenStream("content", new StringReader(str));

	OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
	CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
	ts.reset();
	while (ts.incrementToken()) {
		System.out.println(termAtt.toString()
				.substring(0, termAtt.length()));
		System.out
				.println("token start offset: " + offsetAtt.startOffset());
		System.out.println("  token end offset: " + offsetAtt.endOffset());
	}
}
 
Example 26
/**
 * Query generation expression extracted from
 * {@link org.airsonic.player.service.SearchService#searchByName(String, String, int, int, List, Class)}.
 *
 * @param fieldName {@link FieldNames}
 * @param name      query text
 * @return Query
 * @throws IOException if reading the analyzed token stream fails
 */
public Query searchByName(String fieldName, String name) throws IOException {

    BooleanQuery.Builder mainQuery = new BooleanQuery.Builder();

    Analyzer analyzer = analyzerFactory.getQueryAnalyzer();

    try (TokenStream stream = analyzer.tokenStream(fieldName, name)) {
        stream.reset();
        stream.incrementToken();

        /*
         *  XXX 3.x -> 8.x :
         * In order to support wildcards,
         * QueryParser has been replaced by the following process.
         */

        /* Wildcards apply only to tail tokens **/
        while (true) {
            String token = stream.getAttribute(CharTermAttribute.class).toString();
            if (stream.incrementToken()) {
                mainQuery.add(new TermQuery(new Term(fieldName, token)), Occur.SHOULD);
            } else {
                WildcardQuery wildcardQuery = new WildcardQuery(new Term(fieldName, token.concat(ASTERISK)));
                mainQuery.add(wildcardQuery, Occur.SHOULD);
                break;
            }
        }

    }

    return mainQuery.build();
}
 
Example 27
Source Project: mmseg4j-solr   Source File: MMSegTokenizer.java    License: Apache License 2.0
public MMSegTokenizer(Seg seg) {
	this.seg = seg;

	termAtt = addAttribute(CharTermAttribute.class);
	offsetAtt = addAttribute(OffsetAttribute.class);
	typeAtt = addAttribute(TypeAttribute.class);
}
 
Example 28
Source Project: jstarcraft-nlp   Source File: TestToken.java    License: Apache License 2.0
public static void main(String[] args) {

//        SynonymsLibrary.put(SynonymsLibrary.DEFAULT, "../../library/synonyms.dic");
//
//        DicLibrary.insert(DicLibrary.DEFAULT, "清华", "n", 2000);
//        DicLibrary.insert(DicLibrary.DEFAULT, "大学", "n", 2000);

        Map<String, String> map = new HashMap<String, String>();

        map.put("type", "base_ansj");
//        map.put(SynonymsLibrary.DEFAULT, SynonymsLibrary.DEFAULT);

        Analyzer ca = new AnsjAnalyzer(map);

        String content = "我爱北京天安门天安门上太阳升我美丽的清华大学";

        try {
            TokenStream tokenStream = ca.tokenStream(content, new StringReader(content));
            // reset() is required before the first call to incrementToken()
            tokenStream.reset();

            while (tokenStream.incrementToken()) {
                System.out.print(tokenStream.getAttribute(CharTermAttribute.class));
                System.out.print("\t");
                System.out.print(tokenStream.getAttribute(OffsetAttribute.class).startOffset());
                System.out.print("\t");
                System.out.print(tokenStream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
                System.out.print("\t");
                System.out.println(tokenStream.getAttribute(TypeAttribute.class).type());
            }
            tokenStream.end();
            tokenStream.close();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        ca.close();
    }
 
Example 29
Source Project: lucene-solr   Source File: Test2BTerms.java    License: Apache License 2.0
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
  if (attClass == TermToBytesRefAttribute.class)
    return new MyTermAttributeImpl();
  if (CharTermAttribute.class.isAssignableFrom(attClass))
    throw new IllegalArgumentException("no");
  return delegate.createAttributeInstance(attClass);
}
 
Example 30
Source Project: lucene-solr   Source File: TestJapaneseNumberFilter.java    License: Apache License 2.0
public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
  TokenStream stream = analyzer.tokenStream("dummy", reader);
  stream.reset();

  CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

  while (stream.incrementToken()) {
    writer.write(termAttr.toString());
    writer.write("\n");
  }

  reader.close();
  writer.close();
}