Java Code Examples for org.apache.lucene.analysis.Tokenizer#getAttribute()

The following examples show how to use org.apache.lucene.analysis.Tokenizer#getAttribute(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: HanLpTokenizerFactoryTestCase.java    From jstarcraft-nlp with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a tokenizer from a {@code HanLpTokenizerFactory} configured with
 * {@code enableTraditionalChineseMode=true}, tokenizes a Traditional Chinese sample text and
 * prints, for every token, its start/end offsets, position increment, term text and type.
 *
 * @throws Exception if the tokenizer cannot be created or consumed
 */
@Test
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(args);

    // try-with-resources guarantees close() is called even when tokenization throws.
    try (Tokenizer tokenizer = factory.create(null)) {
        // Following the Lucene consumer workflow, attribute references are obtained once
        // up front instead of being looked up again for every token.
        CharTermAttribute termAttr = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAttr = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);

        tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前"
                + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有"
                + "突出外表、百變髮型及正面的形象,以至自己"
                + "品牌的男士香水等商品,及長期擔任運動品牌"
                + "Adidas的代言人,因此對大眾傳播媒介和時尚界"
                + "等方面都具很大的影響力,在足球圈外所獲得的"
                + "認受程度可謂前所未見。"));
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAttr.startOffset(), offsetAttr.endOffset(), positionAttr.getPositionIncrement(), termAttr, typeAttr.type());
        }
        // Signal end-of-stream before the tokenizer is closed.
        tokenizer.end();
    }
}
 
Example 2
Source File: HanLpQueryAnalyzerTestCase.java    From jstarcraft-nlp with Apache License 2.0 6 votes vote down vote up
/**
 * Tokenizes a short Traditional Chinese question with a tokenizer created by a
 * {@code HanLpTokenizerFactory} that has {@code enableTraditionalChineseMode} and
 * {@code enableNormalization} set to {@code "true"}, printing offsets, position increment,
 * term text and type for every token.
 *
 * @throws Exception if the tokenizer cannot be created or consumed
 */
@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    String text = "會辦台星保證最低價的原因?";

    // try-with-resources guarantees close() is called even when tokenization throws.
    try (Tokenizer tokenizer = factory.create()) {
        // Following the Lucene consumer workflow, attribute references are obtained once
        // up front instead of being looked up again for every token.
        CharTermAttribute termAttr = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAttr = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);

        tokenizer.setReader(new StringReader(text));
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAttr.startOffset(), offsetAttr.endOffset(), positionAttr.getPositionIncrement(), termAttr, typeAttr.type());
        }
        // Signal end-of-stream before the tokenizer is closed.
        tokenizer.end();
    }
}
 
Example 3
Source File: NlpSegmenterTestCase.java    From jstarcraft-nlp with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the segmenter returned by {@code getSegmenter()} over a mixed Chinese/English sentence
 * and checks, case-insensitively, that every emitted term equals the slice of the input text
 * delimited by the token's start and end offsets.
 *
 * @throws Exception if the segmenter cannot be consumed
 */
@Test
public void testSegmenter() throws Exception {
    String text = "中华人民共和国(People's Republic of China),简称'中国'";

    // try-with-resources guarantees close() is called even when an assertion fails.
    try (Tokenizer segmenter = getSegmenter()) {
        // Only the term and offset attributes are used by the assertions below; they are
        // looked up once rather than on every token.
        // term
        CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);

        segmenter.setReader(new StringReader(text));
        segmenter.reset();
        while (segmenter.incrementToken()) {
            LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
            Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
        }
        // Signal end-of-stream before the segmenter is closed.
        segmenter.end();
    }
}
 
Example 4
Source File: HanLPAnalyzerTest.java    From hanlp-lucene-plugin with Apache License 2.0 6 votes vote down vote up
/**
 * Tokenizes a short Traditional Chinese question with a tokenizer created by a
 * {@code HanLPTokenizerFactory} that has {@code enableTraditionalChineseMode} and
 * {@code enableNormalization} set to {@code "true"}, printing offsets, position increment,
 * term text and type for every token.
 *
 * @throws Exception if the tokenizer cannot be created or consumed
 */
public void testIssue() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    String text = "會辦台星保證最低價的原因?";

    // try-with-resources guarantees close() is called even when tokenization throws.
    try (Tokenizer tokenizer = factory.create())
    {
        // Following the Lucene consumer workflow, attribute references are obtained once
        // up front instead of being looked up again for every token.
        CharTermAttribute termAttr = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAttr = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);

        tokenizer.setReader(new StringReader(text));
        tokenizer.reset();
        while (tokenizer.incrementToken())
        {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAttr.startOffset(), offsetAttr.endOffset(), positionAttr.getPositionIncrement(), termAttr, typeAttr.type());
        }
        // Signal end-of-stream before the tokenizer is closed.
        tokenizer.end();
    }
}
 
Example 5
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Uses a split pattern made of 100 'a' characters followed by a 'b' and feeds the tokenizer
 * 200 'a' characters; the whole input is expected to come back as one single token.
 */
public void testBigLookahead() throws Exception {
  // Split pattern: "a" x 100, then "b".
  StringBuilder patternBuilder = new StringBuilder();
  int appended = 0;
  while (appended < 100) {
    patternBuilder.append('a');
    appended++;
  }
  patternBuilder.append('b');
  Tokenizer tokenizer = new SimplePatternSplitTokenizer(patternBuilder.toString());
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  // Input: "a" x 200, which never contains the split pattern.
  StringBuilder inputBuilder = new StringBuilder();
  for (int remaining = 200; remaining > 0; remaining--) {
    inputBuilder.append('a');
  }
  String input = inputBuilder.toString();

  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();
  boolean producedToken = tokenizer.incrementToken();
  assertTrue(producedToken);
  assertEquals(input, term.toString());
  boolean producedAnother = tokenizer.incrementToken();
  assertFalse(producedAnother);
}
 
Example 6
Source File: URLTokenizer.java    From elasticsearch-analysis-url with Apache License 2.0 5 votes vote down vote up
/**
 * Get a list of {@link Token}s from the given {@link Tokenizer}
 * @param part the url part which should be used in {@link Token} creation
 * @param tokenizer the tokenizer from which tokens will be gleaned
 * @param start value added to the start and end offsets reported by the tokenizer for each token
 * @return a list of tokens, in the order the tokenizer produced them
 * @throws IOException propagated from the tokenizer's {@code reset()} or {@code incrementToken()}
 */
private List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
    tokenizer.reset();
    // Attribute references are looked up once here rather than again for every token.
    CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
    List<Token> tokens = new ArrayList<>();
    while (tokenizer.incrementToken()) {
        tokens.add(new Token(term.toString(), part, start + offset.startOffset(), start + offset.endOffset()));
    }
    return tokens;
}
 
Example 7
Source File: HanLPTokenizerFactoryTest.java    From hanlp-lucene-plugin with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a tokenizer from a {@code HanLPTokenizerFactory} configured with
 * {@code enableTraditionalChineseMode=true}, tokenizes a Traditional Chinese sample text and
 * prints, for every token, its start/end offsets, position increment, term text and type.
 *
 * @throws Exception if the tokenizer cannot be created or consumed
 */
public void testCreate() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLPTokenizerFactory(args);

    // try-with-resources guarantees close() is called even when tokenization throws.
    try (Tokenizer tokenizer = factory.create(null))
    {
        // Following the Lucene consumer workflow, attribute references are obtained once
        // up front instead of being looked up again for every token.
        CharTermAttribute termAttr = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAttr = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);

        tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" +
                                                     "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" +
                                                     "突出外表、百變髮型及正面的形象,以至自己" +
                                                     "品牌的男士香水等商品,及長期擔任運動品牌" +
                                                     "Adidas的代言人,因此對大眾傳播媒介和時尚界" +
                                                     "等方面都具很大的影響力,在足球圈外所獲得的" +
                                                     "認受程度可謂前所未見。"));
        tokenizer.reset();
        while (tokenizer.incrementToken())
        {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAttr.startOffset(), offsetAttr.endOffset(), positionAttr.getPositionIncrement(), termAttr, typeAttr.type());
        }
        // Signal end-of-stream before the tokenizer is closed.
        tokenizer.end();
    }
}
 
Example 8
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Splits "aaabbb" on the pattern {@code a+}: expects the single token "bbb" and, once
 * {@code end()} has been called, an end offset of 6.
 */
public void testEndOffset() throws Exception {
  final String input = "aaabbb";
  Tokenizer splitter = new SimplePatternSplitTokenizer("a+");
  OffsetAttribute offsets = splitter.getAttribute(OffsetAttribute.class);
  CharTermAttribute term = splitter.getAttribute(CharTermAttribute.class);

  splitter.setReader(new StringReader(input));
  splitter.reset();

  boolean hasFirstToken = splitter.incrementToken();
  assertTrue(hasFirstToken);
  assertEquals("bbb", term.toString());

  boolean hasSecondToken = splitter.incrementToken();
  assertFalse(hasSecondToken);

  // The final offset is only checked after end() has been called.
  splitter.end();
  assertEquals(6, offsets.endOffset());
}
 
Example 9
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Splits "bbab" on the pattern {@code a*} (which can match the empty string) and expects
 * the tokens "bb" and "b".
 */
public void testEmptyStringPatternOneMatch() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("a*");
  t.setReader(new StringReader("bbab"));
  // The two int arrays are the expected start and end offsets of each token, in that order.
  assertTokenStreamContents(t,
                            new String[] {"bb", "b"},
                            new int[] {0, 3},
                            new int[] {2, 4});
}
 
Example 10
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Splits "a c   " on runs of whitespace and expects only "a" and "c"; the trailing
 * spaces must not produce a token.
 */
public void testTrailingNonToken() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
  t.setReader(new StringReader("a c   "));
  // The two int arrays are the expected start and end offsets of each token, in that order.
  assertTokenStreamContents(t,
                            new String[] {"a", "c"},
                            new int[] {0, 2},
                            new int[] {1, 3});
}
 
Example 11
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Splits "    a c" on runs of whitespace and expects only "a" and "c"; the leading
 * spaces must not produce a token.
 */
public void testLeadingNonToken() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
  t.setReader(new StringReader("    a c"));
  // The two int arrays are the expected start and end offsets of each token, in that order.
  assertTokenStreamContents(t,
                            new String[] {"a", "c"},
                            new int[] {4, 6},
                            new int[] {5, 7});
}
 
Example 12
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Splits "a \tb   c" on the pattern {@code [ \t\r\n]*} (whitespace runs) and expects the
 * tokens "a", "b" and "c".
 */
public void testSplitMultiCharWhitespace() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
  t.setReader(new StringReader("a \tb   c"));
  // The two int arrays are the expected start and end offsets of each token, in that order.
  assertTokenStreamContents(t,
                            new String[] {"a", "b", "c"},
                            new int[] {0, 3, 7},
                            new int[] {1, 4, 8});
}
 
Example 13
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Splits "a \tb   c" on the pattern {@code [ \t\r\n]} (a single whitespace character) and
 * expects the tokens "a", "b" and "c".
 */
public void testSplitSingleCharWhitespace() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]");
  t.setReader(new StringReader("a \tb   c"));
  // The two int arrays are the expected start and end offsets of each token, in that order.
  assertTokenStreamContents(t,
                            new String[] {"a", "b", "c"},
                            new int[] {0, 3, 7},
                            new int[] {1, 4, 8});
}
 
Example 14
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Splits "bbb" on the pattern {@code a*}: the input contains no 'a', so the whole input is
 * expected back as the only token.
 */
public void testEmptyStringPatternNoMatch() throws Exception {
  final String input = "bbb";
  Tokenizer splitter = new SimplePatternSplitTokenizer("a*");
  CharTermAttribute term = splitter.getAttribute(CharTermAttribute.class);

  splitter.setReader(new StringReader(input));
  splitter.reset();

  boolean hasFirstToken = splitter.incrementToken();
  assertTrue(hasFirstToken);
  assertEquals("bbb", term.toString());

  boolean hasSecondToken = splitter.incrementToken();
  assertFalse(hasSecondToken);
}
 
Example 15
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Splits a random non-empty string on the pattern {@code .*} and expects no tokens at all.
 */
public void testNoTokens() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer(".*");
  // Keep drawing random strings until a non-empty one comes up.
  String s;
  do {
    s = TestUtil.randomUnicodeString(random());
  } while (s.length() == 0);
  t.setReader(new StringReader(s));
  t.reset();
  assertFalse(t.incrementToken());
}
 
Example 16
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Matches the pattern {@code a+} against "aaabbb": expects the single token "aaa" and, once
 * {@code end()} has been called, an end offset of 6.
 */
public void testEndOffset() throws Exception {
  final String input = "aaabbb";
  Tokenizer matcher = new SimplePatternTokenizer("a+");
  OffsetAttribute offsets = matcher.getAttribute(OffsetAttribute.class);
  CharTermAttribute term = matcher.getAttribute(CharTermAttribute.class);

  matcher.setReader(new StringReader(input));
  matcher.reset();

  boolean hasFirstToken = matcher.incrementToken();
  assertTrue(hasFirstToken);
  assertEquals("aaa", term.toString());

  boolean hasSecondToken = matcher.incrementToken();
  assertFalse(hasSecondToken);

  // The final offset is only checked after end() has been called.
  matcher.end();
  assertEquals(6, offsets.endOffset());
}
 
Example 17
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Matches the pattern {@code a*} against "bbab": expects exactly one token, "a".
 */
public void testEmptyStringPatternOneMatch() throws Exception {
  final String input = "bbab";
  Tokenizer matcher = new SimplePatternTokenizer("a*");
  CharTermAttribute term = matcher.getAttribute(CharTermAttribute.class);

  matcher.setReader(new StringReader(input));
  matcher.reset();

  boolean hasFirstToken = matcher.incrementToken();
  assertTrue(hasFirstToken);
  assertEquals("a", term.toString());

  boolean hasSecondToken = matcher.incrementToken();
  assertFalse(hasSecondToken);
}
 
Example 18
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Matches the pattern {@code .*} against a random non-empty string and expects the first
 * token to be the entire input.
 */
public void testOneToken() throws Exception {
  Tokenizer matcher = new SimplePatternTokenizer(".*");
  CharTermAttribute term = matcher.getAttribute(CharTermAttribute.class);

  // Keep drawing random strings until a non-empty one comes up.
  String input;
  do {
    input = TestUtil.randomUnicodeString(random());
  } while (input.length() == 0);

  matcher.setReader(new StringReader(input));
  matcher.reset();
  boolean hasToken = matcher.incrementToken();
  assertTrue(hasToken);
  assertEquals(input, term.toString());
}