Java Code Examples for org.apache.lucene.analysis.Tokenizer#reset()

The following examples show how to use org.apache.lucene.analysis.Tokenizer#reset(). They are taken from open source projects; the source file, project, and license are noted above each example.
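
Across the examples, the consumer workflow is the same: give the Tokenizer input via setReader(), call reset() before the first incrementToken(), iterate, then call end() and close(). A minimal sketch of that lifecycle follows, using WhitespaceTokenizer purely for illustration; any Tokenizer subclass obeys the same contract.

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerResetSketch {
  public static void main(String[] args) throws Exception {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);

    tokenizer.setReader(new StringReader("reset before incrementToken"));
    tokenizer.reset();                 // required before the first incrementToken()
    while (tokenizer.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    tokenizer.end();                   // records the final offset state
    tokenizer.close();                 // releases the reader; setReader() + reset() allow reuse
  }
}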
Example 1
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0
public void testBigLookahead() throws Exception {
  StringBuilder b = new StringBuilder();
  for(int i=0;i<100;i++) {
    b.append('a');
  }
  b.append('b');
  Tokenizer t = new SimplePatternSplitTokenizer(b.toString());
  CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

  b = new StringBuilder();
  for(int i=0;i<200;i++) {
    b.append('a');
  }
  t.setReader(new StringReader(b.toString()));
  t.reset();
  assertTrue(t.incrementToken());
  assertEquals(b.toString(), termAtt.toString());
  assertFalse(t.incrementToken());
}
 
Example 2
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private ArrayList<char[]> tokenize( String input ) throws IOException {
      
  Log.debug( "tokenize '" + input + "'" );
  ArrayList<char[]> tokens = new ArrayList<char[]>( );
  Tokenizer tk = getTokenizerImpl( input );
  
  CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
  tk.reset( );
  while (tk.incrementToken( ) ) {
    int bufLen = term.length();
    char[] copy = new char[ bufLen ];
    System.arraycopy(term.buffer( ), 0, copy, 0, bufLen );
    tokens.add( copy );
  }
      
  return tokens;
}
 
Example 3
Source File: HanLPAnalyzerTest.java    From hanlp-lucene-plugin with Apache License 2.0
public void testIssue() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken())
    {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 4
Source File: NlpSegmenterTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void testSegmenter() throws Exception {
    Tokenizer segmenter = getSegmenter();
    String text = "中华人民共和国(People's Republic of China),简称'中国'";
    segmenter.setReader(new StringReader(text));
    segmenter.reset();
    while (segmenter.incrementToken()) {
        // term
        CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);
        LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
        Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
    }
}
 
Example 5
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0
@Test
public void testShortSentence() throws Exception {
  Tokenizer tokenizer = createTokenizer(
      new StringReader("꽃배달 꽃망울 오토바이"), 2);
  assertEquals(
      "꽃:N:NNG:null:1:1:0:1,배달:N:NNG:null:1:1:1:3,"
      + "꽃:N:NNG:null:1:1:4:5,꽃망울:COMPOUND:Compound:null:0:2:4:7,"
      + "망울:N:NNG:null:1:1:5:7,오토바이:N:NNG:null:1:1:8:12,",
      tokenizerToString(tokenizer));
 
  tokenizer.reset();
  tokenizer.setReader(new StringReader("소설 무궁화꽃이 피었습니다."));
  assertEquals(
      "소설:N:NNG:null:1:1:0:2,무궁:N:NNG:null:1:1:3:5,"
      + "무궁화:COMPOUND:Compound:null:0:2:3:6,화:N:NNG:null:1:1:5:6,"
      + "꽃이:EOJEOL:NNG+JKS:null:1:1:6:8,꽃:N:NNG:null:0:1:6:7,"
      + "피었습니다:EOJEOL:VV+EP+EF:null:1:1:9:14,",
      tokenizerToString(tokenizer));
  tokenizer.close();
}
 
Example 6
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
public void testBigLookahead() throws Exception {
  StringBuilder b = new StringBuilder();
  for(int i=0;i<100;i++) {
    b.append('a');
  }
  b.append('b');
  Tokenizer t = new SimplePatternTokenizer(b.toString());

  b = new StringBuilder();
  for(int i=0;i<200;i++) {
    b.append('a');
  }
  t.setReader(new StringReader(b.toString()));
  t.reset();
  assertFalse(t.incrementToken());
}
 
Example 7
Source File: TestOpenNLPTokenizerFactory.java    From lucene-solr with Apache License 2.0
@Test
public void testClose() throws IOException {
  Map<String,String> args = new HashMap<String,String>() {{ put("sentenceModel", "en-test-sent.bin");
                                                            put("tokenizerModel", "en-test-tokenizer.bin"); }};
  OpenNLPTokenizerFactory factory = new OpenNLPTokenizerFactory(args);
  factory.inform(new ClasspathResourceLoader(getClass()));

  Tokenizer ts = factory.create(newAttributeFactory());
  ts.setReader(new StringReader(SENTENCES));

  ts.reset();
  ts.close();
  ts.reset();
  ts.setReader(new StringReader(SENTENCES));
  assertTokenStreamContents(ts, SENTENCES_punc);
  ts.close();
  ts.reset();
  ts.setReader(new StringReader(SENTENCES));
  assertTokenStreamContents(ts, SENTENCES_punc);
}
 
Example 8
Source File: HanLpQueryAnalyzerTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 9
Source File: HanLpTokenizerFactoryTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" + "突出外表、百變髮型及正面的形象,以至自己" + "品牌的男士香水等商品,及長期擔任運動品牌" + "Adidas的代言人,因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力,在足球圈外所獲得的" + "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 10
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
public void testEmptyStringPatternOneMatch() throws Exception {
  Tokenizer t = new SimplePatternTokenizer("a*");
  CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
  t.setReader(new StringReader("bbab"));
  t.reset();
  assertTrue(t.incrementToken());
  assertEquals("a", termAtt.toString());
  assertFalse(t.incrementToken());
}
 
Example 11
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
public void testEndOffset() throws Exception {
  Tokenizer t = new SimplePatternTokenizer("a+");
  CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
  t.setReader(new StringReader("aaabbb"));
  t.reset();
  assertTrue(t.incrementToken());
  assertEquals("aaa", termAtt.toString());
  assertFalse(t.incrementToken());
  t.end();
  assertEquals(6, offsetAtt.endOffset());
}
 
Example 12
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
public void testOneToken() throws Exception {
  Tokenizer t = new SimplePatternTokenizer(".*");
  CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
  String s;
  while (true) {
    s = TestUtil.randomUnicodeString(random());
    if (s.length() > 0) {
      break;
    }
  }
  t.setReader(new StringReader(s));
  t.reset();
  assertTrue(t.incrementToken());
  assertEquals(s, termAtt.toString());
}
 
Example 13
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0
public void testNoTokens() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer(".*");
  CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
  String s;
  while (true) {
    s = TestUtil.randomUnicodeString(random());
    if (s.length() > 0) {
      break;
    }
  }
  t.setReader(new StringReader(s));
  t.reset();
  assertFalse(t.incrementToken());
}
 
Example 14
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0
public void testEmptyStringPatternNoMatch() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("a*");
  CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
  t.setReader(new StringReader("bbb"));
  t.reset();
  assertTrue(t.incrementToken());
  assertEquals("bbb", termAtt.toString());
  assertFalse(t.incrementToken());
}
 
Example 15
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0
public void testEndOffset() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("a+");
  CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
  t.setReader(new StringReader("aaabbb"));
  t.reset();
  assertTrue(t.incrementToken());
  assertEquals("bbb", termAtt.toString());
  assertFalse(t.incrementToken());
  t.end();
  assertEquals(6, offsetAtt.endOffset());
}
 
Example 16
Source File: HanLPTokenizerFactoryTest.java    From hanlp-lucene-plugin with Apache License 2.0
public void testCreate() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" +
                                                 "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" +
                                                 "突出外表、百變髮型及正面的形象,以至自己" +
                                                 "品牌的男士香水等商品,及長期擔任運動品牌" +
                                                 "Adidas的代言人,因此對大眾傳播媒介和時尚界" +
                                                 "等方面都具很大的影響力,在足球圈外所獲得的" +
                                                 "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken())
    {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 17
Source File: URLTokenizer.java    From elasticsearch-analysis-url with Apache License 2.0
/**
 * Get a list of {@link Token}s from the given {@link Tokenizer}
 * @param part the url part which should be used in {@link Token} creation
 * @param tokenizer the tokenizer from which tokens will be gleaned
 * @param start the offset added to each token's start and end offsets
 * @return a list of tokens
 * @throws IOException if the tokenizer cannot be reset or read
 */
private List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
    tokenizer.reset();
    List<Token> tokens = new ArrayList<>();
    OffsetAttribute offset;
    String token;
    while (tokenizer.incrementToken()) {
        token = tokenizer.getAttribute(CharTermAttribute.class).toString();
        offset = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(new Token(token, part, start + offset.startOffset(), start + offset.endOffset()));
    }
    return tokens;
}
 
Example 18
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
public void testEmptyStringPatternNoMatch() throws Exception {
  Tokenizer t = new SimplePatternTokenizer("a*");
  t.setReader(new StringReader("bbb"));
  t.reset();
  assertFalse(t.incrementToken());
}
 
Example 19
Source File: NGramTokenizerTest.java    From lucene-solr with Apache License 2.0
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
  // convert the string to code points
  final int[] codePoints = toCodePoints(s);
  final int[] offsets = new int[codePoints.length + 1];
  for (int i = 0; i < codePoints.length; ++i) {
    offsets[i+1] = offsets[i] + Character.charCount(codePoints[i]);
  }
  final Tokenizer grams = new NGramTokenizer(minGram, maxGram, edgesOnly) {
    @Override
    protected boolean isTokenChar(int chr) {
      return nonTokenChars.indexOf(chr) < 0;
    }
  };
  grams.setReader(new StringReader(s));
  final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
  final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
  final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
  grams.reset();
  for (int start = 0; start < codePoints.length; ++start) {
    nextGram:
    for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
      if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
        // not on an edge
        continue nextGram;
      }
      for (int j = start; j < end; ++j) {
        if (!isTokenChar(nonTokenChars, codePoints[j])) {
          continue nextGram;
        }
      }
      assertTrue(grams.incrementToken());
      assertArrayEquals(ArrayUtil.copyOfSubArray(codePoints, start, end), toCodePoints(termAtt));
      assertEquals(1, posIncAtt.getPositionIncrement());
      assertEquals(1, posLenAtt.getPositionLength());
      assertEquals(offsets[start], offsetAtt.startOffset());
      assertEquals(offsets[end], offsetAtt.endOffset());
    }
  }
  assertFalse(grams.incrementToken());
  grams.end();
  assertEquals(s.length(), offsetAtt.startOffset());
  assertEquals(s.length(), offsetAtt.endOffset());
}