org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute Java Examples

The following examples show how to use org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AnalysisRequestHandlerBase.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Consumes the given TokenStream and captures every token it emits.
 *
 * @param tokenStream TokenStream to consume; it is reset, ended and closed here
 *
 * @return one cloned AttributeSource per emitted token, in stream order
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> collected = new ArrayList<>();
  final PositionIncrementAttribute incrementAttr = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute tracker = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // Register the "common" attributes up front for backwards compatibility.
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int currentPosition = 0;
    while (tokenStream.incrementToken()) {
      // Accumulate the absolute position and record it on the tracker
      // before snapshotting this token's attribute state.
      currentPosition += incrementAttr.getPositionIncrement();
      tracker.setActPosition(currentPosition);
      collected.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }

  return collected;
}
 
Example #2
Source File: HanLpTokenizerFactoryTestCase.java    From jstarcraft-nlp with Apache License 2.0 6 votes vote down vote up
@Test
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" + "突出外表、百變髮型及正面的形象,以至自己" + "品牌的男士香水等商品,及長期擔任運動品牌" + "Adidas的代言人,因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力,在足球圈外所獲得的" + "認受程度可謂前所未見。"));
    tokenizer.reset();
    // Attribute instances are stable for the stream's lifetime, so look
    // them up once instead of on every loop iteration.
    CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
    // offsets
    OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
    // position increment
    PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
    // token type (part of speech)
    TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
    while (tokenizer.incrementToken()) {
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
    // Honour the TokenStream contract: end() then close() after consumption.
    tokenizer.end();
    tokenizer.close();
}
 
Example #3
Source File: Tagger.java    From SolrTextTagger with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a Tagger over the given terms and token stream. Registers the
 * attributes the tagger reads and calls {@code tokenStream.reset()} here,
 * so callers must not reset the stream again.
 */
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
              TagClusterReducer tagClusterReducer, boolean skipAltTokens,
              boolean ignoreStopWords) throws IOException {
  this.terms = terms;
  this.liveDocs = liveDocs;
  this.tokenStream = tokenStream;
  this.skipAltTokens = skipAltTokens;
  this.ignoreStopWords = ignoreStopWords;
  // Register the attributes consumed while iterating the stream.
  byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
  posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
  taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
  tokenStream.reset();

  this.tagClusterReducer = tagClusterReducer;
}
 
Example #4
Source File: HanLPTokenizerTest.java    From hanlp-lucene-plugin with Apache License 2.0 6 votes vote down vote up
public void testPinyinTokenFilter() throws Exception
{
    Map<String, String> args = new HashMap<>();
    args.put("original", "true");
    args.put("pinyin", "false");
    args.put("pinyinFirstChar", "true");
    HanLPPinyinTokenFilterFactory factory = new HanLPPinyinTokenFilterFactory(args);
    TokenStream tokenStream = factory.create(tokenizer);
    // NOTE(review): tokenStream.reset() is never called before incrementToken();
    // presumably the shared `tokenizer` fixture is reset in test setup — confirm.
    // Attribute instances are stable per stream, so fetch them once up front
    // instead of on every iteration (the filter shares the tokenizer's
    // AttributeSource, so reading them from `tokenizer` is equivalent).
    CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
    // offsets
    OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
    // position increment
    PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
    // token type (part of speech)
    TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
    while (tokenStream.incrementToken())
    {
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example #5
Source File: LuceneAnalyzerTest.java    From jstarcraft-nlp with Apache License 2.0 6 votes vote down vote up
@Test
public void test2() throws Exception {
    MynlpAnalyzer analyzer = new MynlpAnalyzer(Lexers.core().filterReader(true, true));

    TokenStream tokenStream = analyzer.tokenStream("title", "俞正声主持召开全国政协第五十三次主席会议");
    tokenStream.reset();

    // StringBuilder: no synchronization needed in this single-threaded test
    // (StringBuffer's locking is pure overhead here).
    StringBuilder sb = new StringBuilder();
    // Attribute instances are stable for the stream's lifetime: fetch once
    // instead of on every loop iteration.
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = tokenStream.getAttribute(PositionIncrementAttribute.class);

    while (tokenStream.incrementToken()) {
        sb.append(termAtt);
        sb.append("\t");
        sb.append(offsetAtt.startOffset());
        sb.append("\t");
        sb.append(posIncAtt.getPositionIncrement());
        sb.append("\n");
    }
    System.out.println(sb.toString());
    analyzer.close();
}
 
Example #6
Source File: HanLpQueryAnalyzerTestCase.java    From jstarcraft-nlp with Apache License 2.0 6 votes vote down vote up
@Test
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    // Print each character with its index so the token offsets below are
    // easy to read against the input.
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    try (Analyzer analyzer = new HanLpQueryAnalyzer("viterbi")) {
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        // Attribute instances are stable for the stream's lifetime: fetch
        // once instead of on every loop iteration.
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // token type (part of speech)
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        while (tokenStream.incrementToken()) {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
    }
}
 
Example #7
Source File: AnalyzersTest.java    From russianmorphology with Apache License 2.0 6 votes vote down vote up
@Test
public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
    Analyzer morphlogyAnalyzer = new RussianAnalyzer();
    // Encode explicitly as UTF-8: the no-arg getBytes() uses the platform
    // default charset, which would corrupt the Cyrillic text on non-UTF-8
    // JVMs while the reader below always decodes as UTF-8.
    InputStreamReader reader = new InputStreamReader(
            new ByteArrayInputStream("принеси мне вина на новый год".getBytes(java.nio.charset.StandardCharsets.UTF_8)), "UTF-8");

    TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
    tokenStream.reset();
    Set<String> foromsOfWine = new HashSet<String>();
    foromsOfWine.add("вина");
    // NOTE(review): "винo" below ends in a LATIN 'o', not Cyrillic 'о' —
    // looks like a typo, but kept unchanged to preserve behavior; confirm.
    foromsOfWine.add("винo");
    boolean wordSeen = false;
    // Attribute instances are stable for the stream's lifetime: fetch once.
    CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
    while (tokenStream.incrementToken()) {
        if (foromsOfWine.contains(charTerm.toString()) && wordSeen) {
            // A second form of the same word must stack at the same
            // position, i.e. have increment 0.
            assertThat(position.getPositionIncrement(), equalTo(0));
        }
        if (foromsOfWine.contains(charTerm.toString())) {
            wordSeen = true;
        }
    }
}
 
Example #8
Source File: QueryBuilder.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** 
 * Creates simple phrase query from the cached tokenstream contents 
 */
protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException {
  PhraseQuery.Builder phraseBuilder = new PhraseQuery.Builder();
  phraseBuilder.setSlop(slop);

  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  // Start at -1 so the first token's increment lands it on position 0.
  int position = -1;
  float phraseBoost = DEFAULT_BOOST;
  stream.reset();
  while (stream.incrementToken()) {
    // Either honour the stream's increments or fall back to a simple +1.
    position += enablePositionIncrements ? posIncrAtt.getPositionIncrement() : 1;
    phraseBuilder.add(new Term(field, termAtt.getBytesRef()), position);
    phraseBoost *= boostAtt.getBoost();
  }
  PhraseQuery phrase = phraseBuilder.build();
  // Only wrap in a BoostQuery when some token actually changed the boost.
  return phraseBoost == DEFAULT_BOOST ? phrase : new BoostQuery(phrase, phraseBoost);
}
 
Example #9
Source File: HanLpQueryAnalyzerTestCase.java    From jstarcraft-nlp with Apache License 2.0 6 votes vote down vote up
@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    // Attribute instances are stable for the stream's lifetime: fetch once
    // instead of on every loop iteration.
    CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
    // offsets
    OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
    // position increment
    PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
    // token type (part of speech)
    TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
    while (tokenizer.incrementToken()) {
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example #10
Source File: TestAnsj.java    From ansj4solr with Apache License 2.0 6 votes vote down vote up
public static void main(String[] args) throws IOException {
	// First show the plain ansj segmentation result, then run the same text
	// through the Lucene tokenizer wrapper.
	List<Term> parse = ToAnalysis.parse("天天向上,媒体打打。《回家真好》");
	System.out.println(parse);
	Tokenizer tokenizer = new AnsjTokenizer(new StringReader("天天向上,媒体打打。《回家真好》"), 0, true);
	CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
	PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);

	// NOTE(review): no reset() before incrementToken(); older Lucene versions
	// tolerated this but current ones throw — confirm the Lucene version used.
	while (tokenizer.incrementToken()) {
		// termAtt.toString() already yields a fresh String; wrapping it in
		// `new String(...)` was a redundant copy.
		System.out.print(termAtt.toString());
		System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
		System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
	}
	tokenizer.close();
}
 
Example #11
Source File: TestSnowball.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testFilterTokens() throws Exception {
  // Run one token from the TestTokenStream fixture through the English
  // Snowball stemmer and check the resulting attribute values.
  SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
  TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
  FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
  
  filter.incrementToken();

  // Term text is the stemmed form; the remaining attributes (offsets, type,
  // increment, flags, payload) are expected to pass through the filter
  // unchanged from the fixture — confirm against TestTokenStream.
  assertEquals("accent", termAtt.toString());
  assertEquals(2, offsetAtt.startOffset());
  assertEquals(7, offsetAtt.endOffset());
  assertEquals("wrd", typeAtt.type());
  assertEquals(3, posIncAtt.getPositionIncrement());
  assertEquals(77, flagsAtt.getFlags());
  assertEquals(new BytesRef(new byte[]{0,1,2,3}), payloadAtt.getPayload());
}
 
Example #12
Source File: ShingleAnalyzerWrapperTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
  // Build a PhraseQuery whose term positions come from the shingle
  // analyzer's position increments, then verify the expected hit ranking.
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) {
    // Start below zero so the first increment lands on position 0.
    int position = -1;

    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      builder.add(new Term("content", termAtt.toString()), position);
    }
    ts.end();
  }

  PhraseQuery q = builder.build();
  ScoreDoc[] hits = searcher.search(q, 1000).scoreDocs;
  compareRanks(hits, new int[] { 0 });
}
 
Example #13
Source File: TestStopAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testStopListPositions() throws IOException {
  // Analyze a sentence with a custom stop set and verify that each surviving
  // token carries the right position increment (gaps > 1 mark removed words).
  CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
  StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
  String s =             "This is a good test of the english stop analyzer with positions";
  int expectedIncr[] =  { 1,   1, 1,          3, 1,  1,      1,            2,   1};
  try (TokenStream stream = newStop.tokenStream("test", s)) {
    assertNotNull(stream);
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);

    stream.reset();
    int tokenIndex = 0;
    while (stream.incrementToken()) {
      String text = termAtt.toString();
      // No stopword may survive filtering.
      assertFalse(stopWordsSet.contains(text));
      assertEquals(expectedIncr[tokenIndex], posIncrAtt.getPositionIncrement());
      tokenIndex++;
    }
    stream.end();
  }
  newStop.close();
}
 
Example #14
Source File: TestRemoveDuplicatesTokenFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds the given tokens through a RemoveDuplicatesTokenFilter and asserts
 * that the surviving terms equal the whitespace-separated {@code expected}
 * string.
 */
public void testDups(final String expected, final Token... tokens)
  throws Exception {

  final Iterator<Token> toks = Arrays.asList(tokens).iterator();
  // Anonymous TokenStream replaying the supplied tokens one by one, copying
  // term text, offsets and position increment into its attributes.
  final TokenStream ts = new RemoveDuplicatesTokenFilter(
    (new TokenStream() {
        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
        @Override
        public boolean incrementToken() {
          if (toks.hasNext()) {
            // clearAttributes() first so stale state from the previous
            // token never leaks into this one.
            clearAttributes();
            Token tok = toks.next();
            termAtt.setEmpty().append(tok);
            offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
            posIncAtt.setPositionIncrement(tok.getPositionIncrement());
            return true;
          } else {
            return false;
          }
        }
      }));
  
  assertTokenStreamContents(ts, expected.split("\\s"));   
}
 
Example #15
Source File: NlpSegmenterTestCase.java    From jstarcraft-nlp with Apache License 2.0 6 votes vote down vote up
@Test
public void testSegmenter() throws Exception {
    Tokenizer segmenter = getSegmenter();
    String text = "中华人民共和国(People's Republic of China),简称'中国'";
    segmenter.setReader(new StringReader(text));
    segmenter.reset();
    // Attribute instances are stable for the stream's lifetime: fetch once.
    // (The previously fetched PositionIncrementAttribute and TypeAttribute
    // were never read, so they are dropped.)
    CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
    while (segmenter.incrementToken()) {
        LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
        // Each emitted term must match the source text at its reported
        // offsets (compared case-insensitively).
        Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
    }
}
 
Example #16
Source File: Tagger.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a Tagger over the given terms and token stream. Registers the
 * attributes the tagger reads and calls {@code tokenStream.reset()} here,
 * so callers must not reset the stream again.
 */
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
              TagClusterReducer tagClusterReducer, boolean skipAltTokens,
              boolean ignoreStopWords) throws IOException {
  this.terms = terms;
  this.liveDocs = liveDocs;
  this.tokenStream = tokenStream;
  this.skipAltTokens = skipAltTokens;
  this.ignoreStopWords = ignoreStopWords;
  // Register the attributes consumed while iterating the stream.
  byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
  posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
  taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
  tokenStream.reset();

  this.tagClusterReducer = tagClusterReducer;
}
 
Example #17
Source File: TransportExtendedAnalyzeAction.java    From elasticsearch-extended-analyze with Apache License 2.0 6 votes vote down vote up
/**
 * Consumes the given TokenStream and converts every token into an
 * ExtendedAnalyzeToken, offsetting positions and offsets by the supplied
 * bases (presumably carried over from previously analyzed streams — confirm
 * at the caller).
 *
 * @param stream the token stream to consume; reset and ended here
 * @param includeAttributes attribute names to include in the extended output
 * @param shortAttrName whether short attribute class names are emitted
 * @param lastPosition position base added to each token's position
 * @param lastOffset offset base added to each token's start/end offsets
 * @return the extracted tokens, in stream order
 * @throws IOException if consuming the stream fails
 */
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream, Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset) throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();

    //and each tokens output
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        // Only advance on positive increments; zero-increment tokens stack
        // at the same position as the previous token.
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }

        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
            lastOffset +offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;

}
 
Example #18
Source File: TestStopFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that {@code stopfilter} emits exactly the tokens for indices
 * 0..numberOfTokens-1 (as English number words), skipping the indices listed
 * in {@code stopwordPositions}, and is then exhausted.
 */
private void doTestStopwordsPositions(StopFilter stopfilter, List<Integer> stopwordPositions, final int numberOfTokens) throws IOException {
  CharTermAttribute termAtt = stopfilter.getAttribute(CharTermAttribute.class);
  // NOTE(review): posIncrAtt is fetched but never read in this method.
  PositionIncrementAttribute posIncrAtt = stopfilter.getAttribute(PositionIncrementAttribute.class);
  stopfilter.reset();
  log("Test stopwords positions:");
  for (int i=0; i<numberOfTokens; i++) {
    if (stopwordPositions.contains(i)){
      // if i is in stopwordPosition it is a stopword and we skip this position
      continue;
    }
    assertTrue(stopfilter.incrementToken());
    log(String.format(Locale.ROOT, "token %d: %s", i, termAtt.toString()));
    String token = English.intToEnglish(i).trim();
    assertEquals(String.format(Locale.ROOT, "expecting token %d to be %s", i, token), token, termAtt.toString());
  }
  // All expected tokens consumed: the stream must now be exhausted.
  assertFalse(stopfilter.incrementToken());
  stopfilter.end();
  stopfilter.close();
}
 
Example #19
Source File: PinyinFilterTest.java    From elasticsearch-analysis-lc-pinyin with Artistic License 2.0 6 votes vote down vote up
public void testFirstLetterFilter() throws IOException {
    // Run the sample text through the first-letter pinyin filter and dump
    // each token as "term:start,end:positionIncrement".
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream source = analyzer.tokenStream("lc", "作者 : 陈楠");

    LcPinyinTokenFilter filter = new LcPinyinTokenFilter(source, PinyinFilterSetting.first_letter);

    CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offset = filter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute increment = filter.getAttribute(PositionIncrementAttribute.class);

    filter.reset();
    while (filter.incrementToken()) {
        System.out.println(term.toString() + ":" + offset.startOffset() + "," + offset.endOffset() + ":" + increment.getPositionIncrement());
    }
    filter.close();
}
 
Example #20
Source File: PinyinFilterTest.java    From elasticsearch-analysis-lc-pinyin with Artistic License 2.0 6 votes vote down vote up
public void testFullPinyinFilter() throws IOException {
    // Run the sample text through the full-pinyin filter and dump each
    // token as "term:start,end:positionIncrement".
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream source = analyzer.tokenStream("lc", "作者 : 陈楠");

    LcPinyinTokenFilter filter = new LcPinyinTokenFilter(source, PinyinFilterSetting.full_pinyin);

    CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offset = filter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute increment = filter.getAttribute(PositionIncrementAttribute.class);

    filter.reset();
    while (filter.incrementToken()) {
        System.out.println(term.toString() + ":" + offset.startOffset() + "," + offset.endOffset() + ":" + increment.getPositionIncrement());
    }
    filter.close();
}
 
Example #21
Source File: PinyinAnalysisTest.java    From elasticsearch-analysis-lc-pinyin with Artistic License 2.0 6 votes vote down vote up
@Test
public void testSearch() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "重qing");

    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    tokenStream.reset();
    // First token: the Han character itself.
    // JUnit convention is assertEquals(expected, actual); the original had
    // the arguments reversed, which makes failure messages report the
    // values backwards.
    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals("重", charTermAttribute.toString());
    Assert.assertEquals(0, offsetAttribute.startOffset());
    Assert.assertEquals(1, offsetAttribute.endOffset());
    Assert.assertEquals(1, positionIncrementAttribute.getPositionIncrement());

    // Second token: the trailing pinyin run.
    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals("qing", charTermAttribute.toString());
    Assert.assertEquals(1, offsetAttribute.startOffset());
    Assert.assertEquals(5, offsetAttribute.endOffset());
    Assert.assertEquals(1, positionIncrementAttribute.getPositionIncrement());

    tokenStream.close();
}
 
Example #22
Source File: AutoPhrasingTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Emits the given characters as the current token: writes the term text,
 * rewrites the offsets to span the token's length ending at the current end
 * offset, bumps the position increment, and records the token in
 * {@code lastEmitted}.
 */
private void emit(char[] tokenChars) {
    char[] token = tokenChars;
    if (replaceWhitespaceWith != null) {
        token = replaceWhiteSpace(token);
    }
    CharTermAttribute termAttr = getTermAttribute();
    if (termAttr != null) {
        termAttr.setEmpty();
        termAttr.append(new StringBuilder().append(token));
    }
    OffsetAttribute offAttr = getOffsetAttribute();
    if (offAttr != null && offAttr.endOffset() >= token.length) {
        // Keep the current end offset; back the start up by the token length.
        int start = offAttr.endOffset() - token.length;
        offAttr.setOffset(start, offAttr.endOffset());
    }
    PositionIncrementAttribute pia = getPositionIncrementAttribute();
    if (pia != null) {
        // NOTE(review): this pre-increments the running positionIncr counter
        // and uses it as THIS token's increment, so each successive emit
        // advances further — confirm this cumulative behavior is intended.
        pia.setPositionIncrement(++positionIncr);
    }
    lastEmitted = token;
}
 
Example #23
Source File: SimpleSynonymMap.java    From elasticsearch-dynamic-synonym with Apache License 2.0 6 votes vote down vote up
/**
 * Tokenizes {@code text} with the configured analyzer and returns the set
 * of distinct terms produced.
 *
 * @param text the raw text to analyze
 * @return the distinct analyzed terms
 * @throws IOException if the analyzer fails
 * @throws IllegalArgumentException if any token is zero-length or has a
 *         position increment other than 1 (multi-position output is rejected)
 */
private Set<String> analyze(String text) throws IOException {
    Set<String> result = new HashSet<String>();
    Analyzer analyzer = configuration.getAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            int length = termAtt.length();
            if (length == 0) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            if (posIncAtt.getPositionIncrement() != 1) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
            }

            result.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }

        ts.end();
        return result;
    }
}
 
Example #24
Source File: PathTokenFilterTest.java    From SearchServices with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Verifies that after a PathTokenFilter is fully consumed (end() called),
 * its attributes are left in the documented end-of-stream state: empty term,
 * default type, zero position increment, and final offsets at input length.
 */
public void testAttributesAfterStreamEnd() throws IOException
{
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    ts.setReader(reader);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    
    // PathTokenFilter.end() will be called after all tokens consumed.
    tokenise(ts, new String[]{"uri1", "one"});
    
    // Check attributes cleaned up
    assertEquals("", termAtt.toString());
    assertEquals("word", typeAtt.type()); // the default
    assertEquals(0, posIncAtt.getPositionIncrement());
    // Final offset...
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}
 
Example #25
Source File: TestDuelingAnalyzers.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Asserts that two token streams over the same input {@code s} produce
 * identical output: same term text, position increments, offsets, token
 * count, and final offset. Both streams are reset, fully consumed, ended
 * and closed here.
 */
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
  left.reset();
  right.reset();
  CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
  CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
  OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
  OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
  PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
  
  while (left.incrementToken()) {
    assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
    assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
    assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
    assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
    assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  } // stray ';' that followed this block removed
  // left is exhausted, so right must be too.
  assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
  left.end();
  right.end();
  // end() sets the final offset; both streams must agree on it.
  assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  left.close();
  right.close();
}
 
Example #26
Source File: ConcatenatingTokenStream.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Create a new ConcatenatingTokenStream from a set of inputs
 * @param sources an array of TokenStream inputs to concatenate
 */
public ConcatenatingTokenStream(TokenStream... sources) {
  super(combineSources(sources));
  this.sources = sources;
  // Attributes registered on this (combined) stream.
  this.offsetAtt = addAttribute(OffsetAttribute.class);
  this.posIncAtt = addAttribute(PositionIncrementAttribute.class);
  // Per-source attribute views, so offsets and increments can be read from
  // whichever source is currently being consumed.
  this.sourceOffsets = new OffsetAttribute[sources.length];
  this.sourceIncrements = new PositionIncrementAttribute[sources.length];
  for (int i = 0; i < sources.length; i++) {
    this.sourceOffsets[i] = sources[i].addAttribute(OffsetAttribute.class);
    this.sourceIncrements[i] = sources[i].addAttribute(PositionIncrementAttribute.class);
  }
}
 
Example #27
Source File: HanLPTokenizerTest.java    From hanlp-lucene-plugin with Apache License 2.0 5 votes vote down vote up
public void testIncrementToken() throws Exception
{
    // Attribute instances are stable for the stream's lifetime: fetch them
    // once up front rather than on every loop iteration.
    CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
    // offsets
    OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
    // position increment
    PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
    // token type (part of speech)
    TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
    while (tokenizer.incrementToken())
    {
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example #28
Source File: PayloadTokenizer.java    From clue with Apache License 2.0 5 votes vote down vote up
/**
 * Tokenizer fixture that splits the input on commas after lower-casing it,
 * and registers term, payload, position-increment and offset attributes.
 *
 * @param text the comma-separated input to tokenize
 * @throws IOException if setting the reader fails
 */
public PayloadTokenizer(String text)
        throws IOException {
  setReader(new StringReader(text));
  // Locale.ROOT keeps lower-casing deterministic regardless of the JVM's
  // default locale (e.g. Turkish dotless-i would otherwise change tokens).
  this.tokens = text.toLowerCase(java.util.Locale.ROOT).split(",");
  
  termAttr = addAttribute(CharTermAttribute.class);
  termAttr.resizeBuffer(text.length()); // maximum size necessary is the size of the input
  payloadAttr = addAttribute(PayloadAttribute.class);
  payload = new BytesRef(new byte[4]);
  positionAttr = addAttribute(PositionIncrementAttribute.class);
  offsetAttr = addAttribute(OffsetAttribute.class);
}
 
Example #29
Source File: TestIndexWriter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testNegativePositions() throws Throwable {
  // Hand-rolled stream emitting "a","b","c" whose FIRST token has position
  // increment 0 — an illegal starting position that IndexWriter must reject.
  final TokenStream tokens = new TokenStream() {
    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    final Iterator<String> terms = Arrays.asList("a","b","c").iterator();
    // Tracks whether the next token is the first one (gets increment 0).
    boolean first = true;

    @Override
    public boolean incrementToken() {
      if (!terms.hasNext()) return false;
      clearAttributes();
      termAtt.append(terms.next());
      // First token: increment 0; all later tokens: the normal 1.
      posIncrAtt.setPositionIncrement(first ? 0 : 1);
      first = false;
      return true;
    }
  };

  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  doc.add(new TextField("field", tokens));
  // Indexing the document must fail on the illegal position.
  expectThrows(IllegalArgumentException.class, () -> {
    w.addDocument(doc);
  });

  w.close();
  dir.close();
}
 
Example #30
Source File: TestPayloadSpanUtil.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Token filter fixture: seeds the entity and no-payload word sets and
 * registers the term, position-increment and payload attributes it uses.
 */
public PayloadFilter(TokenStream input) {
  super(input);
  pos = 0;
  entities.add("xx");
  entities.add("one");
  nopayload.add("nopayload");
  nopayload.add("np");
  termAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  payloadAtt = addAttribute(PayloadAttribute.class);
}