Java Code Examples for org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute

The following are top-voted examples showing how to use org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute. These examples are extracted from open-source projects. You can vote up the examples you find useful; your votes help us surface the best examples.
Example 1
Project: improved-journey   File: TestAnsj.java   View source code 7 votes vote down vote up
/**
 * Demo entry point: runs two Ansj segmenters over sample Chinese text, then
 * tokenizes an alphanumeric string with {@link AnsjTokenizer} and prints each
 * term separated by a space.
 *
 * @throws IOException if the tokenizer fails while producing tokens
 */
public static void main(String[] args) throws IOException {
	// Standard segmentation of a Chinese sentence; print the resulting terms.
	List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
	System.out.println(parse);
	// Index-oriented segmentation; result intentionally unused in this demo.
	List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");

	String text11 = "ZW321282050000000325";

	Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
	CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
	PositionIncrementAttribute positionIncrementAtt =
			tokenizer.addAttribute(PositionIncrementAttribute.class);

	// TokenStream contract: reset() before the first incrementToken().
	tokenizer.reset();
	while (tokenizer.incrementToken()) {
		// termAtt.toString() is already a String; no extra copy needed.
		System.out.print(termAtt.toString() + " ");
	}
	tokenizer.end(); // finalize offsets per the TokenStream contract
	tokenizer.close();
}
 
Example 2
Project: lucenelab   File: SynonymFilterExample.java   View source code 7 votes vote down vote up
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    // Whitespace-tokenize the raw input.
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("dark sea green sea green"));

    // Register overlapping multi-word synonyms, all mapping to "color".
    final SynonymMap.Builder synBuilder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", synBuilder);
    addSynonym("green", "color", synBuilder);
    addSynonym("dark sea", "color", synBuilder);
    addSynonym("sea green", "color", synBuilder);
    final SynonymMap synonyms = synBuilder.build();
    final TokenStream stream = new SynonymFilter(tokenizer, synonyms, true);

    final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);

    stream.reset();
    // Track the absolute position by accumulating increments (starts at -1 so
    // the first token with increment 1 lands on position 0).
    int position = -1;
    while (stream.incrementToken()) {
        position += posIncr.getPositionIncrement();
        System.out.println("term=" + term + ", pos=" + position + ", posLen=" + posLen.getPositionLength());
    }
    stream.end();
    stream.close();
}
 
Example 3
Project: improved-journey   File: TestAnsj.java   View source code 6 votes vote down vote up
/**
 * Demo entry point: runs two Ansj segmenters over sample Chinese text, then
 * tokenizes an alphanumeric string with {@link AnsjTokenizer} and prints each
 * term separated by a space.
 *
 * @throws IOException if the tokenizer fails while producing tokens
 */
public static void main(String[] args) throws IOException {
	// Standard segmentation of a Chinese sentence; print the resulting terms.
	List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
	System.out.println(parse);
	// Index-oriented segmentation; result intentionally unused in this demo.
	List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");

	String text11 = "ZW321282050000000325";

	Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
	CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
	PositionIncrementAttribute positionIncrementAtt =
			tokenizer.addAttribute(PositionIncrementAttribute.class);

	// TokenStream contract: reset() before the first incrementToken().
	tokenizer.reset();
	while (tokenizer.incrementToken()) {
		// termAtt.toString() is already a String; no extra copy needed.
		System.out.print(termAtt.toString() + " ");
	}
	tokenizer.end(); // finalize offsets per the TokenStream contract
	tokenizer.close();
}
 
Example 4
Project: elasticsearch-analysis-voikko   File: VoikkoTokenFilterTests.java   View source code 6 votes vote down vote up
/**
 * Analyzes {@code text} with the index analyzer registered under "test" and
 * returns one TokenData per emitted token (surface form, term text, position
 * increment).
 */
private List<TokenData> parse(String text) {
    NamedAnalyzer analyzer = getAnalysisService().indexAnalyzers.get("test");

    // try-with-resources closes the stream; IOException is rewrapped unchecked.
    try (TokenStream stream = analyzer.tokenStream("test", new StringReader(text))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsets = stream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

        List<TokenData> tokens = new ArrayList<>();
        stream.reset();
        while (stream.incrementToken()) {
            // Recover the original surface form from the token's offsets.
            String surface = text.substring(offsets.startOffset(), offsets.endOffset());
            tokens.add(token(surface, term.toString(), posIncr.getPositionIncrement()));
        }
        stream.end();
        return tokens;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
 
Example 5
Project: lams   File: PrefixAwareTokenFilter.java   View source code 6 votes vote down vote up
/**
 * Builds a filter that emits all tokens of {@code prefix} followed by all
 * tokens of {@code suffix}. The suffix stream is the delegate passed to the
 * superclass; the prefix stream is consumed through its own attribute set.
 *
 * @param prefix stream whose tokens are emitted first
 * @param suffix stream whose tokens follow the prefix tokens
 */
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
  super(suffix);
  this.suffix = suffix;
  this.prefix = prefix;
  prefixExhausted = false;
  
  // Attributes of this filter (backed by the suffix stream's attribute source).
  termAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  payloadAtt = addAttribute(PayloadAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  flagsAtt = addAttribute(FlagsAttribute.class);

  // Parallel attribute handles on the prefix stream, copied from while it
  // is being drained.
  p_termAtt = prefix.addAttribute(CharTermAttribute.class);
  p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
  p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
  p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
  p_typeAtt = prefix.addAttribute(TypeAttribute.class);
  p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
}
 
Example 6
Project: elasticsearch-dynamic-synonym   File: SimpleSynonymMap.java   View source code 6 votes vote down vote up
/**
 * Analyzes {@code text} with the configured analyzer and returns the distinct
 * term strings. Rejects zero-length tokens and tokens whose position
 * increment is not 1 (multi-position output is unsupported here).
 *
 * @throws IOException if the token stream fails
 * @throws IllegalArgumentException on a zero-length token or posinc != 1
 */
private Set<String> analyze(String text) throws IOException {
    Set<String> terms = new HashSet<String>();
    Analyzer analyzer = configuration.getAnalyzer();
    try (TokenStream stream = analyzer.tokenStream("", text)) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (term.length() == 0) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            if (posInc.getPositionIncrement() != 1) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
            }
            // Copy out of the reusable term buffer before the next increment.
            terms.add(new String(term.buffer(), 0, term.length()));
        }
        stream.end();
        return terms;
    }
}
 
Example 7
Project: elasticsearch-analysis-ltp   File: LTPTokenizer.java   View source code 6 votes vote down vote up
/**
 * Lucene constructor: registers the token attributes this tokenizer fills in
 * and creates the LTP word segmenter over the tokenizer's {@code input}
 * reader.
 *
 * @param filter set of words to be filtered out of the token stream
 * @throws UnirestException if the LTP HTTP client fails during setup
 * @throws JSONException if the LTP response cannot be parsed
 * @throws IOException on I/O failure while initializing the segmenter
 */
public LTPTokenizer(Set<String> filter)
        throws IOException, JSONException, UnirestException {
    super();
    logger.info("LTPTokenizer Initialize......");
    // Add token offset attribute
    offsetAttr = addAttribute(OffsetAttribute.class);
    // Add token content attribute
    charTermAttr = addAttribute(CharTermAttribute.class);
    // Add token type attribute
    typeAttr = addAttribute(TypeAttribute.class);
    // Add token position attribute
    piAttr = addAttribute(PositionIncrementAttribute.class);
    // Create a new word segmenter to get tokens
    LTPSeg = new LTPWordSegmenter(input);
    // Add filter words set
    this.filter = filter;
}
 
Example 8
Project: elasticsearch-analysis-lc-pinyin   File: PinyinAnalysisTest.java   View source code 6 votes vote down vote up
/**
 * Search-mode analysis of mixed input "重qing" must yield exactly two tokens:
 * the Chinese character and the trailing pinyin syllable.
 */
@Test
public void testSearch() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream stream = analyzer.tokenStream("lc", "重qing");

    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncr = stream.getAttribute(PositionIncrementAttribute.class);

    stream.reset();

    // First token: "重" covering offsets [0,1), position increment 1.
    Assert.assertTrue(stream.incrementToken());
    Assert.assertEquals(term.toString(), "重");
    Assert.assertEquals(offset.startOffset(), 0);
    Assert.assertEquals(offset.endOffset(), 1);
    Assert.assertEquals(posIncr.getPositionIncrement(), 1);

    // Second token: "qing" covering offsets [1,5), position increment 1.
    Assert.assertTrue(stream.incrementToken());
    Assert.assertEquals(term.toString(), "qing");
    Assert.assertEquals(offset.startOffset(), 1);
    Assert.assertEquals(offset.endOffset(), 5);
    Assert.assertEquals(posIncr.getPositionIncrement(), 1);

    stream.close();
}
 
Example 9
Project: elasticsearch-analysis-lc-pinyin   File: PinyinFilterTest.java   View source code 6 votes vote down vote up
/**
 * Drives the full-pinyin filter over "作者 : 陈楠" and prints each token as
 * term:startOffset,endOffset:positionIncrement.
 */
public void testFullPinyinFilter() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");

    LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.full_pinyin);

    CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);

    lcPinyinTokenFilter.reset();
    while (lcPinyinTokenFilter.incrementToken()) {
        System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
    }
    lcPinyinTokenFilter.end(); // TokenStream contract: end() before close()
    lcPinyinTokenFilter.close();
}
 
Example 10
Project: elasticsearch-analysis-lc-pinyin   File: PinyinFilterTest.java   View source code 6 votes vote down vote up
/**
 * Drives the first-letter pinyin filter over "作者 : 陈楠" and prints each token
 * as term:startOffset,endOffset:positionIncrement.
 */
public void testFirstLetterFilter() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");

    LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.first_letter);

    CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);

    lcPinyinTokenFilter.reset();
    while (lcPinyinTokenFilter.incrementToken()) {
        System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
    }
    lcPinyinTokenFilter.end(); // TokenStream contract: end() before close()
    lcPinyinTokenFilter.close();
}
 
Example 11
Project: fastcatsearch3   File: Token.java   View source code 6 votes vote down vote up
/**
 * Copies this token's state into {@code target}. A Token target takes the
 * fast path via reinit; any other AttributeImpl gets each attribute set
 * individually.
 */
@Override
public void copyTo(AttributeImpl target) {
  if (target instanceof Token) {
    final Token to = (Token) target;
    to.reinit(this);
    // reinit shares the payload, so clone it:
    if (payload !=null) {
      to.payload = payload.clone();
    }
  } else {
    // Generic path: copy the term via super, then mirror every remaining
    // attribute onto the target's interfaces one by one.
    super.copyTo(target);
    ((OffsetAttribute) target).setOffset(startOffset, endOffset);
    ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
    // Clone the payload here too so target never shares our mutable bytes.
    ((PayloadAttribute) target).setPayload((payload == null) ? null : payload.clone());
    ((FlagsAttribute) target).setFlags(flags);
    ((TypeAttribute) target).setType(type);
  }
}
 
Example 12
Project: solrplugins   File: JsonReferencePayloadTokenizerTest.java   View source code 6 votes vote down vote up
/**
 * Filing-before-prefix shorthand: the filing token is emitted first at
 * increment 1, then the prefix token stacked at the same position
 * (increment 0); neither carries a payload.
 */
@Test
public void testShorthand2() throws IOException {
  JsonReferencePayloadTokenizer tokenizer = new JsonReferencePayloadTokenizer();
  tokenizer.setReader(new StringReader("{\"filing\": \"something\", \"prefix\": \"The \"}"));
  tokenizer.reset();

  // Attribute instances are stable for the stream's lifetime; fetch them once.
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  TypeAttribute type = tokenizer.getAttribute(TypeAttribute.class);
  PositionIncrementAttribute posIncr = tokenizer.getAttribute(PositionIncrementAttribute.class);
  PayloadAttribute payload = tokenizer.getAttribute(PayloadAttribute.class);

  assertTrue(tokenizer.incrementToken());
  assertEquals("something", term.toString());
  assertEquals(JsonReferencePayloadTokenizer.TYPE_FILING, type.type());
  assertEquals(1, posIncr.getPositionIncrement());
  assertNull(payload.getPayload());

  assertTrue(tokenizer.incrementToken());
  assertEquals("The ", term.toString());
  assertEquals(JsonReferencePayloadTokenizer.TYPE_PREFIX, type.type());
  assertEquals(0, posIncr.getPositionIncrement());
  assertNull(payload.getPayload());

  assertFalse(tokenizer.incrementToken());
}
 
Example 13
Project: solrplugins   File: JsonReferencePayloadTokenizerTest.java   View source code 6 votes vote down vote up
/**
 * Prefix-before-filing shorthand: regardless of key order in the JSON, the
 * filing token is still emitted first (increment 1) with the prefix token
 * stacked at the same position (increment 0); neither carries a payload.
 */
@Test
public void testShorthand3() throws IOException {
  JsonReferencePayloadTokenizer tokenizer = new JsonReferencePayloadTokenizer();
  tokenizer.setReader(new StringReader("{\"prefix\": \"The \", \"filing\": \"something\"}"));
  tokenizer.reset();

  // Attribute instances are stable for the stream's lifetime; fetch them once.
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  TypeAttribute type = tokenizer.getAttribute(TypeAttribute.class);
  PositionIncrementAttribute posIncr = tokenizer.getAttribute(PositionIncrementAttribute.class);
  PayloadAttribute payload = tokenizer.getAttribute(PayloadAttribute.class);

  assertTrue(tokenizer.incrementToken());
  assertEquals("something", term.toString());
  assertEquals(JsonReferencePayloadTokenizer.TYPE_FILING, type.type());
  assertEquals(1, posIncr.getPositionIncrement());
  assertNull(payload.getPayload());

  assertTrue(tokenizer.incrementToken());
  assertEquals("The ", term.toString());
  assertEquals(JsonReferencePayloadTokenizer.TYPE_PREFIX, type.type());
  assertEquals(0, posIncr.getPositionIncrement());
  assertNull(payload.getPayload());

  assertFalse(tokenizer.incrementToken());
}
 
Example 14
Project: Alix   File: Demo.java   View source code 6 votes vote down vote up
/**
 * Runs {@code analyzer} over {@code text} for the given field and collects
 * every emitted token (term, position increment, type, offsets) into an
 * array of MyToken.
 *
 * @throws IOException if the token stream fails
 */
public static MyToken[] tokensFromAnalysis(Analyzer analyzer, String text, String field) throws IOException
{
  TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute positionIncrementAttr = stream.addAttribute(PositionIncrementAttribute.class);
  TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);
  OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);

  ArrayList<MyToken> tokenList = new ArrayList<MyToken>();
  stream.reset(); // TokenStream contract: reset() before incrementToken()
  while (stream.incrementToken()) {
    tokenList.add(new MyToken(term.toString(), positionIncrementAttr.getPositionIncrement(), typeAttr.type(),
        offsetAttr.startOffset(), offsetAttr.endOffset()));
  }
  stream.end();   // finalize offsets
  stream.close(); // release analyzer resources (was leaked before)

  return tokenList.toArray(new MyToken[0]);
}
 
Example 15
Project: hanlp-lucene-plugin   File: HanLPAnalyzerTest.java   View source code 6 votes vote down vote up
/**
 * Prints each input character with its index, then dumps the HanLP analysis
 * of the sentence: [startOffset:endOffset positionIncrement] term/type.
 */
public void testCreateComponents() throws Exception
{
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i)
    {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    // Attribute instances are per-stream singletons: fetch once, not per token.
    CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
    // 偏移量 (offsets)
    OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
    // 距离 (position increment)
    PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
    // 词性 (part-of-speech type)
    TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
    while (tokenStream.incrementToken())
    {
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
    tokenStream.end();
    tokenStream.close(); // was leaked before
}
 
Example 16
Project: hanlp-lucene-plugin   File: HanLPAnalyzerTest.java   View source code 6 votes vote down vote up
/**
 * Regression check for traditional-Chinese mode with normalization enabled:
 * builds a tokenizer through the factory and dumps each token as
 * [startOffset:endOffset positionIncrement] term/type.
 */
public void testIssue() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    // Attribute instances are per-stream singletons: fetch once, not per token.
    CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
    // 偏移量 (offsets)
    OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
    // 距离 (position increment)
    PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
    // 词性 (part-of-speech type)
    TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
    while (tokenizer.incrementToken())
    {
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
    tokenizer.end();
    tokenizer.close(); // was leaked before
}
 
Example 17
Project: hanlp-lucene-plugin   File: HanLPIndexAnalyzerTest.java   View source code 6 votes vote down vote up
/**
 * Prints each input character with its index, then dumps the HanLP index
 * analysis of the sentence: [startOffset:endOffset positionIncrement]
 * term/type.
 */
public void testCreateComponents() throws Exception
{
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i)
    {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPIndexAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    // Attribute instances are per-stream singletons: fetch once, not per token.
    CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
    // 偏移量 (offsets)
    OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
    // 距离 (position increment)
    PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
    // 词性 (part-of-speech type)
    TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
    while (tokenStream.incrementToken())
    {
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
    tokenStream.end();
    tokenStream.close(); // was leaked before
}
 
Example 18
Project: search   File: PrefixAwareTokenFilter.java   View source code 6 votes vote down vote up
/**
 * Builds a filter that emits all tokens of {@code prefix} followed by all
 * tokens of {@code suffix}. The suffix stream is the delegate passed to the
 * superclass; the prefix stream is consumed through its own attribute set.
 *
 * @param prefix stream whose tokens are emitted first
 * @param suffix stream whose tokens follow the prefix tokens
 */
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
  super(suffix);
  this.suffix = suffix;
  this.prefix = prefix;
  prefixExhausted = false;
  
  // Attributes of this filter (backed by the suffix stream's attribute source).
  termAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  payloadAtt = addAttribute(PayloadAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  flagsAtt = addAttribute(FlagsAttribute.class);

  // Parallel attribute handles on the prefix stream, copied from while it
  // is being drained.
  p_termAtt = prefix.addAttribute(CharTermAttribute.class);
  p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
  p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
  p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
  p_typeAtt = prefix.addAttribute(TypeAttribute.class);
  p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
}
 
Example 19
Project: search   File: TestRemoveDuplicatesTokenFilter.java   View source code 6 votes vote down vote up
/**
 * Feeds the given tokens through a RemoveDuplicatesTokenFilter and asserts
 * that the surviving terms equal {@code expected} split on whitespace.
 *
 * @param expected whitespace-separated list of terms that should remain
 * @param tokens   input tokens to deduplicate
 */
public void testDups(final String expected, final Token... tokens)
  throws Exception {

  final Iterator<Token> toks = Arrays.asList(tokens).iterator();
  // Anonymous TokenStream that replays the fixed token list, copying term,
  // offsets and position increment from each source Token.
  final TokenStream ts = new RemoveDuplicatesTokenFilter(
    (new TokenStream() {
        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
        @Override
        public boolean incrementToken() {
          if (toks.hasNext()) {
            // Per-token protocol: clear previous state before populating.
            clearAttributes();
            Token tok = toks.next();
            termAtt.setEmpty().append(tok);
            offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
            posIncAtt.setPositionIncrement(tok.getPositionIncrement());
            return true;
          } else {
            return false;
          }
        }
      }));
  
  assertTokenStreamContents(ts, expected.split("\\s"));   
}
 
Example 20
Project: search   File: TestDuelingAnalyzers.java   View source code 6 votes vote down vote up
/**
 * Asserts that two token streams produce identical token sequences for the
 * same input: same terms, position increments, offsets, token count, and
 * final offset after end().
 *
 * @param s     the original input string, used only in failure messages
 * @param left  reference stream
 * @param right stream under comparison
 */
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
  left.reset();
  right.reset();
  CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
  CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
  OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
  OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
  PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
  
  while (left.incrementToken()) {
    assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
    assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
    assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
    assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
    assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  } // (stray empty statement after this brace removed)
  // Right stream must be exhausted too, else it produced extra tokens.
  assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
  left.end();
  right.end();
  assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  left.close();
  right.close();
}
 
Example 21
Project: search   File: TestStopFilter.java   View source code 6 votes vote down vote up
/**
 * Verifies stop-filter position handling. The upstream fixture emits every
 * third English number word; with increments enabled each surviving token
 * (except the first) must carry a position increment of 3, otherwise all
 * increments collapse to 1.
 *
 * @param stpf            the stop filter under test
 * @param enableIcrements whether position increments should be preserved
 */
private void doTestStopPositons(StopFilter stpf, boolean enableIcrements) throws IOException {
  log("---> test with enable-increments-"+(enableIcrements?"enabled":"disabled"));
  stpf.setEnablePositionIncrements(enableIcrements);
  CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
  stpf.reset();
  // Tokens 0,3,6,...,18 survive the stop filter.
  for (int i=0; i<20; i+=3) {
    assertTrue(stpf.incrementToken());
    log("Token "+i+": "+stpf);
    String w = English.intToEnglish(i).trim();
    assertEquals("expecting token "+i+" to be "+w,w,termAtt.toString());
    // First token always has increment 1; later ones 3 iff increments enabled.
    assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement());
  }
  assertFalse(stpf.incrementToken());
  stpf.end();
  stpf.close();
}
 
Example 22
Project: search   File: TestStopAnalyzer.java   View source code 6 votes vote down vote up
/**
 * Checks that StopAnalyzer removes the configured stop words and records the
 * skipped positions as increased position increments on the following tokens.
 */
public void testStopListPositions() throws IOException {
  CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
  StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
  // expectedIncr[i] is the position increment of the i-th SURVIVING token;
  // the spacing below visually aligns each value under its token in s.
  String s =             "This is a good test of the english stop analyzer with positions";
  int expectedIncr[] =  { 1,   1, 1,          3, 1,  1,      1,            2,   1};
  TokenStream stream = newStop.tokenStream("test", s);
  try {
    assertNotNull(stream);
    int i = 0;
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
      String text = termAtt.toString();
      // Stop words must never reach the output stream.
      assertFalse(stopWordsSet.contains(text));
      assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
    }
    stream.end();
  } finally {
    // Close even if an assertion threw, without masking the original failure.
    IOUtils.closeWhileHandlingException(stream);
  }
}
 
Example 23
Project: search   File: ShingleAnalyzerWrapperTest.java   View source code 6 votes vote down vote up
/**
 * Builds a PhraseQuery from the shingle analyzer's output for "this sentence",
 * positioning each term by accumulating position increments, and verifies the
 * query matches the expected document.
 */
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
  PhraseQuery q = new PhraseQuery();

  TokenStream ts = analyzer.tokenStream("content", "this sentence");
  try {
    // Start at -1 so the first token (increment 1) lands on position 0.
    int j = -1;
  
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  
    ts.reset();
    while (ts.incrementToken()) {
      j += posIncrAtt.getPositionIncrement();
      String termText = termAtt.toString();
      // Add each term at its absolute position (shingles share positions).
      q.add(new Term("content", termText), j);
    }
    ts.end();
  } finally {
    IOUtils.closeWhileHandlingException(ts);
  }

  ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
  int[] ranks = new int[] { 0 };
  compareRanks(hits, ranks);
}
 
Example 24
Project: search   File: TestSnowball.java   View source code 6 votes vote down vote up
/**
 * Runs one token through the English SnowballFilter and verifies the stemmed
 * term plus all pass-through attributes (offsets, type, flags, payload,
 * position increment) from the fixed TestTokenStream fixture.
 */
public void testFilterTokens() throws Exception {
  SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
  TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
  FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
  
  // Was ignoring the return value: assert a token was actually produced so
  // the attribute checks below are not run against stale/empty state.
  assertTrue(filter.incrementToken());

  assertEquals("accent", termAtt.toString());
  assertEquals(2, offsetAtt.startOffset());
  assertEquals(7, offsetAtt.endOffset());
  assertEquals("wrd", typeAtt.type());
  assertEquals(3, posIncAtt.getPositionIncrement());
  assertEquals(77, flagsAtt.getFlags());
  assertEquals(new BytesRef(new byte[]{0,1,2,3}), payloadAtt.getPayload());
}
 
Example 25
Project: search   File: SpellingQueryConverter.java   View source code 6 votes vote down vote up
/**
 * Analyzes {@code text} and appends one Token per emitted term to
 * {@code result}, shifting offsets by {@code offset} and stamping every token
 * with {@code flagsAttValue}.
 *
 * @param result        collection receiving the produced tokens
 * @param text          raw query text to analyze
 * @param offset        base offset added to each token's start/end offsets
 * @param flagsAttValue flags value applied to every token
 * @throws IOException if the token stream fails
 */
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {      
    // Copy out of the reusable attribute buffers into a standalone Token.
    Token token = new Token();
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setOffset(offset + offsetAtt.startOffset(), 
                    offset + offsetAtt.endOffset());
    token.setFlags(flagsAttValue); //overwriting any flags already set...
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    token.setPositionIncrement(posIncAtt.getPositionIncrement());
    result.add(token);
  }
  stream.end();
  stream.close();
}
 
Example 26
Project: cc-analysis   File: CcWordsFilterTest.java   View source code 6 votes vote down vote up
/**
 * Analyzes {@code text} and joins the resulting terms with WORD_SEPARATOR,
 * terminated by WORD_END. Returns null if analysis produced no tokens.
 *
 * @throws IOException if the token stream fails
 * @throws IllegalArgumentException if any token has zero length
 */
private CharsRef analyze(Analyzer analyzer, String text) throws IOException {
	CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
	try (TokenStream ts = analyzer.tokenStream("", text)) {
		CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
		PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
		ts.reset();
		while (ts.incrementToken()) {
			int length = termAtt.length();
			if (length == 0) {
				throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
			}
			charsRefBuilder.grow(charsRefBuilder.length() + length + 1); /* current + word + separator */
			// Separator goes before every word except the first.
			if (charsRefBuilder.length() > 0) {
				charsRefBuilder.append(CcWordSet.WORD_SEPARATOR);
			}
			charsRefBuilder.append(termAtt);
		}
		ts.end();
	}
	// No tokens at all: signal "nothing to add" rather than an empty entry.
	if (charsRefBuilder.length() == 0) {
		return null;
	}
	charsRefBuilder.append(CcWordSet.WORD_END);
	return charsRefBuilder.get();
}
 
Example 27
Project: solr-multilingual-analyzer   File: MultiLangTokenizer.java   View source code 6 votes vote down vote up
/**
 * Drains {@code tokenStream} and groups the produced Tokens by absolute
 * position into {@code tokenPosMap}. Tokens with position increment 0 share a
 * position and end up in the same list.
 *
 * @param tokenPosMap map from absolute position to the tokens at that position
 * @param tokenStream stream to consume; closed before returning
 * @throws IOException if the token stream fails
 */
private void handleTokenStream(Map<Integer, List<Token>> tokenPosMap, TokenStream tokenStream) throws IOException {
    tokenStream.reset();
    int pos = 0;

    CharTermAttribute charTermAttribute = getCharTermAttribute(tokenStream);
    OffsetAttribute offsetAttribute = getOffsetAttribute(tokenStream);
    TypeAttribute typeAttribute = getTypeAttribute(tokenStream);
    PositionIncrementAttribute positionIncrementAttribute = getPositionIncrementAttribute(tokenStream);

    try {
        // Attributes are fixed for the stream's lifetime, so one check up
        // front replaces the per-iteration check the loop used to do.
        if (null == charTermAttribute || null == offsetAttribute) {
            return;
        }
        while (tokenStream.incrementToken()) {
            Token token = new Token(charTermAttribute.buffer(), 0, charTermAttribute.length(),
                    offsetAttribute.startOffset(), offsetAttribute.endOffset());
            if (null != typeAttribute) {
                token.setType(typeAttribute.type());
            }
            // Default to increment 1 when the attribute is unavailable.
            pos += null != positionIncrementAttribute ? positionIncrementAttribute.getPositionIncrement() : 1;
            if (!tokenPosMap.containsKey(pos)) {
                tokenPosMap.put(pos, new LinkedList<Token>());
            }
            tokenPosMap.get(pos).add(token);
        }
    } finally {
        tokenStream.close(); // was leaked when the null-check returned early
    }
}
 
Example 28
Project: resource-query-parser   File: QueryBuilder.java   View source code 6 votes vote down vote up
/**
 * Creates complex boolean query from the cached tokenstream contents.
 * Terms at the same position (position increment 0, e.g. synonyms) are
 * grouped into one sub-clause; each new position starts a new group combined
 * under {@code operator}.
 *
 * @param field    field name for the generated terms
 * @param stream   token stream to consume (already cached/replayable)
 * @param operator occur mode (MUST/SHOULD) joining the per-position groups
 * @throws IOException if the token stream fails
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator)
		throws IOException {
	BooleanQuery.Builder q = newBooleanQuery();
	List<Term> currentQuery = new ArrayList<>();

	TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
	PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);

	stream.reset();
	while (stream.incrementToken()) {
		// A non-zero increment starts a new position: flush the group built
		// so far before collecting terms for the new position.
		if (posIncrAtt.getPositionIncrement() != 0) {
			add(q, currentQuery, operator);
			currentQuery.clear();
		}
		currentQuery.add(new Term(field, termAtt.getBytesRef()));
	}
	// Flush the final group.
	add(q, currentQuery, operator);

	return q.build();
}
 
Example 29
Project: resource-query-parser   File: QueryBuilder.java   View source code 6 votes vote down vote up
/**
 * Creates simple phrase query from the cached tokenstream contents.
 * Each term is added at its absolute position; when position increments are
 * disabled every term simply advances by one instead.
 *
 * @param field  field name for the generated terms
 * @param stream token stream to consume (already cached/replayable)
 * @param slop   phrase slop to set on the resulting query
 * @throws IOException if the token stream fails
 */
protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException {
	PhraseQuery.Builder builder = new PhraseQuery.Builder();
	builder.setSlop(slop);

	TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
	PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
	// Start at -1 so the first token (increment 1) lands on position 0.
	int position = -1;

	stream.reset();
	while (stream.incrementToken()) {
		if (enablePositionIncrements) {
			position += posIncrAtt.getPositionIncrement();
		} else {
			position += 1;
		}
		builder.add(new Term(field, termAtt.getBytesRef()), position);
	}

	return builder.build();
}
 
Example 30
Project: community-edition-old   File: PathTokenFilterTest.java   View source code 6 votes vote down vote up
public void testAttributesAfterStreamEnd() throws IOException
{
    // Tokenise a single namespaced path element, then verify the attribute
    // state PathTokenFilter.end() leaves behind once the stream is consumed.
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(reader, PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

    // Consumes all tokens, which triggers PathTokenFilter.end().
    tokenise(ts, new String[]{"uri1", "one"});

    // After end(): cleared term, default type, zero increment, and both
    // offsets pointing at the final offset (the input length).
    assertEquals("", termAtt.toString());
    assertEquals("word", typeAtt.type()); // the default
    assertEquals(0, posIncAtt.getPositionIncrement());
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}
 
Example 31
Project: pinyinTokenFilter   File: TestPinyinTransformTokenFilter.java   View source code 6 votes vote down vote up
@Test
public void testFull() throws IOException {
    // Full-pinyin transform: walk every token, accumulating the position
    // from the increments, and expect 4 positions in total.
    this.filter = new PinyinTransformTokenFilter(tokenizer);
    this.filter.reset();
    // Attribute instances are per-stream singletons, so fetch them once
    // instead of re-looking them up on every iteration.
    CharTermAttribute termAtt = this.filter.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = this.filter.getAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = this.filter.getAttribute(OffsetAttribute.class);
    TypeAttribute type = this.filter.getAttribute(TypeAttribute.class);
    int position = 0;
    while (this.filter.incrementToken()) {
        String token = termAtt.toString();
        position += posIncAtt.getPositionIncrement();
        System.out.println(position + "[" + offset.startOffset() + "," + offset.endOffset() + "} (" + type
                .type() + ") " + token);
    }
    assertTrue(position == 4);
}
 
Example 32
Project: pinyinTokenFilter   File: TestPinyinTransformTokenFilter.java   View source code 6 votes vote down vote up
@Test
public void testFullWithNoChineseOut() throws IOException {
    // Same walk as testFull but with Chinese originals suppressed,
    // so one fewer position is expected.
    this.filter = new PinyinTransformTokenFilter(tokenizer, false, 1, false);
    this.filter.reset();
    // Attribute instances are per-stream singletons, so fetch them once
    // instead of re-looking them up on every iteration.
    CharTermAttribute termAtt = this.filter.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = this.filter.getAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = this.filter.getAttribute(OffsetAttribute.class);
    TypeAttribute type = this.filter.getAttribute(TypeAttribute.class);
    int position = 0;
    while (this.filter.incrementToken()) {
        String token = termAtt.toString();
        position += posIncAtt.getPositionIncrement();
        System.out.println(position + "[" + offset.startOffset() + "," + offset.endOffset() + "} (" + type
                .type() + ") " + token);
    }
    assertTrue(position == 3);
}
 
Example 33
Project: pinyinTokenFilter   File: TestPinyinTransformTokenFilter.java   View source code 6 votes vote down vote up
@Test
public void testShort() throws IOException {
    // Abbreviated (first-letter) pinyin transform; positions accumulate
    // to 4 as in the full transform.
    this.filter = new PinyinTransformTokenFilter(tokenizer, true);
    this.filter.reset();
    // Attribute instances are per-stream singletons, so fetch them once
    // instead of re-looking them up on every iteration.
    CharTermAttribute termAtt = this.filter.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = this.filter.getAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = this.filter.getAttribute(OffsetAttribute.class);
    TypeAttribute type = this.filter.getAttribute(TypeAttribute.class);
    int position = 0;
    while (this.filter.incrementToken()) {
        String token = termAtt.toString();
        position += posIncAtt.getPositionIncrement();
        System.out.println(position + "[" + offset.startOffset() + "," + offset.endOffset() + "} (" + type
                .type() + ") " + token);
    }
    assertTrue(position == 4);
}
 
Example 34
Project: lucenelab   File: PreAnnotatedTokenFilterTest.java   View source code 6 votes vote down vote up
/**
 * Drains {@code ts} and asserts that each emitted token matches the
 * corresponding expected {@link TokenInfo} (term text, absolute position
 * accumulated from increments, and length decoded from the payload).
 */
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    final ByteArrayDataInput dataIn = new ByteArrayDataInput();
    int position = -1;
    for (final TokenInfo expected : infos) {
        assertThat(ts.incrementToken()).isTrue();
        position += posIncr.getPositionIncrement();
        final BytesRef payload = payloadAtt.getPayload();
        int length = -1;
        if (expected.len == -1) {
            // No annotation expected: payload must be absent.
            assertThat(payload).isNull();
        } else {
            // Length is stored as a VInt at the start of the payload.
            assertThat(payload).isNotNull();
            dataIn.reset(payload.bytes);
            length = dataIn.readVInt();
        }
        assertThat(new TokenInfo(termAtt.toString(), position, length)).isEqualTo(expected);
    }
    // Stream must be exhausted once all expected tokens are matched.
    assertThat(ts.incrementToken()).isFalse();
}
 
Example 35
Project: auto-phrase-tokenfilter   File: AutoPhrasingTokenFilter.java   View source code 6 votes vote down vote up
private void emit( char[] token ) {
    // Debug trace of every emitted phrase token.
    System.out.println( "emit: " + new String( token ) );
    if (replaceWhitespaceWith != null) {
        token = replaceWhiteSpace( token );
    }

    // Publish the token text.
    CharTermAttribute termAttr = getTermAttribute( );
    termAttr.setEmpty( );
    termAttr.append( new StringBuilder( ).append( token ) );

    // Re-anchor the start offset so the token ends at the current end offset.
    OffsetAttribute offAttr = getOffsetAttribute( );
    if (offAttr != null && offAttr.endOffset() >= token.length) {
        int start = offAttr.endOffset() - token.length;
        offAttr.setOffset( start, offAttr.endOffset() );
    }

    PositionIncrementAttribute pia = getPositionIncrementAttribute( );
    if (pia != null) {
        pia.setPositionIncrement( ++positionIncr );
    }

    lastEmitted = token;
}
 
Example 36
Project: lucene-korean   File: KoreanAnalyzerTest.java   View source code 6 votes vote down vote up
public void testStandardTokenizer() throws Exception {

        // Mixed Korean / English / Hanja sample run through the standard chain.
        String source = "우리나라라면에서부터 일본라면이 파생되었잖니?";
        source = "너는 너는 다시 내게 돌아 올거야. school is a good place 呵呵大笑 呵呵大笑";

        long start = System.currentTimeMillis();

        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        TokenStream stream = analyzer.tokenStream("s", new StringReader(source));
        TokenStream tok = new StandardFilter(Version.LUCENE_36, stream);

        // A TokenFilter shares its input's attribute instances, so fetch
        // the term attribute once from the filter we actually iterate.
        CharTermAttribute termAttr = tok.getAttribute(CharTermAttribute.class);

        // TokenStream workflow contract: reset() before the first
        // incrementToken(), end() and close() when done.
        tok.reset();
        while (tok.incrementToken()) {
            System.out.println(new String(termAttr.buffer(), 0, termAttr.length()));
        }
        tok.end();
        tok.close();

        System.out.println((System.currentTimeMillis() - start) + "ms");
    }
 
Example 37
Project: lucene-korean   File: KoreanAnalyzerTest.java   View source code 6 votes vote down vote up
public void testHanjaConvert() throws Exception {

        // Hanja input run through KoreanAnalyzer + KoreanFilter.
        String source = "呵呵大笑  落落長松 ";

        long start = System.currentTimeMillis();

        KoreanAnalyzer analyzer = new KoreanAnalyzer();
        TokenStream stream = analyzer.tokenStream("s", new StringReader(source));
        TokenStream tok = new KoreanFilter(stream);

        // A TokenFilter shares its input's attribute instances, so fetch
        // the term attribute once from the filter we actually iterate.
        CharTermAttribute termAttr = tok.getAttribute(CharTermAttribute.class);

        // TokenStream workflow contract: reset() before the first
        // incrementToken(), end() and close() when done.
        tok.reset();
        while (tok.incrementToken()) {
            System.out.println(new String(termAttr.buffer()));
        }
        tok.end();
        tok.close();

        System.out.println((System.currentTimeMillis() - start) + "ms");
    }
 
Example 38
Project: NYBC   File: PrefixAwareTokenFilter.java   View source code 6 votes vote down vote up
/**
 * Builds a filter that emits all tokens of {@code prefix} followed by all
 * tokens of {@code suffix}. Registers a matching set of attributes on this
 * stream and on the prefix stream so token state can be copied between them.
 * NOTE: attribute registration order on an AttributeSource is significant;
 * the two registration groups below are kept deliberately parallel.
 */
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
  super(suffix);
  this.suffix = suffix;
  this.prefix = prefix;
  prefixExhausted = false;
  
  // Attributes of this stream (backed by the suffix, via super(suffix)).
  termAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  payloadAtt = addAttribute(PayloadAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  flagsAtt = addAttribute(FlagsAttribute.class);

  // The same attributes on the prefix stream, read while it is consumed.
  p_termAtt = prefix.addAttribute(CharTermAttribute.class);
  p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
  p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
  p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
  p_typeAtt = prefix.addAttribute(TypeAttribute.class);
  p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
}
 
Example 39
Project: NYBC   File: TestRemoveDuplicatesTokenFilter.java   View source code 6 votes vote down vote up
public void testDups(final String expected, final Token... tokens)
  throws Exception {

  // Replay the canned tokens through an ad-hoc TokenStream, then verify
  // that RemoveDuplicatesTokenFilter yields exactly the expected terms.
  final Iterator<Token> tokenIter = Arrays.asList(tokens).iterator();
  final TokenStream source = new TokenStream() {
      CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
      PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
      @Override
      public boolean incrementToken() {
        if (!tokenIter.hasNext()) {
          return false;
        }
        clearAttributes();
        Token next = tokenIter.next();
        termAtt.setEmpty().append(next);
        offsetAtt.setOffset(next.startOffset(), next.endOffset());
        posIncAtt.setPositionIncrement(next.getPositionIncrement());
        return true;
      }
    };

  assertTokenStreamContents(new RemoveDuplicatesTokenFilter(source), expected.split("\\s"));
}
 
Example 40
Project: NYBC   File: TestDuelingAnalyzers.java   View source code 6 votes vote down vote up
/**
 * Asserts that two token streams produce identical terms, position
 * increments and offsets for input {@code s}, including the final
 * offset reported after end().
 */
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
  left.reset();
  right.reset();
  final CharTermAttribute termL = left.addAttribute(CharTermAttribute.class);
  final CharTermAttribute termR = right.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offL = left.addAttribute(OffsetAttribute.class);
  final OffsetAttribute offR = right.addAttribute(OffsetAttribute.class);
  final PositionIncrementAttribute posL = left.addAttribute(PositionIncrementAttribute.class);
  final PositionIncrementAttribute posR = right.addAttribute(PositionIncrementAttribute.class);

  while (left.incrementToken()) {
    // Right stream must keep pace token-for-token with the left.
    assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
    assertEquals("wrong term text for input: " + s, termL.toString(), termR.toString());
    assertEquals("wrong position for input: " + s, posL.getPositionIncrement(), posR.getPositionIncrement());
    assertEquals("wrong start offset for input: " + s, offL.startOffset(), offR.startOffset());
    assertEquals("wrong end offset for input: " + s, offL.endOffset(), offR.endOffset());
  }
  // And must be exhausted at the same time.
  assertFalse("wrong number of tokens for input: " + s, right.incrementToken());

  left.end();
  right.end();
  assertEquals("wrong final offset for input: " + s, offL.endOffset(), offR.endOffset());
  left.close();
  right.close();
}