org.apache.lucene.analysis.tokenattributes.OffsetAttribute Java Examples

The following examples show how to use org.apache.lucene.analysis.tokenattributes.OffsetAttribute. Each example is drawn from an open-source project; the source file, project, and license are noted above each example.
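
Before the project examples, here is a minimal, self-contained sketch of the usual consumption pattern (the analyzer, field name, and sample text are placeholders, not taken from any project below): obtain the attribute before reset(), iterate with incrementToken(), then finish with end() and close().

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetAttributeDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", "offsets locate tokens")) {
            // Attribute instances are obtained once and reused for every token.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();                 // required before the first incrementToken()
            while (stream.incrementToken()) {
                // startOffset()/endOffset() are character offsets into the original text
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            stream.end();                   // records the final offset
        }
    }
}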
Example #1
Source File: FieldType.java    From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String fieldName) {
  Tokenizer ts = new Tokenizer() {
    final char[] cbuf = new char[maxChars];
    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    final BytesTermAttribute bytesAtt = isPointField() ? addAttribute(BytesTermAttribute.class) : null;
    final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    @Override
    public boolean incrementToken() throws IOException {
      clearAttributes();
      int n = input.read(cbuf,0,maxChars);
      if (n<=0) return false;
      if (isPointField()) {
        BytesRef b = ((PointField)FieldType.this).toInternalByteRef(new String(cbuf, 0, n));
        bytesAtt.setBytesRef(b);
      } else {
        String s = toInternal(new String(cbuf, 0, n));
        termAtt.setEmpty().append(s);
      }
      offsetAtt.setOffset(correctOffset(0),correctOffset(n));
      return true;
    }
  };

  return new TokenStreamComponents(ts);
}
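
A note on the offsets above: both values are wrapped in correctOffset(...), which maps positions back through any CharFilters sitting in front of the tokenizer, so the reported offsets always refer to the original, unfiltered input.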
 
Example #2
Source File: HanLpQueryAnalyzerTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    try (Analyzer analyzer = new HanLpQueryAnalyzer("viterbi")) {
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // offset
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // position increment
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            // part of speech
            TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
    }
}
 
Example #3
Source File: AnalysisRequestHandlerBase.java    From lucene-solr with Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }

  return tokens;
}
 
Example #4
Source File: Tagger.java    From SolrTextTagger with Apache License 2.0
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
              TagClusterReducer tagClusterReducer, boolean skipAltTokens,
              boolean ignoreStopWords) throws IOException {
  this.terms = terms;
  this.liveDocs = liveDocs;
  this.tokenStream = tokenStream;
  this.skipAltTokens = skipAltTokens;
  this.ignoreStopWords = ignoreStopWords;
  byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
  posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
  taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
  tokenStream.reset();

  this.tagClusterReducer = tagClusterReducer;
}
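
Note that the constructor resets the token stream itself, so the caller is expected to iterate it directly without calling reset() again.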
 
Example #5
Source File: HanLpTokenizerFactoryTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" + "突出外表、百變髮型及正面的形象,以至自己" + "品牌的男士香水等商品,及長期擔任運動品牌" + "Adidas的代言人,因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力,在足球圈外所獲得的" + "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example #6
Source File: URLTokenFilter.java    From elasticsearch-analysis-url with Apache License 2.0
/**
 * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
 * will be passed along to the tokenizer.
 * @param input a string to be tokenized
 * @return a list of tokens extracted from the input string
 * @throws IOException
 */
private List<Token> tokenize(String input) throws IOException {
    List<Token> tokens = new ArrayList<>();
    URLTokenizer tokenizer = new URLTokenizer();
    // create a copy of the parts list to avoid ConcurrentModificationException when sorting
    tokenizer.setParts(new ArrayList<>(parts));
    tokenizer.setUrlDecode(urlDeocde);
    tokenizer.setTokenizeHost(tokenizeHost);
    tokenizer.setTokenizePath(tokenizePath);
    tokenizer.setTokenizeQuery(tokenizeQuery);
    tokenizer.setAllowMalformed(allowMalformed || passthrough);
    tokenizer.setTokenizeMalformed(tokenizeMalformed);
    tokenizer.setReader(new StringReader(input));
    tokenizer.reset();

    String term;
    URLPart part;
    OffsetAttribute offset;
    while (tokenizer.incrementToken()) {
        term = tokenizer.getAttribute(CharTermAttribute.class).toString();
        part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
        offset = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
    }
    return tokens;
}
 
Example #7
Source File: SpellingQueryConverter.java    From lucene-solr with Apache License 2.0
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {      
    Token token = new Token();
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setOffset(offset + offsetAtt.startOffset(), 
                    offset + offsetAtt.endOffset());
    token.setFlags(flagsAttValue); //overwriting any flags already set...
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    token.setPositionIncrement(posIncAtt.getPositionIncrement());
    result.add(token);
  }
  stream.end();
  stream.close();
}
 
Example #8
Source File: NlpSegmenterTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void testSegmenter() throws Exception {
    Tokenizer segmenter = getSegmenter();
    String text = "中华人民共和国(People's Republic of China),简称'中国'";
    segmenter.setReader(new StringReader(text));
    segmenter.reset();
    while (segmenter.incrementToken()) {
        // term
        CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);
        LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
        Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
    }
}
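
The closing assertion captures the core contract of OffsetAttribute: text.substring(startOffset, endOffset) must reproduce the emitted term (compared case-insensitively here), which is exactly the property highlighters depend on.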
 
Example #9
Source File: HanLpQueryAnalyzerTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example #10
Source File: LuceneToken.java    From jstarcraft-nlp with Apache License 2.0
public LuceneToken(TokenStream stream) {
    this.stream = stream;
    this.term = stream.getAttribute(CharTermAttribute.class);
    this.offset = stream.getAttribute(OffsetAttribute.class);
    try {
        this.flag = this.stream.incrementToken();
        if (!flag) {
            this.stream.close();
        }
    } catch (Exception exception) {
        try {
            this.stream.close();
        } catch (Exception throwable) {
        }
        throw new RuntimeException(exception);
    }
}
 
Example #11
Source File: TestDuelingAnalyzers.java    From lucene-solr with Apache License 2.0
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
  left.reset();
  right.reset();
  CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
  CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
  OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
  OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
  PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
  
  while (left.incrementToken()) {
    assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
    assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
    assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
    assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
    assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  }
  assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
  left.end();
  right.end();
  assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  left.close();
  right.close();
}
 
Example #12
Source File: DexterAnalyzer.java    From dexter with Apache License 2.0
public static void main(String[] args) throws IOException {
	String str = "<body>perchééééééééé";
	Analyzer anal = new DexterAnalyzer();
	TokenStream ts = anal.tokenStream("content", new StringReader(str));

	OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
	CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
	ts.reset();
	while (ts.incrementToken()) {
		System.out.println(termAtt.toString().substring(0, termAtt.length()));
		System.out.println("token start offset: " + offsetAtt.startOffset());
		System.out.println("  token end offset: " + offsetAtt.endOffset());
	}
}
 
Example #13
Source File: PlainHighlighter.java    From Elasticsearch with Apache License 2.0
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
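
Note the boundary handling: a token ending exactly at noMatchSize lets the excerpt end there, while a token straddling the boundary makes the method fall back to the previous token's end offset, or -1 if even the first token overshoots.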
 
Example #14
Source File: LuceneUtil.java    From jasperreports with GNU Lesser General Public License v3.0
protected String displayTokens(String text, String elementId) throws IOException {
	Analyzer analyzer = new LuceneSimpleAnalyzer(isCaseSensitive, removeAccents);
	StringBuilder sb = new StringBuilder();
	sb.append(elementId).append(": ").append(text).append(": ");

	TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
	CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);

	tokenStream.reset();
	while (tokenStream.incrementToken()) {
		int startOffset = offsetAttribute.startOffset();
		int endOffset = offsetAttribute.endOffset();
		String term = charTermAttribute.toString();
		sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
	}

	return sb.toString();
}
 
Example #15
Source File: DemoTest.java    From HongsCORE with MIT License
public static void main(String[] args) throws IOException {
    Analyzer az = CustomAnalyzer.builder()
        //.withTokenizer("Standard")
        .withTokenizer("Name")
        .addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
        //.addTokenFilter("ICUTransform", "id", "Han-Latin;NFD;[[:NonspacingMark:][:Space:]] Remove")
        //.addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
        .build();

    StringReader      sr = new StringReader(args[0]);
    TokenStream       ts = az.tokenStream  ("" , sr);
    OffsetAttribute   oa = ts.addAttribute (OffsetAttribute.class);
    CharTermAttribute ta = ts.addAttribute (CharTermAttribute.class);

    try {
        ts.reset(); // Resets this stream to the beginning. (Required)
        while (ts.incrementToken()) {
            System.out.println(ta.toString() + "|" + ta.length()
                    + "[" + oa.startOffset() + "," + oa.endOffset() + "]");
        }
        ts.end(  ); // Perform end-of-stream operations, e.g. set the final offset.
    } finally {
        ts.close(); // Release resources associated with this stream.
    }

}
 
Example #16
Source File: AutoPhrasingTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private void emit(char[] tokenChars) {
    char[] token = tokenChars;
    if (replaceWhitespaceWith != null) {
        token = replaceWhiteSpace(token);
    }
    CharTermAttribute termAttr = getTermAttribute();
    if (termAttr != null) {
        termAttr.setEmpty();
        termAttr.append(new StringBuilder().append(token));
    }
    OffsetAttribute offAttr = getOffsetAttribute();
    if (offAttr != null && offAttr.endOffset() >= token.length) {
        int start = offAttr.endOffset() - token.length;
        offAttr.setOffset(start, offAttr.endOffset());
    }
    PositionIncrementAttribute pia = getPositionIncrementAttribute();
    if (pia != null) {
        pia.setPositionIncrement(++positionIncr);
    }
    lastEmitted = token;
}
 
Example #17
Source File: ConcatenatingTokenStream.java    From lucene-solr with Apache License 2.0
@Override
public boolean incrementToken() throws IOException {
  boolean newSource = false;
  while (sources[currentSource].incrementToken() == false) {
    if (currentSource >= sources.length - 1)
      return false;
    sources[currentSource].end();
    initialPositionIncrement = sourceIncrements[currentSource].getPositionIncrement();
    OffsetAttribute att = sourceOffsets[currentSource];
    if (att != null)
      offsetIncrement += att.endOffset();
    currentSource++;
    newSource = true;
  }

  clearAttributes();
  sources[currentSource].copyTo(this);
  offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement);
  if (newSource) {
    int posInc = posIncAtt.getPositionIncrement();
    posIncAtt.setPositionIncrement(posInc + initialPositionIncrement);
  }

  return true;
}
 
Example #18
Source File: PathTokenFilterTest.java    From SearchServices with GNU Lesser General Public License v3.0
public void testTokenizerReuse() throws IOException
{
    // We should be able to use the same Tokenizer twice.
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    ts.setReader(reader);

    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    
    // First use
    tokenise(ts, new String[]{"uri1", "one"});
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
    
    // Second use
    final String path2 = "/{uri1}one/uri2:two/";
    StringReader reader2 = new StringReader(path2);
    ts.setReader(reader2);
    tokenise(ts, new String[]{"uri1", "one", "uri2", "two"});
    assertEquals(path2.length(), offsetAtt.startOffset());
    assertEquals(path2.length(), offsetAtt.endOffset());
}
 
Example #19
Source File: PathTokenFilterTest.java    From SearchServices with GNU Lesser General Public License v3.0
public void testAttributesAfterStreamEnd() throws IOException
{
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    ts.setReader(reader);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    
    // PathTokenFilter.end() will be called after all tokens consumed.
    tokenise(ts, new String[]{"uri1", "one"});
    
    // Check attributes cleaned up
    assertEquals("", termAtt.toString());
    assertEquals("word", typeAtt.type()); // the default
    assertEquals(0, posIncAtt.getPositionIncrement());
    // Final offset...
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}
 
Example #20
Source File: HanLPAnalyzerTest.java    From hanlp-lucene-plugin with Apache License 2.0
public void testCreateComponents() throws Exception
{
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i)
    {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken())
    {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example #21
Source File: PinyinAnalysisTest.java    From elasticsearch-analysis-lc-pinyin with Artistic License 2.0
@Test
public void testSearch() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "重qing");

    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    tokenStream.reset();
    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "重");
    Assert.assertEquals(offsetAttribute.startOffset(), 0);
    Assert.assertEquals(offsetAttribute.endOffset(), 1);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "qing");
    Assert.assertEquals(offsetAttribute.startOffset(), 1);
    Assert.assertEquals(offsetAttribute.endOffset(), 5);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    tokenStream.close();
}
 
Example #22
Source File: PinyinFilterTest.java    From elasticsearch-analysis-lc-pinyin with Artistic License 2.0
public void testFullPinyinFilter() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");

    LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.full_pinyin);

    CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);

    lcPinyinTokenFilter.reset();
    while (lcPinyinTokenFilter.incrementToken()) {
        System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
    }
    lcPinyinTokenFilter.close();
}
 
Example #23
Source File: PinyinFilterTest.java    From elasticsearch-analysis-lc-pinyin with Artistic License 2.0
public void testFirstLetterFilter() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");

    LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.first_letter);

    CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);

    lcPinyinTokenFilter.reset();
    while (lcPinyinTokenFilter.incrementToken()) {
        System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
    }
    lcPinyinTokenFilter.close();
}
 
Example #24
Source File: MemoryIndex.java    From lucene-solr with Apache License 2.0
/**
 * Convenience method; Creates and returns a token stream that generates a
 * token for each keyword in the given collection, "as is", without any
 * transforming text analysis. The resulting token stream can be fed into
 * {@link #addField(String, TokenStream)}, perhaps wrapped into another
 * {@link org.apache.lucene.analysis.TokenFilter}, as desired.
 * 
 * @param keywords
 *            the keywords to generate tokens for
 * @return the corresponding token stream
 */
public <T> TokenStream keywordTokenStream(final Collection<T> keywords) {
  // TODO: deprecate & move this method into AnalyzerUtil?
  if (keywords == null)
    throw new IllegalArgumentException("keywords must not be null");
  
  return new TokenStream() {
    private Iterator<T> iter = keywords.iterator();
    private int start = 0;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    
    @Override
    public boolean incrementToken() {
      if (!iter.hasNext()) return false;
      
      T obj = iter.next();
      if (obj == null) 
        throw new IllegalArgumentException("keyword must not be null");
      
      String term = obj.toString();
      clearAttributes();
      termAtt.setEmpty().append(term);
      offsetAtt.setOffset(start, start+termAtt.length());
      start += term.length() + 1; // separate words by 1 (blank) character
      return true;
    }
  };
}
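
A hedged usage sketch for the method above (the field name and keywords are placeholders): the generated stream can be fed straight into MemoryIndex.addField and the index queried immediately.

import java.util.Arrays;

import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

public class KeywordStreamDemo {
    public static void main(String[] args) {
        MemoryIndex index = new MemoryIndex();
        // Each keyword becomes a single token; offsets are synthetic,
        // with consecutive keywords separated by one blank character.
        index.addField("keywords", index.keywordTokenStream(Arrays.asList("lucene", "offsets")));
        float score = index.search(new TermQuery(new Term("keywords", "lucene")));
        System.out.println(score > 0.0f ? "hit" : "miss");
    }
}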
 
Example #25
Source File: HighlighterTest.java    From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String arg0) {
  Tokenizer stream = new MockTokenizer(MockTokenizer.SIMPLE, true);
  stream.addAttribute(CharTermAttribute.class);
  stream.addAttribute(PositionIncrementAttribute.class);
  stream.addAttribute(OffsetAttribute.class);
  return new TokenStreamComponents(stream, new SynonymTokenizer(stream, synonyms));
}
 
Example #26
Source File: TestToken.java    From jstarcraft-nlp with Apache License 2.0
public static void main(String[] args) {
//        SynonymsLibrary.put(SynonymsLibrary.DEFAULT, "../../library/synonyms.dic");
//
//        DicLibrary.insert(DicLibrary.DEFAULT, "清华", "n", 2000);
//        DicLibrary.insert(DicLibrary.DEFAULT, "大学", "n", 2000);

    Map<String, String> map = new HashMap<String, String>();

    map.put("type", "base_ansj");
//        map.put(SynonymsLibrary.DEFAULT, SynonymsLibrary.DEFAULT);

    Analyzer ca = new AnsjAnalyzer(map);

    String content = "我爱北京天安门天安门上太阳升我美丽的清华大学";

    try {
        TokenStream tokenStream = ca.tokenStream(content, new StringReader(content));
        // reset() is required before the first call to incrementToken()
        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            System.out.print(tokenStream.getAttribute(CharTermAttribute.class));
            System.out.print("\t");
            System.out.print(tokenStream.getAttribute(OffsetAttribute.class).startOffset());
            System.out.print("\t");
            System.out.print(tokenStream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
            System.out.print("\t");
            System.out.println(tokenStream.getAttribute(TypeAttribute.class).type());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    ca.close();
}
 
Example #27
Source File: MeCabKoTokenizer.java    From mecab-ko-lucene-analyzer with Apache License 2.0
private void setAttributes() {
  charTermAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  posLenAtt = addAttribute(PositionLengthAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  posAtt = addAttribute(PartOfSpeechAttribute.class);
  semanticClassAtt = addAttribute(SemanticClassAttribute.class);
}
 
Example #28
Source File: PreAnalyzedField.java    From lucene-solr with Apache License 2.0
@Override
public final boolean incrementToken() {
  if (!it.hasNext()) {
    return false;
  }
  
  AttributeSource.State state = it.next();
  restoreState(state.clone());
  // TODO: why can't I lookup the OffsetAttribute up in ctor instead?
  lastEndOffset = addAttribute(OffsetAttribute.class).endOffset();
  return true;
}
 
Example #29
Source File: IKTokenizer.java    From IKAnalyzer with Apache License 2.0
/**
 * Constructor for the Lucene 3.5 Tokenizer adapter class.
 *
 * @param in a {@link java.io.Reader} object.
 * @param useSmart a boolean.
 */
public IKTokenizer(Reader in, boolean useSmart){
    super(in);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    _IKImplement = new IKSegmenter(in, useSmart);
}
 
Example #30
Source File: ConditionalTokenFilter.java    From lucene-solr with Apache License 2.0
@Override
public void end() throws IOException {
  if (endState == null) {
    super.end();
    endState = captureState();
  }
  else {
    restoreState(endState);
  }
  endOffset = getAttribute(OffsetAttribute.class).endOffset();
  if (lastTokenFiltered) {
    this.delegate.end();
    endState = captureState();
  }
}