Java Code Examples for org.apache.lucene.analysis.tokenattributes.TypeAttribute

The following examples show how to use org.apache.lucene.analysis.tokenattributes.TypeAttribute. These examples are extracted from open-source projects; the project, source file, and license are noted above each example where available.
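Every example below follows the same pattern: obtain the TypeAttribute from a TokenStream (via addAttribute or getAttribute), then read type() for each token between reset() and end(). As a minimal, self-contained sketch of that pattern (assuming Lucene's StandardTokenizer, which labels plain words with the "<ALPHANUM>" type and digits with "<NUM>"):

Tokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader("Lucene 9"));
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
TypeAttribute typeAtt = tokenizer.addAttribute(TypeAttribute.class);
tokenizer.reset();
while (tokenizer.incrementToken()) {
    // prints "Lucene / <ALPHANUM>" and then "9 / <NUM>"
    System.out.println(termAtt + " / " + typeAtt.type());
}
tokenizer.end();
tokenizer.close();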
Example 1
Source Project: jstarcraft-nlp   Source File: NlpSegmenterTestCase.java    License: Apache License 2.0
@Test
public void testSegmenter() throws Exception {
    Tokenizer segmenter = getSegmenter();
    String text = "中华人民共和国(People's Republic of China),简称'中国'";
    segmenter.setReader(new StringReader(text));
    segmenter.reset();
    while (segmenter.incrementToken()) {
        // token term
        CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);
        LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
        Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
    }
}
 
Example 2
Source Project: jstarcraft-nlp   Source File: HanLpIndexAnalyzerTestCase.java    License: Apache License 2.0
@Test
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    try (Analyzer analyzer = new HanLpIndexAnalyzer("viterbi")) {
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // offsets
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // position increment
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            // part of speech
            TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
    }
}
 
Example 3
@Test
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" + "突出外表、百變髮型及正面的形象,以至自己" + "品牌的男士香水等商品,及長期擔任運動品牌" + "Adidas的代言人,因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力,在足球圈外所獲得的" + "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 4
Source Project: jstarcraft-nlp   Source File: HanLpQueryAnalyzerTestCase.java    License: Apache License 2.0
@Test
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    try (Analyzer analyzer = new HanLpQueryAnalyzer("viterbi")) {
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // offsets
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // position increment
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            // part of speech
            TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
    }
}
 
Example 5
Source Project: jstarcraft-nlp   Source File: HanLpQueryAnalyzerTestCase.java    License: Apache License 2.0
@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 6
Source Project: jstarcraft-nlp   Source File: HanLpTokenizerTestCase.java    License: Apache License 2.0
@Test
public void testPinyinTokenFilter() throws Exception {
    Map<String, String> args = new HashMap<>();
    args.put("original", "true");
    args.put("pinyin", "false");
    args.put("pinyinFirstChar", "true");
    HanLpPinyinTokenFilterFactory factory = new HanLpPinyinTokenFilterFactory(args);
    TokenStream tokenStream = factory.create(tokenizer);
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 7
public static void assertTokenStream(TokenStream tokenStream, String[] expectedCharTerms, String[] expectedTypes, int[] expectedStartOffsets, int[] expectedEndOffsets) throws IOException {
    tokenStream.reset();
    int index = 0;
    while (tokenStream.incrementToken()) {
        assertEquals(expectedCharTerms[index], tokenStream.getAttribute(CharTermAttribute.class).toString());

        if (expectedTypes != null) {
            assertEquals(expectedTypes[index], tokenStream.getAttribute(TypeAttribute.class).type());
        }

        OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);

        if (expectedStartOffsets != null) {
            assertEquals(expectedStartOffsets[index], offsets.startOffset());
        }

        if (expectedEndOffsets != null) {
            assertEquals(expectedEndOffsets[index], offsets.endOffset());
        }

        index++;
    }
    tokenStream.end();
}
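A hypothetical call of this helper, assuming a StandardTokenizer (which assigns plain words the "<ALPHANUM>" type) over the text "foo bar":

Tokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
assertTokenStream(tokenizer,
        new String[]{"foo", "bar"},                 // expected terms
        new String[]{"<ALPHANUM>", "<ALPHANUM>"},   // expected types
        new int[]{0, 4},                            // expected start offsets
        new int[]{3, 7});                           // expected end offsets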
 
Example 8
public void testAttributesAfterStreamEnd() throws IOException
{
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    ts.setReader(reader);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    
    // PathTokenFilter.end() will be called after all tokens consumed.
    tokenise(ts, new String[]{"uri1", "one"});
    
    // Check attributes cleaned up
    assertEquals("", termAtt.toString());
    assertEquals("word", typeAtt.type()); // the default
    assertEquals(0, posIncAtt.getPositionIncrement());
    // Final offset...
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}
 
Example 9
Source Project: lucene-solr   Source File: ShingleFilterTest.java    License: Apache License 2.0
public void testReset() throws Exception {
  Tokenizer wsTokenizer = new WhitespaceTokenizer();
  wsTokenizer.setReader(new StringReader("please divide this sentence"));
  TokenStream filter = new ShingleFilter(wsTokenizer, 2);
  assertTokenStreamContents(filter,
    new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
    new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
    new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
    new int[]{1,0,1,0,1,0,1}
  );
  wsTokenizer.setReader(new StringReader("please divide this sentence"));
  assertTokenStreamContents(filter,
    new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
    new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
    new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
    new int[]{1,0,1,0,1,0,1}
  );
}
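Because ShingleFilter marks generated shingles with the "shingle" type while pass-through tokens keep TypeAttribute.DEFAULT_TYPE, the type can drive downstream filtering. A minimal sketch using Lucene's TypeTokenFilter in white-list mode (keep only tokens whose type is in the given set):

Tokenizer ws = new WhitespaceTokenizer();
ws.setReader(new StringReader("please divide this sentence"));
TokenStream shingles = new ShingleFilter(ws, 2);
// keep only the two-word shingles, dropping the single-word originals
TokenStream onlyShingles = new TypeTokenFilter(shingles, Collections.singleton("shingle"), true);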
 
Example 10
Source Project: lucene-solr   Source File: TestSnowball.java    License: Apache License 2.0
public void testFilterTokens() throws Exception {
  SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
  TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
  FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
  
  filter.incrementToken();

  assertEquals("accent", termAtt.toString());
  assertEquals(2, offsetAtt.startOffset());
  assertEquals(7, offsetAtt.endOffset());
  assertEquals("wrd", typeAtt.type());
  assertEquals(3, posIncAtt.getPositionIncrement());
  assertEquals(77, flagsAtt.getFlags());
  assertEquals(new BytesRef(new byte[]{0,1,2,3}), payloadAtt.getPayload());
}
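The assertions above pin down what the TestTokenStream fixture (not shown in this snippet) must emit; a hypothetical fixture consistent with them, illustrating how a TokenStream populates TypeAttribute alongside the other attributes:

final class TestTokenStream extends TokenStream {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);

    @Override
    public boolean incrementToken() {
        clearAttributes();
        termAtt.append("accents"); // SnowballFilter stems this to "accent"
        offsetAtt.setOffset(2, 7);
        typeAtt.setType("wrd");
        posIncAtt.setPositionIncrement(3);
        payloadAtt.setPayload(new BytesRef(new byte[]{0, 1, 2, 3}));
        flagsAtt.setFlags(77);
        return true;
    }
}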
 
Example 11
Source Project: lucene-solr   Source File: AnalysisRequestHandlerBase.java    License: Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }

  return tokens;
}
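Each cloneAttributes() call snapshots the current per-token state, so the collected tokens remain readable after the stream is closed. A minimal sketch of reading the type back out (assuming the stream also carried a CharTermAttribute):

for (AttributeSource token : tokens) {
    String term = token.getAttribute(CharTermAttribute.class).toString();
    String type = token.getAttribute(TypeAttribute.class).type();
    // ... render term and type into the analysis response
}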
 
Example 12
Source Project: lucene-solr   Source File: SpellingQueryConverter.java    License: Apache License 2.0
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {      
    Token token = new Token();
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setOffset(offset + offsetAtt.startOffset(), 
                    offset + offsetAtt.endOffset());
    token.setFlags(flagsAttValue); //overwriting any flags already set...
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    token.setPositionIncrement(posIncAtt.getPositionIncrement());
    result.add(token);
  }
  stream.end();
  stream.close();
}
 
Example 13
Source Project: lucene-solr   Source File: TestNumericTokenStream.java    License: Apache License 2.0
public void testLongStream() throws Exception {
  @SuppressWarnings("resource")
  final LegacyNumericTokenStream stream = new LegacyNumericTokenStream().setLongValue(lvalue);
  final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  assertNotNull(bytesAtt);
  final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
  assertNotNull(typeAtt);
  final LegacyNumericTokenStream.LegacyNumericTermAttribute numericAtt = stream.getAttribute(LegacyNumericTokenStream.LegacyNumericTermAttribute.class);
  assertNotNull(numericAtt);
  stream.reset();
  assertEquals(64, numericAtt.getValueSize());
  for (int shift = 0; shift < 64; shift += LegacyNumericUtils.PRECISION_STEP_DEFAULT) {
    assertTrue("New token is available", stream.incrementToken());
    assertEquals("Shift value wrong", shift, numericAtt.getShift());
    assertEquals("Term is incorrectly encoded", lvalue & ~((1L << shift) - 1L), LegacyNumericUtils.prefixCodedToLong(bytesAtt.getBytesRef()));
    assertEquals("Term raw value is incorrectly encoded", lvalue & ~((1L << shift) - 1L), numericAtt.getRawValue());
    assertEquals("Type incorrect", (shift == 0) ? LegacyNumericTokenStream.TOKEN_TYPE_FULL_PREC : LegacyNumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
  }
  assertFalse("More tokens available", stream.incrementToken());
  stream.end();
  stream.close();
}
 
Example 14
Source Project: lucene-solr   Source File: TestNumericTokenStream.java    License: Apache License 2.0
public void testIntStream() throws Exception {
  @SuppressWarnings("resource")
  final LegacyNumericTokenStream stream = new LegacyNumericTokenStream().setIntValue(ivalue);
  final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  assertNotNull(bytesAtt);
  final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
  assertNotNull(typeAtt);
  final LegacyNumericTokenStream.LegacyNumericTermAttribute numericAtt = stream.getAttribute(LegacyNumericTokenStream.LegacyNumericTermAttribute.class);
  assertNotNull(numericAtt);
  stream.reset();
  assertEquals(32, numericAtt.getValueSize());
  for (int shift = 0; shift < 32; shift += LegacyNumericUtils.PRECISION_STEP_DEFAULT) {
    assertTrue("New token is available", stream.incrementToken());
    assertEquals("Shift value wrong", shift, numericAtt.getShift());
    assertEquals("Term is incorrectly encoded", ivalue & ~((1 << shift) - 1), LegacyNumericUtils.prefixCodedToInt(bytesAtt.getBytesRef()));
    assertEquals("Term raw value is incorrectly encoded", ((long) ivalue) & ~((1L << shift) - 1L), numericAtt.getRawValue());
    assertEquals("Type incorrect", (shift == 0) ? LegacyNumericTokenStream.TOKEN_TYPE_FULL_PREC : LegacyNumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
  }
  assertFalse("More tokens available", stream.incrementToken());
  stream.end();
  stream.close();
}
 
Example 15
Source Project: hanlp-lucene-plugin   Source File: HanLPAnalyzerTest.java    License: Apache License 2.0
public void testCreateComponents() throws Exception
{
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i)
    {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken())
    {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 16
Source Project: hanlp-lucene-plugin   Source File: HanLPAnalyzerTest.java    License: Apache License 2.0
public void testIssue() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken())
    {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 17
Source Project: hanlp-lucene-plugin   Source File: HanLPTokenizerTest.java    License: Apache License 2.0
public void testPinyinTokenFilter() throws Exception
{
    Map<String, String> args = new HashMap<>();
    args.put("original", "true");
    args.put("pinyin", "false");
    args.put("pinyinFirstChar", "true");
    HanLPPinyinTokenFilterFactory factory = new HanLPPinyinTokenFilterFactory(args);
    TokenStream tokenStream = factory.create(tokenizer);
    while (tokenStream.incrementToken())
    {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 18
public void testCreateComponents() throws Exception
{
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i)
    {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPIndexAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken())
    {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 19
Source Project: elasticsearch-analysis-url   Source File: URLTokenFilter.java    License: Apache License 2.0
/**
 * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
 * will be passed along to the tokenizer.
 * @param input a string to be tokenized
 * @return a list of tokens extracted from the input string
 * @throws IOException if an I/O error occurs while tokenizing the input
 */
private List<Token> tokenize(String input) throws IOException {
    List<Token> tokens = new ArrayList<>();
    URLTokenizer tokenizer = new URLTokenizer();
    // create a copy of the parts list to avoid ConcurrentModificationException when sorting
    tokenizer.setParts(new ArrayList<>(parts));
    tokenizer.setUrlDecode(urlDeocde);
    tokenizer.setTokenizeHost(tokenizeHost);
    tokenizer.setTokenizePath(tokenizePath);
    tokenizer.setTokenizeQuery(tokenizeQuery);
    tokenizer.setAllowMalformed(allowMalformed || passthrough);
    tokenizer.setTokenizeMalformed(tokenizeMalformed);
    tokenizer.setReader(new StringReader(input));
    tokenizer.reset();

    String term;
    URLPart part;
    OffsetAttribute offset;
    while (tokenizer.incrementToken()) {
        term = tokenizer.getAttribute(CharTermAttribute.class).toString();
        part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
        offset = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
    }
    return tokens;
}
 
Example 20
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream, Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset) throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();

    // collect each token's output
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }

        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
            lastOffset + offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;
}
 
Example 21
Source Project: jstarcraft-nlp   Source File: CoreNlpTokenizer.java    License: Apache License 2.0
@Override
public boolean incrementToken() {
    clearAttributes();
    while (tokens == null || !tokens.hasNext())
        if (!getNextSentence())
            return false;
    CoreLabel token = tokens.next();
    // Use the lemmatized word:
    String word = token.get(LemmaAnnotation.class);
    if (word == null) { // Fallback when no lemmatization happens.
        word = token.get(TextAnnotation.class);
    }
    termAttribute.setLength(0);
    termAttribute.append(word);
    // NER or part of speech annotation
    String pos = token.get(NamedEntityTagAnnotation.class);
    pos = (pos == null || "O".equals(pos)) ? token.get(PartOfSpeechAnnotation.class) : pos;
    typeAttribute.setType(pos != null ? pos : TypeAttribute.DEFAULT_TYPE);
    // Token character offsets
    int be = token.get(CharacterOffsetBeginAnnotation.class).intValue();
    int en = token.get(CharacterOffsetEndAnnotation.class).intValue();
    offsetAttribute.setOffset(be, en);
    // Token in-document position increment:
    positionAttribute.setPositionIncrement(1 + skippedTokens);
    skippedTokens = 0;
    return true;
}
 
Example 22
Source Project: jstarcraft-nlp   Source File: TestToken.java    License: Apache License 2.0
public static void main(String[] args) {

//        SynonymsLibrary.put(SynonymsLibrary.DEFAULT, "../../library/synonyms.dic");
//
//        DicLibrary.insert(DicLibrary.DEFAULT, "清华", "n", 2000);
//        DicLibrary.insert(DicLibrary.DEFAULT, "大学", "n", 2000);

        Map<String, String> map = new HashMap<String, String>();

        map.put("type", "base_ansj");
//        map.put(SynonymsLibrary.DEFAULT, SynonymsLibrary.DEFAULT);

        Analyzer ca = new AnsjAnalyzer(map);

        String content = "我爱北京天安门天安门上太阳升我美丽的清华大学";

        try {
            TokenStream tokenStream = ca.tokenStream(content, new StringReader(content));
            // the TokenStream contract requires reset() before the first incrementToken()
            tokenStream.reset();

            while (tokenStream.incrementToken()) {
                System.out.print(tokenStream.getAttribute(CharTermAttribute.class));
                System.out.print("\t");
                System.out.print(tokenStream.getAttribute(OffsetAttribute.class).startOffset());
                System.out.print("\t");
                System.out.print(tokenStream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
                System.out.print("\t");
                System.out.println(tokenStream.getAttribute(TypeAttribute.class).type());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        ca.close();
}
 
Example 23
Source Project: jstarcraft-nlp   Source File: HanLpTokenizerTestCase.java    License: Apache License 2.0
@Test
public void testIncrementToken() throws Exception {
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 24
@Test
public void testSegmentFactory() throws Exception {
    String[] texts = {
            // whitespace only
            "     ",
            // a sentence
            "中华人民共和国(People's Republic of China),简称'中国'",
            // a longer passage
            "JStarCraft AI 1.0的目标是提供一个完整的Java机器学习(Machine Learning/ML)框架,作为人工智能在学术界与工业界的桥梁. 让相关领域的研发人员能够在各种软硬件环境/数据结构/算法/模型之间无缝切换. 涵盖了从数据处理到模型的训练与评估各个环节,支持硬件加速和并行计算,是最快最全的Java机器学习库." };

    NlpSegmentFactory factory = getSegmenterFactory();
    for (String text : texts) {
        // test Segmenter-based tokenization
        try (Tokenizer segmenter = factory.create()) {
            segmenter.setReader(new StringReader(text));
            segmenter.reset();
            while (segmenter.incrementToken()) {
                // token term
                CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
                // offsets
                OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
                // position increment
                PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
                // part of speech
                TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);
                LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
                Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
            }
        }
    }
}
 
Example 25
Source Project: jstarcraft-nlp   Source File: NlpTokenizerTestCase.java    License: Apache License 2.0
@Test
public void testTokenize() throws Exception {
    String[] texts = {
            // whitespace only
            "     ",
            // a sentence
            "中华人民共和国(People's Republic of China),简称'中国'",
            // a longer passage
            "JStarCraft AI 1.0的目标是提供一个完整的Java机器学习(Machine Learning/ML)框架,作为人工智能在学术界与工业界的桥梁. 让相关领域的研发人员能够在各种软硬件环境/数据结构/算法/模型之间无缝切换. 涵盖了从数据处理到模型的训练与评估各个环节,支持硬件加速和并行计算,是最快最全的Java机器学习库." };

    for (String text : texts) {
        // test NlpTokenizer tokenization
        NlpTokenizer<? extends NlpToken> tokenizer = getTokenizer();
        Iterable<? extends NlpToken> tokens = tokenizer.tokenize(text);
        for (NlpToken token : tokens) {
            LOGGER.debug(StringUtility.format("tokenizer:term is {}, begin is {}, end is {}", token.getTerm(), token.getBegin(), token.getEnd()));
            Assert.assertEquals(token.getTerm().toLowerCase(), text.substring(token.getBegin(), token.getEnd()).toLowerCase());
        }

        // test Segmenter-based tokenization
        try (Tokenizer segmenter = new NlpSegmenter(BreakIterator.getSentenceInstance(), tokenizer)) {
            segmenter.setReader(new StringReader(text));
            segmenter.reset();
            while (segmenter.incrementToken()) {
                // token term
                CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
                // offsets
                OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
                // position increment
                PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
                // part of speech
                TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);
                LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
                Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
            }
        }
    }
}
 
Example 26
Source Project: ongdb-lab-apoc   Source File: IKTokenizer.java    License: Apache License 2.0
/**
 * Constructor for the Lucene 4.0 Tokenizer adapter
 */
public IKTokenizer(Configuration configuration) {
    super();
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    _IKImplement = new IKSegmenter(input, configuration);
}
 
Example 27
public HLSegTokenizer(SegOption option) {
    super();
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);

    segmenterAdapter = new HLSegmenterAdapter(option);
}
 
Example 28
Source Project: elasticsearch-jieba-plugin   Source File: JiebaTokenizer.java    License: MIT License
protected JiebaTokenizer(String segModeName) {

    this.offsetAtt = addAttribute(OffsetAttribute.class);
    this.termAtt = addAttribute(CharTermAttribute.class);
    this.typeAtt = addAttribute(TypeAttribute.class);
    this.positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);

    jieba = new JiebaAdapter(segModeName);
}
 
Example 29
/**
 * Constructor for the Lucene 4.0 Tokenizer adapter
 */
public IKTokenizer(Configuration configuration) {
    super();
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    _IKImplement = new IKSegmenter(input, configuration);
}
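The constructors in Examples 26-29 only register the attributes; the matching incrementToken() implementation is what fills them for each token. A minimal hypothetical sketch of that counterpart (MySegment and nextSegment() stand in for whatever segmenter backs the tokenizer):

public final class MyTokenizer extends Tokenizer {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        // hypothetical: pull the next segment from the backing segmenter,
        // which reads from 'input' (the Reader managed by Tokenizer)
        MySegment segment = nextSegment();
        if (segment == null) {
            return false; // stream exhausted
        }
        termAtt.append(segment.text());
        offsetAtt.setOffset(correctOffset(segment.begin()), correctOffset(segment.end()));
        typeAtt.setType(segment.partOfSpeech()); // e.g. a POS tag such as "n"
        return true;
    }
}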
 
Example 30
Source Project: Elasticsearch   Source File: TransportAnalyzeAction.java    License: Apache License 2.0
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
    List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
    int lastPosition = -1;
    int lastOffset = 0;
    for (String text : request.text()) {
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            stream.reset();
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);

            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                if (increment > 0) {
                    lastPosition = lastPosition + increment;
                }
                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(), null));

            }
            stream.end();
            lastOffset += offset.endOffset();
            lastPosition += posIncr.getPositionIncrement();

            lastPosition += analyzer.getPositionIncrementGap(field);
            lastOffset += analyzer.getOffsetGap(field);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        }
    }
    return tokens;
}