org.wltea.analyzer.core.Lexeme Java Examples

The following examples show how to use org.wltea.analyzer.core.Lexeme, the token type produced by the IK Analyzer Chinese segmenter. Each example notes its source file, the project it comes from, and that project's license.
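Every example below follows the same basic pattern: wrap the input text in a Reader, hand it to an IKSegmenter, and call next() until it returns null, reading the text, offsets, length, and type off each Lexeme. A minimal self-contained sketch of that pattern (the sample input string is only illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class LexemeDemo {
    public static void main(String[] args) throws IOException {
        // true enables IK's smart (coarse-grained) segmentation mode
        IKSegmenter segmenter = new IKSegmenter(new StringReader("这是一个中文分词的例子"), true);
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            // Each Lexeme exposes its text, character offsets, length, and type
            System.out.printf("%s [%d,%d) length=%d type=%s%n",
                    lexeme.getLexemeText(),
                    lexeme.getBeginPosition(),
                    lexeme.getEndPosition(),
                    lexeme.getLength(),
                    lexeme.getLexemeTypeString());
        }
    }
}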
Example #1
Source File: IkTokenizer.java    From jstarcraft-nlp with Apache License 2.0
@Override
public Iterable<IkToken> tokenize(CharSequence text) {
    try {
        segmenter.reset(new StringReader(text.toString()));
        // Drain the segmenter into a list of lexemes
        LinkedList<Lexeme> lexemes = new LinkedList<>();
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            lexemes.add(lexeme);
        }
        return new IkToken(lexemes.iterator());
    } catch (Exception exception) {
        throw new RuntimeException(exception);
    }
}
 
Example #2
Source File: StrUtils.java    From Lottery with GNU General Public License v2.0
/**
 * Segments the keyword text and joins the resulting terms with commas.
 *
 * @param keyword the source text to segment
 * @param smart whether to use smart segmentation
 * @return the segmented terms joined by ','
 */
public static String getKeywords(String keyword, boolean smart) {
	StringReader reader = new StringReader(keyword);
	IKSegmenter iks = new IKSegmenter(reader, smart);
	StringBuilder buffer = new StringBuilder();
	try {
		Lexeme lexeme;
		while ((lexeme = iks.next()) != null) {
			buffer.append(lexeme.getLexemeText()).append(',');
		}
	} catch (IOException e) {
		// Ignored: reading from an in-memory StringReader should not fail
	}
	//Remove the trailing comma
	if (buffer.length() > 0) {
		buffer.setLength(buffer.length() - 1);
	}
	return buffer.toString();
}
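A hypothetical call (the exact terms depend on the dictionaries IK loads):

// Hypothetical usage: smart-segment a phrase into comma-joined terms
String joined = StrUtils.getKeywords("今天天气很好", true);
// joined holds the terms separated by commas, e.g. something like "今天,天气,很好"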
 
Example #3
Source File: IKTokenizer.java    From ik-analyzer with GNU General Public License v3.0
@Override
public boolean incrementToken() throws IOException {
    //Clear all token attributes
    clearAttributes();
    Lexeme nextLexeme = ikimplement.next();
    if (nextLexeme != null) {
        //Convert the Lexeme into Lucene attributes
        //Set the token text
        termAtt.append(nextLexeme.getLexemeText());
        //Set the token length
        termAtt.setLength(nextLexeme.getLength());
        //Set the token offsets
        offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
        //Record the final position of segmentation
        endPosition = nextLexeme.getEndPosition();
        //Record the token type
        typeAtt.setType(nextLexeme.getLexemeTypeString());
        //Return true to signal that more tokens may follow
        return true;
    }
    //Return false to signal that token output is complete
    return false;
}
 
Example #4
Source File: ChineseTokenizer.java    From RDMP1 with GNU General Public License v2.0
/**
 * Segments the given content and counts how often each term occurs,
 * preserving first-seen order in a LinkedHashMap.
 *
 * @param content the text to segment
 * @return a map from each term to its occurrence count
 */
public static Map<String, Long> segStr(String content){
    // Segment the text
    Reader input = new StringReader(content);
    // true enables smart segmentation, which strongly affects term granularity
    IKSegmenter iks = new IKSegmenter(input, true);
    Lexeme lexeme = null;
    Map<String, Long> words = new LinkedHashMap<String, Long>();
    try {
        while ((lexeme = iks.next()) != null) {
            // merge() increments the count, inserting 1 on first occurrence
            words.merge(lexeme.getLexemeText(), 1L, Long::sum);
        }
    }catch(IOException e) {
        e.printStackTrace();
    }
    return words;
}
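A hypothetical call (term boundaries again depend on IK's dictionaries):

// Hypothetical usage: count how often each term occurs, in first-seen order
Map<String, Long> counts = ChineseTokenizer.segStr("好好学习,天天向上,好好学习");
for (Map.Entry<String, Long> entry : counts.entrySet()) {
    System.out.println(entry.getKey() + " -> " + entry.getValue());
}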
 
Example #5
Source File: IKTokenizer.java    From IKAnalyzer with Apache License 2.0
/** {@inheritDoc} */
@Override
public boolean incrementToken() throws IOException {
	//Clear all token attributes
	clearAttributes();
	Lexeme nextLexeme = _IKImplement.next();
	if(nextLexeme != null){
		//Convert the Lexeme into Lucene attributes
		//Set the token text
		termAtt.append(nextLexeme.getLexemeText());
		//Set the token length
		termAtt.setLength(nextLexeme.getLength());
		//Set the token offsets
		offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
		//Record the final position of segmentation
		finalOffset = nextLexeme.getEndPosition();
		//Return true to signal that more tokens may follow
		return true;
	}
	//Return false to signal that token output is complete
	return false;
}
 
Example #6
Source File: IKTokenizer.java    From es-ik with Apache License 2.0
@Override
public boolean incrementToken() throws IOException {
    //Clear all token attributes
    clearAttributes();
    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
        //Convert the Lexeme into Lucene attributes
        //Set the token text
        termAtt.append(nextLexeme.getLexemeText());
        //Set the token length
        termAtt.setLength(nextLexeme.getLength());
        //Set the token offsets
        offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
        //Record the final position of segmentation
        endPosition = nextLexeme.getEndPosition();
        //Record the token type
        typeAtt.setType(nextLexeme.getLexemeTypeString());
        //Return true to signal that more tokens may follow
        return true;
    }
    //Return false to signal that token output is complete
    return false;
}
 
Example #7
Source File: TokenizerAnalyzerUtils.java    From JewelCrawler with GNU General Public License v3.0
public static String getAnalyzerResult(String input) {
    StringReader sr = new StringReader(input);
    IKSegmenter ik = new IKSegmenter(sr, true); // true enables smart segmentation
    Lexeme lex = null;
    List<String> stopWordsList = getStopWordsList();
    StringBuilder stringBuilder = new StringBuilder();

    try {
        while ((lex = ik.next()) != null) {
            if (stopWordsList.contains(lex.getLexemeText())) {
                continue;
            }
            stringBuilder.append(lex.getLexemeText()).append(Constants.BLANKSPACE);
        }
    } catch (IOException e) {
        e.printStackTrace();
        System.out.println("failed to parse input content");
    }
    return stringBuilder.toString();
}
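A hypothetical call (what gets dropped depends on the stop words returned by getStopWordsList()):

// Hypothetical usage: segment text into space-separated terms, minus stop words
String terms = TokenizerAnalyzerUtils.getAnalyzerResult("这是一个测试");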
 
Example #8
Source File: IKAnalyzer.java    From hugegraph with Apache License 2.0
@Override
public Set<String> segment(String text) {
    Set<String> result = InsertionOrderUtil.newSet();
    IKSegmenter ik = new IKSegmenter(new StringReader(text),
                                     this.smartSegMode);
    try {
        Lexeme word = null;
        while ((word = ik.next()) != null) {
            result.add(word.getLexemeText());
        }
    } catch (Exception e) {
        throw new HugeException("IKAnalyzer segment text '%s' failed",
                                e, text);
    }
    return result;
}
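A hypothetical call (analyzer stands in for an IKAnalyzer instance; smartSegMode is configured elsewhere in the class):

// Hypothetical usage: collect the distinct terms of a text, in insertion order
Set<String> terms = analyzer.segment("中文分词测试");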
 
Example #9
Source File: IkTokenizer.java    From jstarcraft-nlp with Apache License 2.0
@Override
public boolean incrementToken() throws IOException {
    // Clear all token attributes
    clearAttributes();
    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
        // Convert the Lexeme into Lucene attributes
        // Set the token text
        termAttribute.append(nextLexeme.getLexemeText());
        // Set the token length
        termAttribute.setLength(nextLexeme.getLength());
        // Set the token offsets
        offsetAttribute.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
        // Record the final position of segmentation
        endPosition = nextLexeme.getEndPosition();
        // Record the token type
        typeAttribute.setType(nextLexeme.getLexemeTypeString());
        // Return true to signal that more tokens may follow
        return true;
    }
    // Return false to signal that token output is complete
    return false;
}
 
Example #10
Source File: IKSegmenterTest.java    From es-ik with Apache License 2.0
private void assertSegmenterCorrect(Lexeme nextLexeme, String lexemeText, int begin, int end, int length, String type) {
    Assert.assertEquals(nextLexeme.getLexemeText(), lexemeText);
    Assert.assertEquals(nextLexeme.getBeginPosition(), begin);
    Assert.assertEquals(nextLexeme.getEndPosition(), end);
    Assert.assertEquals(nextLexeme.getLength(), length);
    Assert.assertEquals(nextLexeme.getLexemeTypeString(), type);
}
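A hypothetical call from a test method (the expected text, positions, and the "CN_WORD" type label are illustrative):

// Hypothetical usage: verify the first lexeme produced for some input
Lexeme first = segmenter.next();
assertSegmenterCorrect(first, "中文", 0, 2, 2, "CN_WORD");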
 
Example #11
Source File: IKSegmenterTest.java    From es-ik with Apache License 2.0
private void print(Lexeme nextLexeme){
    System.out.println(nextLexeme.getLexemeText());
    System.out.println(nextLexeme.getBeginPosition());
    System.out.println(nextLexeme.getEndPosition());
    System.out.println(nextLexeme.getLength());
    System.out.println(nextLexeme.getLexemeTypeString());
}
 
Example #12
Source File: SWMCQueryBuilder.java    From IKAnalyzer with Apache License 2.0
/**
 * Generates a SWMCQuery.
 *
 * @param fieldName a {@link java.lang.String} object.
 * @param keywords a {@link java.lang.String} object.
 * @param quickMode a boolean.
 * @return Lucene Query
 */
public static Query create(String fieldName, String keywords, boolean quickMode){
	if(fieldName == null || keywords == null){
		throw new IllegalArgumentException("Parameters fieldName and keywords must not be null.");
	}
	//1. Segment the keywords
	List<Lexeme> lexemes = doAnalyze(keywords);
	//2. Build the SWMCQuery from the segmentation result
	Query _SWMCQuery = getSWMCQuery(fieldName, lexemes, quickMode);
	return _SWMCQuery;
}
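A hypothetical call (field name, keywords, and the quickMode flag are illustrative):

// Hypothetical usage: build a SWMC query over the "content" field
Query query = SWMCQueryBuilder.create("content", "中文分词", false);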
 
Example #13
Source File: SWMCQueryBuilder.java    From IKAnalyzer with Apache License 2.0
/**
 * Segments the keywords and returns the resulting lexeme list.
 * @param keywords the text to segment
 * @return the list of lexemes
 */
private static List<Lexeme> doAnalyze(String keywords){
	List<Lexeme> lexemes = new ArrayList<Lexeme>();
	IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
	try{
		Lexeme l = null;
		while( (l = ikSeg.next()) != null){
			lexemes.add(l);
		}
	}catch(IOException e){
		e.printStackTrace();
	}
	return lexemes;
}
 
Example #14
Source File: IKTokenizer.java    From Elasticsearch-Tutorial-zh-CN with GNU General Public License v3.0
@Override
public boolean incrementToken() throws IOException {
	//Clear all token attributes
	clearAttributes();
	skippedPositions = 0;

	Lexeme nextLexeme = _IKImplement.next();
	if(nextLexeme != null){
		posIncrAtt.setPositionIncrement(skippedPositions + 1);

		//Convert the Lexeme into Lucene attributes
		//Set the token text
		termAtt.append(nextLexeme.getLexemeText());
		//Set the token length
		termAtt.setLength(nextLexeme.getLength());
		//Set the token offsets (corrected for any char filters)
		offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));

		//Record the final position of segmentation
		endPosition = nextLexeme.getEndPosition();
		//Record the token type
		typeAtt.setType(nextLexeme.getLexemeTypeString());
		//Return true to signal that more tokens may follow
		return true;
	}
	//Return false to signal that token output is complete
	return false;
}
 
Example #15
Source File: SWMCQueryBuilder.java    From ik-analyzer with GNU General Public License v3.0
/**
 * Generates a SWMCQuery.
 *
 * @param fieldName the field to query against
 * @param keywords the keywords to segment
 * @param quickMode whether to use quick mode
 *
 * @return Lucene Query
 */
public static Query create(String fieldName, String keywords, boolean quickMode) {
    if (fieldName == null || keywords == null) {
        throw new IllegalArgumentException("Parameters fieldName and keywords must not be null.");
    }
    //1. Segment the keywords
    List<Lexeme> lexemes = doAnalyze(keywords);
    //2. Build the SWMCQuery from the segmentation result
    Query swmcQuery = getSWMCQuery(fieldName, lexemes, quickMode);
    return swmcQuery;
}
 
Example #16
Source File: SWMCQueryBuilder.java    From ik-analyzer with GNU General Public License v3.0
/**
 * Segments the keywords and returns the resulting lexeme list.
 *
 * @param keywords the text to segment
 *
 * @return the list of lexemes
 */
private static List<Lexeme> doAnalyze(String keywords) {
    List<Lexeme> lexemes = new ArrayList<Lexeme>();
    IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords), true);
    try {
        Lexeme l;
        while ((l = ikSeg.next()) != null) {
            lexemes.add(l);
        }
    } catch (IOException e) {
        LOG.error("io error.", e);
    }
    return lexemes;
}
 
Example #17
Source File: IkToken.java    From jstarcraft-nlp with Apache License 2.0
public IkToken(Iterator<Lexeme> iterator) {
    this.iterator = iterator;
}