org.wltea.analyzer.core.IKSegmenter Java Examples

The following examples show how to use org.wltea.analyzer.core.IKSegmenter. They are drawn from several open-source projects; the source file, project, and license are noted above each example.
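
All of these examples follow the same core pattern: construct an IKSegmenter over a java.io.Reader, passing a boolean flag that selects smart (coarse-grained, disambiguated) or fine-grained segmentation, then call next() repeatedly to pull Lexeme tokens until it returns null. A minimal standalone sketch of that pattern (the sample text is illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IkSegmenterDemo {
    public static void main(String[] args) throws IOException {
        // true selects smart mode; false produces fine-grained tokens
        IKSegmenter segmenter = new IKSegmenter(new StringReader("中文分词测试"), true);
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            System.out.println(lexeme.getLexemeText());
        }
    }
}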
Example #1
Source File: IKAnalyzer.java    From hugegraph with Apache License 2.0
@Override
public Set<String> segment(String text) {
    Set<String> result = InsertionOrderUtil.newSet();
    IKSegmenter ik = new IKSegmenter(new StringReader(text),
                                     this.smartSegMode);
    try {
        Lexeme word = null;
        while ((word = ik.next()) != null) {
            result.add(word.getLexemeText());
        }
    } catch (Exception e) {
        throw new HugeException("IKAnalyzer segment text '%s' failed",
                                e, text);
    }
    return result;
}
 
Example #2
Source File: TokenizerAnalyzerUtils.java    From JewelCrawler with GNU General Public License v3.0
public static String getAnalyzerResult(String input) {
    StringReader sr = new StringReader(input);
    IKSegmenter ik = new IKSegmenter(sr, true); // true enables smart segmentation
    Lexeme lex = null;
    List<String> stopWordsList = getStopWordsList();
    StringBuilder stringBuilder = new StringBuilder();

    try {
        while ((lex = ik.next()) != null) {
            if (stopWordsList.contains(lex.getLexemeText())) {
                continue;
            }
            stringBuilder.append(lex.getLexemeText()).append(Constants.BLANKSPACE);
        }
    } catch (IOException e) {
        e.printStackTrace();
        System.out.println("failed to parse input content");
    }
    return stringBuilder.toString();
}
 
Example #3
Source File: ChineseTokenizer.java    From RDMP1 with GNU General Public License v2.0
/**
 * Segments the given text and returns each word with its
 * occurrence count, in insertion order.
 *
 * @param content the text to segment
 * @return a LinkedHashMap mapping each word to its frequency
 */
public static Map<String, Long> segStr(String content) {
    // segment the text
    Reader input = new StringReader(content);
    // use smart segmentation (this setting greatly affects segmentation granularity)
    IKSegmenter iks = new IKSegmenter(input, true);
    Lexeme lexeme = null;
    Map<String, Long> words = new LinkedHashMap<String, Long>();
    try {
        while ((lexeme = iks.next()) != null) {
            if (words.containsKey(lexeme.getLexemeText())) {
                words.put(lexeme.getLexemeText(), words.get(lexeme.getLexemeText()) + 1);
            } else {
                words.put(lexeme.getLexemeText(), 1L);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return words;
}
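
On Java 8 and later, the containsKey branch above can be collapsed into a single Map.merge call; an equivalent sketch of just the counting loop:

while ((lexeme = iks.next()) != null) {
    // insert 1 on first sight, otherwise add 1 to the existing count
    words.merge(lexeme.getLexemeText(), 1L, Long::sum);
}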
 
Example #4
Source File: StrUtils.java    From Lottery with GNU General Public License v2.0
/**
 * Segments the source text into keywords.
 *
 * @param keyword the source text
 * @param smart whether to use smart segmentation
 * @return the segmented terms, joined by commas
 */
public static String getKeywords(String keyword, boolean smart) {
	StringReader reader = new StringReader(keyword);
	IKSegmenter iks = new IKSegmenter(reader, smart);
	StringBuilder buffer = new StringBuilder();
	try {
		Lexeme lexeme;
		while ((lexeme = iks.next()) != null) {
			buffer.append(lexeme.getLexemeText()).append(',');
		}
	} catch (IOException e) {
		// reading from an in-memory StringReader should not fail
	}
	// remove the trailing comma
	if (buffer.length() > 0) {
		buffer.setLength(buffer.length() - 1);
	}
	return buffer.toString();
}
 
Example #5
Source File: IkSegmentFactory.java    From jstarcraft-nlp with Apache License 2.0
@Override
protected NlpTokenizer<? extends NlpToken> getNlpTokenizer(Map<String, String> configurations) {
    IKSegmenter segmenter = build(configurations);
    return new IkTokenizer(segmenter);
}
 
Example #6
Source File: IkSegmentFactory.java    From jstarcraft-nlp with Apache License 2.0
@Override
public IKSegmenter build(Map<String, String> configurations) {
    Boolean useSmart = getBoolean(configurations, "useSmart", false);
    IKSegmenter segmenter = new IKSegmenter(null, useSmart);

    return segmenter;
}
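
Note that the segmenter above is constructed with a null Reader; this works because IKSegmenter can receive its input after construction. A minimal sketch, assuming the standard IK Analyzer API in which IKSegmenter exposes reset(Reader) to (re)bind input before tokenizing (the sample text is illustrative):

IKSegmenter segmenter = new IKSegmenter(null, true);
// bind the actual input before calling next()
segmenter.reset(new StringReader("中文分词测试"));
Lexeme lexeme;
while ((lexeme = segmenter.next()) != null) {
    System.out.println(lexeme.getLexemeText());
}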
 
Example #7
Source File: IKTokenizer.java    From Elasticsearch-Tutorial-zh-CN with GNU General Public License v3.0
/**
 * Constructor for the Lucene 4.0 Tokenizer adapter class
 */
public IKTokenizer(Configuration configuration){
    super();
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    _IKImplement = new IKSegmenter(input, configuration);
}
 
Example #8
Source File: IKTokenizer.java    From es-ik with Apache License 2.0
public IKTokenizer(Reader in, DictionaryConfiguration configuration) {
    super(in);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    _IKImplement = new IKSegmenter(input, configuration);
}
 
Example #9
Source File: IKSegmenterTest.java    From es-ik with Apache License 2.0
@Test
public void testSegment() throws Exception {
    Reader in = new StringReader("一一分 准确值就是它们听上去的那样。干柴诸如日期或用户ID。当然字符串也可以是准确值,如用户名或邮件地址。准确值Foo与准确值foo是不同的。准确值2014和准确值2014-09-15也是不同的。测试");
    boolean useSmart = true;
    IKSegmenter segmenter = new IKSegmenter(in, MockDictionary.smartModeSqlite3Configure());

    assertSegmenterCorrect(segmenter.next(), "一一分", 0, 3, 3, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "准确值", 4, 7, 3, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "听", 11, 12, 1, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "上去", 12, 14, 2, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "干柴", 18, 20, 2, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "诸如", 20, 22, 2, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "日期", 22, 24, 2, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "用户", 25, 27, 2, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "id", 27, 29, 2, "ENGLISH");
    assertSegmenterCorrect(segmenter.next(), "当然", 30, 32, 2, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "字符串", 32, 35, 3, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "以是", 37, 39, 2, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "准确值", 39, 42, 3, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "用户名", 44, 47, 3, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "邮件地址", 48, 52, 4, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "准确值", 53, 56, 3, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "foo", 56, 59, 3, "ENGLISH");
    assertSegmenterCorrect(segmenter.next(), "准确值", 60, 63, 3, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "foo", 63, 66, 3, "ENGLISH");
    assertSegmenterCorrect(segmenter.next(), "不同", 67, 69, 2, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "准确值", 71, 74, 3, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "2014", 74, 78, 4, "ARABIC");
    assertSegmenterCorrect(segmenter.next(), "准确值", 79, 82, 3, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "2014-09-15", 82, 92, 10, "LETTER");
    assertSegmenterCorrect(segmenter.next(), "也是", 92, 94, 2, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "不同", 94, 96, 2, "CN_WORD");
    assertSegmenterCorrect(segmenter.next(), "测试", 98, 100, 2, "CN_WORD");
}
 
Example #10
Source File: SWMCQueryBuilder.java    From IKAnalyzer with Apache License 2.0
/**
 * Segments the keywords and returns the resulting lexeme list
 * @param keywords the text to segment
 * @return the list of lexemes
 */
private static List<Lexeme> doAnalyze(String keywords){
	List<Lexeme> lexemes = new ArrayList<Lexeme>();
	IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords), true);
	try {
		Lexeme l = null;
		while ((l = ikSeg.next()) != null) {
			lexemes.add(l);
		}
	} catch (IOException e) {
		e.printStackTrace();
	}
	return lexemes;
}
 
Example #11
Source File: IKTokenizer.java    From IKAnalyzer with Apache License 2.0
/**
 * Constructor for the Lucene 3.5 Tokenizer adapter class
 *
 * @param in a {@link java.io.Reader} object.
 * @param useSmart a boolean.
 */
public IKTokenizer(Reader in, boolean useSmart){
    super(in);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    _IKImplement = new IKSegmenter(in, useSmart);
}
 
Example #12
Source File: SWMCQueryBuilder.java    From ik-analyzer with GNU General Public License v3.0
/**
 * Segments the keywords and returns the resulting lexeme list
 *
 * @param keywords the text to segment
 *
 * @return the list of lexemes
 */
private static List<Lexeme> doAnalyze(String keywords) {
    List<Lexeme> lexemes = new ArrayList<Lexeme>();
    IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords), true);
    try {
        Lexeme l;
        while ((l = ikSeg.next()) != null) {
            lexemes.add(l);
        }
    } catch (IOException e) {
        LOG.error("io error.", e);
    }
    return lexemes;
}
 
Example #13
Source File: IKTokenizer.java    From ik-analyzer with GNU General Public License v3.0
/**
 * Constructor for the Lucene 4.0 Tokenizer adapter class
 *
 * @param useSmart whether to use smart segmentation
 */
public IKTokenizer(boolean useSmart) {
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    ikimplement = new IKSegmenter(input, useSmart);
}
 
Example #14
Source File: IkTokenizer.java    From jstarcraft-nlp with Apache License 2.0
private void init(boolean useSmart) {
    _IKImplement = new IKSegmenter(input, useSmart);
}
 
Example #15
Source File: IkTokenizer.java    From jstarcraft-nlp with Apache License 2.0
public IkTokenizer(IKSegmenter segmenter) {
    this.segmenter = segmenter;
}
 
Example #16
Source File: IkTokenizerTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Override
protected NlpTokenizer<? extends NlpToken> getTokenizer() {
    return new IkTokenizer(new IKSegmenter(null, true));
}