org.ansj.splitWord.analysis.ToAnalysis Java Examples

The following examples show how to use org.ansj.splitWord.analysis.ToAnalysis. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: WordSegmenter.java    From SnowGraph with Apache License 2.0 6 votes vote down vote up
/**
 * Segments the text of a Word document and collects its non-ASCII-leading terms.
 *
 * <p>The file at {@code filePath} is parsed with {@code DocumentParser}. When the result is a
 * {@code WordDocumentInfo}, its text is segmented with {@code ToAnalysis.parse} and each term
 * name is appended to {@code wordsCN} unless it equals one of {@code stopTokens} or starts with
 * an ASCII letter or digit. For any other document type "Not a docx file" is printed.
 *
 * @param filePath path of the document to tokenize
 */
private static void tokenizeDocxFile(String filePath) {
    File file = new File(filePath);
    DocumentInfo doc = DocumentParser.parseFileToDocumentInfo(file);
    if (!(doc instanceof WordDocumentInfo)) {
        System.out.println("Not a docx file");
        return;
    }

    String content = ((WordDocumentInfo) doc).getDocStr();
    Result terms = ToAnalysis.parse(content);
    for (int i = 0; i < terms.size(); i++) {
        String word = terms.get(i).getName();
        // An empty (or missing) term name carries no token; skipping it also keeps
        // charAt(0) below from throwing StringIndexOutOfBoundsException.
        if (word == null || word.isEmpty()) {
            continue;
        }

        // Terms that start with an ASCII letter or digit are dropped.
        char firstLetter = word.charAt(0);
        boolean asciiAlphanumericStart = (firstLetter >= 'A' && firstLetter <= 'Z')
                || (firstLetter >= 'a' && firstLetter <= 'z')
                || (firstLetter >= '0' && firstLetter <= '9');
        if (asciiAlphanumericStart) {
            continue;
        }

        boolean isStopToken = false;
        for (String stopToken : stopTokens) {
            if (word.equals(stopToken)) {
                isStopToken = true;
                break;
            }
        }
        if (!isStopToken) {
            wordsCN.add(word);
        }
    }
}
 
Example #2
Source File: TestAnsj.java    From ansj4solr with Apache License 2.0 6 votes vote down vote up
/**
 * Demo entry point: segments a fixed sample sentence once with {@code ToAnalysis.parse} and
 * once through {@code AnsjTokenizer}, printing each token as
 * {@code term + startOffset-endOffset- + positionIncrement/}.
 *
 * @param args unused
 * @throws IOException if the tokenizer fails while reading or closing
 */
public static void main(String[] args) throws IOException {
	String text = "天天向上,媒体打打。《回家真好》";

	List<Term> parse = ToAnalysis.parse(text);
	System.out.println(parse);

	Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text), 0, true);
	// try/finally so the tokenizer is released even if incrementToken() throws.
	try {
		CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
		OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
		PositionIncrementAttribute positionIncrementAtt =
				tokenizer.addAttribute(PositionIncrementAttribute.class);

		// NOTE(review): newer Lucene versions require reset() before the first
		// incrementToken() and end() after the last one - confirm against the Lucene
		// version this project builds with.
		while (tokenizer.incrementToken()) {
			System.out.print(termAtt.toString());
			System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
			System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
		}
	} finally {
		tokenizer.close();
	}
}
 
Example #3
Source File: AnsjImpl.java    From chinese-segmentation-evaluation with Apache License 2.0 5 votes vote down vote up
/**
 * Segments {@code sentence} with ansj's {@code ToAnalysis} and re-wraps every ansj term's
 * name in this project's own {@code Term} type, preserving order.
 *
 * @param sentence the text to segment
 * @return one {@code Term} per ansj term, in segmentation order
 */
@Override
public List<Term> segment(String sentence) {
    List<Term> converted = new ArrayList<>();
    for (org.ansj.domain.Term ansjTerm : ToAnalysis.parse(sentence)) {
        String name = ansjTerm.getName();
        converted.add(new Term(name));
    }
    return converted;
}
 
Example #4
Source File: ChineseSegmenter.java    From AliceBot with Apache License 2.0 5 votes vote down vote up
/**
 * Segments {@code str} with ansj and returns the terms joined by single spaces.
 *
 * <p>Pure-ASCII input is returned unchanged. Otherwise the text is streamed through
 * {@code ToAnalysis} and the {@code toString()} form of each term is appended, separated by one
 * space. If segmentation throws, the stack trace is printed and whatever was collected so far
 * is returned (best effort).
 *
 * @param str the text to segment; must not be {@code null}
 * @return {@code str} itself when it is pure ASCII, otherwise the space-separated terms
 */
public static String analysis(String str) {
    // Every non-ASCII char needs at least two bytes in UTF-8, so equal lengths mean the
    // text is pure ASCII (no Chinese). The charset is explicit so the check does not vary
    // with the platform default encoding.
    if (str.getBytes(java.nio.charset.StandardCharsets.UTF_8).length == str.length()) {
        return str;
    }

    // Read the characters directly instead of round-tripping through bytes in the platform
    // default charset, which can corrupt Chinese text on non-Unicode defaults.
    Reader read = new java.io.StringReader(str);
    ToAnalysis toAnalysis = new ToAnalysis(read);

    StringBuilder sb = new StringBuilder();
    try {
        while (true) {
            Term term = toAnalysis.next();
            if (term == null) {
                break;
            }
            // Append a space after every term; the trailing one is trimmed below.
            sb.append(term.toString()).append(' ');
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Drop the trailing separator, but only if something was appended: deleting from an
    // empty buffer would throw StringIndexOutOfBoundsException.
    if (sb.length() > 0) {
        sb.setLength(sb.length() - 1);
    }

    // The forward substitution of "*" and "_" into these placeholder phrases is disabled in
    // this method; the reverse mapping is kept so existing output stays the same.
    return sb.toString().replaceAll("这是星号","*").replaceAll("这是下划线","_");
}
 
Example #5
Source File: WordSegmenter.java    From SnowGraph with Apache License 2.0 5 votes vote down vote up
/**
 * Segments {@code strToParse} with {@code ToAnalysis.parse} and returns the term names.
 *
 * @param strToParse the text to segment
 * @return the name of every term, in segmentation order
 */
public static ArrayList<String> demo(String strToParse) {
    Result terms = ToAnalysis.parse(strToParse);
    ArrayList<String> ret = new ArrayList<>(terms.size());
    for (int i = 0; i < terms.size(); i++) {
        // Only the term text is collected; the part-of-speech tag is not needed here.
        ret.add(terms.get(i).getName());
    }
    return ret;
}
 
Example #6
Source File: AnsjTokenizer.java    From word2vec with Apache License 2.0 5 votes vote down vote up
/**
 * Segments {@code toTokenize} with {@code ToAnalysis.parse} and returns the non-empty term
 * names as an array; an empty input (per {@code StringUtils.isEmpty}) yields an empty array.
 *
 * @param toTokenize the text to segment
 * @return the non-empty term names, in segmentation order
 */
private static String[] tokenizeToArray(String toTokenize) {
  return StringUtils.isEmpty(toTokenize)
      ? new String[0]
      : StreamSupport.stream(ToAnalysis.parse(toTokenize).spliterator(), false)
          .map(Term::getName)
          .filter(StringUtils::isNoneEmpty)
          .toArray(String[]::new);
}
 
Example #7
Source File: Segment.java    From Word2Vec with Apache License 2.0 5 votes vote down vote up
/**
 * Segments a sentence into terms, dropping punctuation and whitespace tokens.
 *
 * @param sentence the sentence to segment
 * @return the terms left after the stop-word filter has been applied
 */
public static List<Term> Seg(String sentence) {
    // A fresh filter is built per call and loaded with the punctuation marks to discard.
    FilterRecognition punctuationFilter = new FilterRecognition();
    punctuationFilter.insertStopWord(",", " ", ".", ",", "。", ":", ":", "'", "‘", "’", " ", "“", "”", "《", "》", "[", "]", "-");
    return ToAnalysis.parse(sentence)
            .recognition(punctuationFilter)
            .getTerms();
}
 
Example #8
Source File: TokenizerForSearchEngine.java    From LunarBase with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Segments {@code input_str} and scores every term name of at least two characters.
 *
 * <p>A name is stored with score 0 on its first occurrence and the score is incremented by
 * one for each further occurrence. The {@code tokens} and {@code token_iterator} fields are
 * reassigned, and the iterator is fully consumed when this method returns.
 *
 * @param input_str the text to segment
 * @return a map from term name to its {@code TermScore}
 */
@Override
public HashMap<String, TermScore> tokenizeTerm(String input_str) {
	tokens = ToAnalysis.parse(input_str);
	token_iterator = tokens.listIterator();

	HashMap<String, TermScore> scores = new HashMap<String, TermScore>();
	while (token_iterator.hasNext()) {
		String name = token_iterator.next().getName();
		// Names shorter than two characters are not scored.
		if (name.length() < 2) {
			continue;
		}

		TermScore existing = scores.get(name);
		if (existing == null) {
			scores.put(name, new TermScore(name, 0));
		} else {
			// The entry is already in the map, so updating it in place is enough.
			existing.setScore(existing.getScore() + 1);
		}
	}

	return scores;
}
 
Example #9
Source File: NatureRecognition.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Guesses the part-of-speech nature of a word by rule.
 *
 * <p>The word is first matched against {@code SUFFIX_FOREST}, character by character from its
 * last character backwards, to obtain a candidate nature string. That candidate, the word
 * length and two further checks then select one of the {@code TermNatures} constants;
 * {@code TermNatures.NW} is returned when no rule applies.
 *
 * @param word the word whose nature is to be guessed
 * @return {@code TermNatures.NT}, {@code NS}, {@code NR}, {@code NRF} or {@code NW}
 */
public static TermNatures guessNature(String word) {
    String nature = null;
    SmartForest<String[]> smartForest = SUFFIX_FOREST;
    // Number of trailing characters of the word that were matched in the forest.
    int len = 0;
    for (int i = word.length() - 1; i >= 0; i--) {
        smartForest = smartForest.get(word.charAt(i));
        if (smartForest == null) {
            // No branch for this character: keep whatever nature was recorded so far.
            break;
        }
        len++;
        // Status 2 records the node's nature and keeps walking; status 3 records it and
        // stops. NOTE(review): the exact meaning of these codes is defined by SmartForest -
        // confirm there.
        if (smartForest.getStatus() == 2) {
            nature = smartForest.getParam()[0];
        } else if (smartForest.getStatus() == 3) {
            nature = smartForest.getParam()[0];
            break;
        }
    }

    if ("nt".equals(nature) && (len > 1 || word.length() > 3)) {
        return TermNatures.NT;
    } else if ("ns".equals(nature)) {
        return TermNatures.NS;
    } else if (word.length() < 5) {
        // Short word: segment it and report NR if any resulting term is tagged "nr".
        Result parse = ToAnalysis.parse(word);
        for (Term term : parse.getTerms()) {
            if ("nr".equals(term.getNatureStr())) {
                return TermNatures.NR;
            }
        }
    } else if (ForeignPersonRecognition.isFName(word)) {
        // Only reached for words of five or more characters, because of the branch above.
        return TermNatures.NRF;
    }

    // Fallback when none of the rules above produced a result.
    return TermNatures.NW;
}
 
Example #10
Source File: MainTest.java    From AliceBot with Apache License 2.0 4 votes vote down vote up
/** Prints the {@code ToAnalysis.parse} result for a fixed mixed Chinese/ASCII sample sentence. */
@Test
public void test(){
    System.out.println(ToAnalysis.parse(
            "欢迎使用ansj_seg,(ansj中文分词)在这里如果你遇到什么问题都可以联系我.我一定尽我所能.帮助大家.ansj_seg更快,更准,更自由!"));
}
 
Example #11
Source File: MainTest.java    From AliceBot with Apache License 2.0 4 votes vote down vote up
/**
 * Streams a fixed sample sentence through a reader-backed {@code ToAnalysis} and prints each
 * term on its own line until {@code next()} returns {@code null}.
 *
 * @throws IOException if reading from the analyzer fails
 */
@Test
public void test1() throws IOException {
    String str = "欢迎使用ansj_seg,(ansj中文分词)在这里如果你遇到什么问题都可以联系我.我一定尽我所能.帮助大家.ansj_seg更快,更准,更自由!" ;

    // Read the characters directly rather than encoding to bytes and decoding again with the
    // platform default charset, which can corrupt Chinese text on non-Unicode defaults.
    Reader read = new java.io.StringReader(str);
    ToAnalysis toAnalysis = new ToAnalysis(read);

    for (Term term = toAnalysis.next(); term != null; term = toAnalysis.next()) {
        System.out.println(  term);
    }
}
 
Example #12
Source File: TFIDF.java    From NewsRecommendSystem with MIT License 4 votes vote down vote up
/**
 * Segments {@code text} by delegating to {@code ToAnalysis.parse}.
 *
 * @param text the text to segment
 * @return the segmentation result produced by ansj
 */
public static Result split(String text) {
	Result segmented = ToAnalysis.parse(text);
	return segmented;
}
 
Example #13
Source File: TokenizerForSearchEngine.java    From LunarBase with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Segments {@code input_str} and stores the result in the {@code tokens} field, then points
 * {@code token_iterator} at a fresh list iterator over it.
 *
 * @param input_str the text to segment
 */
public void tokenize(String input_str) {
	this.tokens = ToAnalysis.parse(input_str);
	this.token_iterator = this.tokens.listIterator();
}