Java Code Examples for org.ansj.splitWord.analysis.ToAnalysis#parse()

The following examples show how to use org.ansj.splitWord.analysis.ToAnalysis#parse(). You can vote up the examples you find useful or vote down those you don't, and follow the links above each example to view the original project or source file. You may also check out the related API usage in the sidebar.
Example 1
Source File: WordSegmenter.java    From SnowGraph with Apache License 2.0 6 votes vote down vote up
/**
 * Tokenizes the textual content of a .docx file with ansj's standard analyzer
 * and collects the surviving Chinese tokens into {@code wordsCN}.
 *
 * <p>Tokens are dropped when they match an entry in {@code stopTokens} or when
 * their first character is an ASCII letter or digit (i.e. non-Chinese tokens).
 * Prints a diagnostic to stdout when the file is not a Word document.
 *
 * @param filePath path of the .docx file to tokenize
 */
private static void tokenizeDocxFile(String filePath) {
    File file = new File(filePath);
    DocumentInfo doc = DocumentParser.parseFileToDocumentInfo(file);
    if (!(doc instanceof WordDocumentInfo)) {
        System.out.println("Not a docx file");
        return;
    }
    String content = ((WordDocumentInfo) doc).getDocStr();
    Result terms = ToAnalysis.parse(content);
    for (int i = 0; i < terms.size(); i++) {
        String words = terms.get(i).getName();
        // Guard: charAt(0) below would throw StringIndexOutOfBoundsException
        // on an empty token, which the original code did not handle.
        if (words.isEmpty()) {
            continue;
        }
        boolean filtered = false;
        for (String stopToken : stopTokens) {
            if (words.equals(stopToken)) {
                filtered = true;
                break;
            }
        }
        // Filter tokens that start with an ASCII letter or digit.
        char firstLetter = words.charAt(0);
        if ((firstLetter >= 'A' && firstLetter <= 'Z')
                || (firstLetter >= 'a' && firstLetter <= 'z')
                || (firstLetter >= '0' && firstLetter <= '9')) {
            filtered = true;
        }
        if (filtered) {
            continue;
        }
        wordsCN.add(words);
    }
}
 
Example 2
Source File: TestAnsj.java    From ansj4solr with Apache License 2.0 6 votes vote down vote up
/**
 * Demo entry point: prints the ansj segmentation of a sample sentence, then
 * streams the same text through {@code AnsjTokenizer}, printing each token's
 * text followed by its start/end offsets and position increment.
 *
 * @param args unused
 * @throws IOException if the tokenizer fails while reading the input
 */
public static void main(String[] args) throws IOException {
	String sample = "天天向上,媒体打打。《回家真好》";

	List<Term> parse = ToAnalysis.parse(sample);
	System.out.println(parse);

	Tokenizer tokenizer = new AnsjTokenizer(new StringReader(sample), 0, true);
	CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
	PositionIncrementAttribute positionIncrementAtt =
			tokenizer.addAttribute(PositionIncrementAttribute.class);

	// Format per token: <term><start>-<end>-<posIncr>/
	while (tokenizer.incrementToken()) {
		System.out.print(termAtt.toString());
		System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
		System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
	}
	tokenizer.close();
}
 
Example 3
Source File: AnsjImpl.java    From chinese-segmentation-evaluation with Apache License 2.0 5 votes vote down vote up
@Override
public List<Term> segment(String sentence) {
    // Run ansj's standard analysis and adapt each ansj term to this
    // framework's Term type.
    List<Term> segmented = new ArrayList<>();
    for (org.ansj.domain.Term ansjTerm : ToAnalysis.parse(sentence)) {
        segmented.add(new Term(ansjTerm.getName()));
    }
    return segmented;
}
 
Example 4
Source File: WordSegmenter.java    From SnowGraph with Apache License 2.0 5 votes vote down vote up
/**
 * Segments the given text with ansj's standard analyzer and returns the
 * surface form of every token, in order.
 *
 * @param strToParse raw text to segment
 * @return the token strings produced by the analyzer
 */
public static ArrayList<String> demo(String strToParse) {
    // Dropped: unused 'nominal' (part-of-speech) local, the redundant 'str'
    // alias of the parameter, and dead commented-out sample text / println.
    ArrayList<String> ret = new ArrayList<>();
    Result terms = ToAnalysis.parse(strToParse);
    for (int i = 0; i < terms.size(); i++) {
        ret.add(terms.get(i).getName());
    }
    return ret;
}
 
Example 5
Source File: TokenizerForSearchEngine.java    From LunarBase with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Tokenizes {@code input_str} with ansj's standard analyzer and builds a map
 * from token text to a {@code TermScore}.
 *
 * <p>Tokens shorter than two characters are ignored. NOTE(review): the first
 * occurrence of a token is scored 0 and each repeat adds 1, so the stored
 * score is (occurrences - 1); preserved as-is — confirm this is intended.
 *
 * <p>Side effects: overwrites the {@code tokens} and {@code token_iterator}
 * fields with the result of this parse.
 *
 * @param input_str raw text to tokenize
 * @return map from token text to its score entry
 */
@Override
public HashMap<String, TermScore> tokenizeTerm(String input_str) {
	tokens = ToAnalysis.parse(input_str);
	token_iterator = tokens.listIterator();

	HashMap<String, TermScore> hash = new HashMap<String, TermScore>();
	while (token_iterator.hasNext()) {
		Term term = token_iterator.next();
		String name = term.getName();
		if (name.length() < 2) {
			continue; // single-character tokens carry little weight
		}
		TermScore existing = hash.get(name);
		if (existing == null) {
			hash.put(name, new TermScore(name, 0));
		} else {
			// Fix: the original re-put the same object that was already in the
			// map — mutating it in place is sufficient.
			existing.setScore(existing.getScore() + 1);
		}
	}
	return hash;
}
 
Example 6
Source File: NatureRecognition.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Guesses the part of speech of a word by rule.
 *
 * <p>Walks the suffix trie ({@code SUFFIX_FOREST}) from the word's last
 * character backwards to find the nature tag of its longest known suffix,
 * then applies heuristics: organization (NT), place (NS), person name (NR,
 * found by re-parsing short words), or foreign person name (NRF). Falls back
 * to NW (new word) when nothing matches.
 *
 * @param word the word whose nature is to be guessed
 * @return the guessed {@code TermNatures}
 */
public static TermNatures guessNature(String word) {
    String nature = null;
    SmartForest<String[]> branch = SUFFIX_FOREST;
    int matchedLen = 0;
    for (int i = word.length() - 1; i >= 0; i--) {
        branch = branch.get(word.charAt(i));
        if (branch == null) {
            break; // no deeper suffix known
        }
        matchedLen++;
        int status = branch.getStatus();
        if (status == 2) {
            // Valid match so far, but a longer suffix may still exist.
            nature = branch.getParam()[0];
        } else if (status == 3) {
            // Terminal node: longest suffix reached.
            nature = branch.getParam()[0];
            break;
        }
    }

    if ("nt".equals(nature) && (matchedLen > 1 || word.length() > 3)) {
        return TermNatures.NT;
    } else if ("ns".equals(nature)) {
        return TermNatures.NS;
    } else if (word.length() < 5) {
        // Short unknown word: re-segment it and look for a person-name tag.
        Result parse = ToAnalysis.parse(word);
        for (Term term : parse.getTerms()) {
            if ("nr".equals(term.getNatureStr())) {
                return TermNatures.NR;
            }
        }
    } else if (ForeignPersonRecognition.isFName(word)) {
        return TermNatures.NRF;
    }

    return TermNatures.NW;
}
 
Example 7
Source File: TFIDF.java    From NewsRecommendSystem with MIT License 4 votes vote down vote up
/**
 * Segments {@code text} with ansj's standard analyzer.
 *
 * @param text raw text to segment
 * @return the segmentation result
 */
public static Result split(String text)
{
	Result segmented = ToAnalysis.parse(text);
	return segmented;
}
 
Example 8
Source File: TokenizerForSearchEngine.java    From LunarBase with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Parses {@code input_str} with ansj's standard analyzer and stores both the
 * token list and a fresh iterator over it in this tokenizer's fields.
 *
 * @param input_str raw text to tokenize
 */
public void tokenize(String input_str)
{
	this.tokens = ToAnalysis.parse(input_str);
	this.token_iterator = this.tokens.listIterator();
}