org.ansj.domain.Term Java Examples

The following examples show how to use org.ansj.domain.Term. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestAnsj.java    From ansj4solr with Apache License 2.0 6 votes vote down vote up
/**
 * Demo entry point. Segments a sample sentence with {@code ToAnalysis} and prints the
 * resulting term list, then feeds the same sentence through an {@code AnsjTokenizer} and
 * prints each token as {@code <term><startOffset>-<endOffset>-<positionIncrement>/}.
 *
 * @param args unused
 * @throws IOException if the tokenizer fails while producing tokens or while closing
 */
public static void main(String[] args) throws IOException {
    // Single source of truth for the sample text used by both code paths.
    String text = "天天向上,媒体打打。《回家真好》";

    List<Term> parse = ToAnalysis.parse(text);
    System.out.println(parse);

    Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text), 0, true);
    try {
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAtt =
                tokenizer.addAttribute(PositionIncrementAttribute.class);

        while (tokenizer.incrementToken()) {
            // toString() already yields a String; no defensive copy is needed.
            System.out.print(termAtt.toString());
            System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
            System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
        }
    } finally {
        // Always release the tokenizer (and its reader), even when incrementToken() throws.
        tokenizer.close();
    }
}
 
Example #2
Source File: Graph.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Obtains the root term of the optimal path.
 * <p>
 * Starting at {@code end}, follows the {@code from()} links backwards. For each predecessor
 * it clears the {@code terms} slots strictly between the predecessor's offset and its
 * successor's offset, stores the predecessor at its own offset when that offset is
 * non-negative, cuts its {@code next} link, points its {@code to} link at the successor and
 * clears its score.
 *
 * @return the {@code root} field
 */
protected Term optimalRoot() {
    Term successor = end;
    successor.clearScore();

    for (Term predecessor = successor.from(); predecessor != null; predecessor = successor.from()) {
        final int fromOffe = predecessor.getOffe();
        final int toOffe = successor.getOffe();

        for (int slot = fromOffe + 1; slot < toOffe; slot++) {
            terms[slot] = null;
        }
        if (fromOffe >= 0) {
            terms[fromOffe] = predecessor;
        }

        // Cut the sideways (sibling) list to save memory.
        predecessor.setNext(null);
        predecessor.setTo(successor);
        predecessor.clearScore();

        successor = predecessor;
    }
    return root;
}
 
Example #3
Source File: StopRecognition.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether a term is a stop word.
 * <p>
 * A term is stopped when its name is in {@code stop}, when its nature string is in
 * {@code natureStop}, or when its name fully matches any pattern in {@code regexList}.
 * The checks run in that order and the first hit wins.
 *
 * @param term the term to test
 * @return {@code true} if the term should be filtered out, {@code false} otherwise
 */
public boolean filter(Term term) {
    boolean stopped = !stop.isEmpty() && stop.contains(term.getName());
    if (!stopped) {
        stopped = !natureStop.isEmpty() && natureStop.contains(term.natrue().natureStr);
    }
    if (stopped) {
        return true;
    }

    // Iterating an empty list is a no-op, so no explicit emptiness check is required here.
    for (Pattern candidate : regexList) {
        if (candidate.matcher(term.getName()).matches()) {
            return true;
        }
    }
    return false;
}
 
Example #4
Source File: BaseAnalysis.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the term list for a graph: walks the graph's path, collects every non-null entry
 * of {@code graph.terms} except the final slot (in index order) and passes the list through
 * {@code setRealName}.
 *
 * @param graph the graph to read the result from
 * @return the collected terms
 */
@Override
protected List<Term> getResult(final Graph graph) {
    return new Merger() {
        @Override
        public List<Term> merger() {
            graph.walkPath();

            List<Term> collected = new ArrayList<>();
            // The last slot is deliberately left out of the scan.
            final int limit = graph.terms.length - 1;
            for (int idx = 0; idx < limit; idx++) {
                Term candidate = graph.terms[idx];
                if (candidate != null) {
                    collected.add(candidate);
                }
            }

            setRealName(graph, collected);
            return collected;
        }
    }.merger();
}
 
Example #5
Source File: KeyWordComputer.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Computes the keyword weight of a term.
 * <p>
 * Terms whose trimmed name is shorter than two characters, or whose nature string maps to
 * a score of zero in {@code POS_SCORE}, weigh 0. A nature string missing from
 * {@code POS_SCORE} counts as 1.0. Terms whose offset is below {@code titleLength} weigh
 * {@code 5 * score}; all others weigh {@code (length - offset) * score / length}.
 *
 * @param term        the term to weigh
 * @param length      the length used to scale the positional weight
 * @param titleLength offsets below this value get the fixed multiplier
 * @return the weight, or 0 when the term is ignored
 */
private double getWeight(Term term, int length, int titleLength) {
    if (term.getName().trim().length() < 2) {
        return 0;
    }

    Double configured = POS_SCORE.get(term.natrue().natureStr);
    if (configured != null && configured == 0) {
        return 0;
    }
    double posScore = (configured == null) ? 1.0 : configured;

    return (titleLength > term.getOffe())
            ? 5 * posScore
            : (length - term.getOffe()) * posScore / length;
}
 
Example #6
Source File: Graph.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Removes meaningless nodes so that the later viterbi pass has fewer candidates.
 * <p>
 * For every non-null head term at index {@code i} whose {@code toValue()} is more than one
 * past {@code i} (and which is not in the last slot), every head term in
 * {@code [i, toValue())} that has a one-character name and a {@code toValue()} not beyond
 * the span is set to {@code null}.
 */
public void rmLittleSinglePath() {
    for (int i = 0; i < terms.length; i++) {
        Term head = terms[i];
        if (head == null) {
            continue;
        }

        int maxTo = head.toValue();
        boolean spansOneSlot = (maxTo - i == 1);
        boolean isLastSlot = (i + 1 == terms.length);
        if (spansOneSlot || isLastSlot) {
            continue;
        }

        for (int j = i; j < maxTo; j++) {
            Term candidate = terms[j];
            if (candidate == null) {
                continue;
            }
            if (candidate.toValue() <= maxTo && candidate.getName().length() == 1) {
                terms[j] = null;
            }
        }
    }
}
 
Example #7
Source File: Graph.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Scores the graph while applying extra relative weights, then resolves the best path.
 *
 * @param relationMap weights handed through unchanged to {@code merger}
 */
public void walkPath(Map<String, Double> relationMap) {
    // Score from BEGIN first.
    merger(root, 0, relationMap);

    // Then score forward, starting with the first word.
    for (int i = 0; i < terms.length; i++) {
        for (Term current = terms[i];
                current != null && current.from() != null && current != end;
                current = current.next()) {
            merger(current, current.toValue(), relationMap);
        }
    }

    optimalRoot();
}
 
Example #8
Source File: Graph.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * The concrete traversal step used for scoring.
 * <p>
 * When slot {@code to} already holds terms, every term in that slot's {@code next()} chain
 * gets its path score set from {@code fromTerm}. When the slot is empty, a one-character
 * term is created from {@code chars[to]} (falling back to {@code TermNatures.NULL} when the
 * dictionary item has no natures), stored in the slot and scored the same way.
 *
 * @param fromTerm    the term the path comes from
 * @param to          index of the slot in {@code terms} to score
 * @param relationMap weights passed through to {@code setPathScore}
 */
private void merger(Term fromTerm, int to, Map<String, Double> relationMap) {
    Term head = terms[to];

    if (head == null) {
        char c = chars[to];
        TermNatures natures = DATDictionary.getItem(c).termNatures;
        if (natures == null) {
            natures = TermNatures.NULL;
        }
        Term single = new Term(String.valueOf(c), to, natures);
        terms[to] = single;
        single.setPathScore(fromTerm, relationMap);
        return;
    }

    for (Term candidate = head; candidate != null; candidate = candidate.next()) {
        // Relation: to.set(from)
        candidate.setPathScore(fromTerm, relationMap);
    }
}
 
Example #9
Source File: Graph.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
public Graph(String str) {
    realStr = str;
    this.chars = str.toCharArray();
    terms = new Term[chars.length + 1];
    end = new Term(E, chars.length, AnsjItem.END);
    root = new Term(B, -1, AnsjItem.BEGIN);
    terms[chars.length] = end;
}
 
Example #10
Source File: TokenizerForSearchEngine.java    From LunarBase with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Wraps a token in a {@code Term} (offset 0, the token itself as third argument, 1 as
 * fourth) and appends it to {@code tokens}. A {@code null} token is ignored.
 *
 * @param token the token text, may be {@code null}
 */
public void add(String token){
    if (token != null) {
        tokens.add(new Term(token, 0, token, 1));
    }
}
 
Example #11
Source File: KeyWordComputer.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts keywords from a text.
 * <p>
 * Every term with a non-zero weight becomes a {@code Keyword} keyed by its name; repeated
 * names call {@code updateWeight(1)} on the existing keyword. The keywords are ordered
 * through a {@code TreeSet} and at most {@code nKeyword} of them are returned.
 *
 * @param content     the body text
 * @param titleLength passed through to {@code getWeight}
 * @return the ordered keywords, limited to {@code nKeyword} entries
 */
private List<Keyword> computeArticleTfidf(String content, int titleLength) {
    Map<String, Keyword> byName = new HashMap<>();

    List<Term> parsed = analysisType.parseStr(content).getTerms();
    // FIXME: this depends on the natures in the user-defined dictionary, so a separate
    // method is needed here.
    //		parsed = FilterModifWord.updateNature(parsed) ;

    for (Term term : parsed) {
        double weight = getWeight(term, content.length(), titleLength);
        if (weight == 0) {
            continue;
        }

        String name = term.getName();
        Keyword existing = byName.get(name);
        if (existing != null) {
            existing.updateWeight(1);
            continue;
        }
        byName.put(name, new Keyword(name, term.natrue().allFrequency, weight));
    }

    TreeSet<Keyword> ordered = new TreeSet<>(byName.values());
    ArrayList<Keyword> ranked = new ArrayList<>(ordered);

    if (ranked.size() > nKeyword) {
        return ranked.subList(0, nKeyword);
    }
    return ranked;
}
 
Example #12
Source File: SummaryComputer.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Computes a summary from the user's query string.
 * <p>
 * The query is segmented with {@code NlpAnalysis}; each term whose nature string is not in
 * {@code FILTER_SET} becomes a keyword, and the keyword list is passed to the
 * keyword-based {@code toSummary} overload.
 *
 * @param query the user query
 * @return the summary produced by the keyword-based overload
 */
public Summary toSummary(String query) {
    List<Term> queryTerms = NlpAnalysis.parse(query).getTerms();
    List<Keyword> keywords = new ArrayList<>();

    for (Term term : queryTerms) {
        boolean filtered = FILTER_SET.contains(term.natrue().natureStr);
        if (!filtered) {
            keywords.add(new Keyword(term.getName(), term.termNatures().allFreq, 1));
        }
    }

    return toSummary(keywords);
}
 
Example #13
Source File: Analysis.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Produces the segmentation result directly from the reader given to the constructor by
 * draining {@code next()} until it returns {@code null}.
 *
 * @return a {@code Result} wrapping all terms in the order they were produced
 * @throws IOException if {@code next()} fails
 */
public Result parse() throws IOException {
    List<Term> collected = new ArrayList<>();
    for (Term current = next(); current != null; current = next()) {
        collected.add(current);
    }
    return new Result(collected);
}
 
Example #14
Source File: Analysis.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Sets the un-normalised (original) text on each term of the result.
 * <p>
 * Does nothing unless {@code MyStaticValue.isRealName} is set. Otherwise each term's real
 * name becomes the slice of {@code graph.realStr} that starts at the term's offset and has
 * the same length as the term's name.
 *
 * @param graph  the graph holding the original text
 * @param result the terms to update in place
 */
protected void setRealName(Graph graph, List<Term> result) {
    if (MyStaticValue.isRealName) {
        String original = graph.realStr;
        for (Term term : result) {
            int begin = term.getOffe();
            int finish = term.getOffe() + term.getName().length();
            term.setRealName(original.substring(begin, finish));
        }
    }
}
 
Example #15
Source File: Analysis.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Segments one whole sentence, giving priority to the user-configured ambiguity forest.
 * <p>
 * While the ambiguity forest yields matches, the text before each match is analysed
 * normally and the match's parameters are added to the graph as (word, nature) pairs at
 * consecutive offsets. Whatever remains after the last match is analysed normally.
 *
 * @param temp the sentence to segment
 * @return the terms obtained from {@code getResult} for the populated graph
 */
private List<Term> analysisStr(String temp) {
    Graph gp = new Graph(temp);
    int cursor = 0;

    if (this.ambiguityForest != null) {
        GetWord gw = new GetWord(this.ambiguityForest, gp.chars);
        while (gw.getFrontWords() != null) {
            // Analyse the gap between the previous position and this match.
            if (cursor < gw.offe) {
                analysis(gp, cursor, gw.offe);
            }

            String[] params = gw.getParams();
            cursor = gw.offe;
            // params is read pairwise: [word, nature, word, nature, ...]
            for (int i = 0; i < params.length; i += 2) {
                String word = params[i];
                TermNature nature = new TermNature(params[i + 1], 1);
                gp.addTerm(new Term(word, cursor, new TermNatures(nature)));
                cursor += word.length();
            }
        }
    }

    if (cursor < gp.chars.length) {
        analysis(gp, cursor, gp.chars.length);
    }

    return this.getResult(gp);
}
 
Example #16
Source File: Analysis.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the next term; call it in a loop until it returns {@code null}, which signals
 * the end of segmentation.
 * <p>
 * Buffered terms are handed out first. When the buffer is empty the next non-blank line is
 * read from the reader, segmented via {@code fullTerms}, and its first term is returned.
 * Each returned term has its offset shifted by the current line base {@code offe}.
 *
 * @return the next term, or {@code null} when the input is exhausted or the line produced
 *         no terms
 * @throws IOException if reading from the underlying reader fails
 */

public Term next() throws IOException {
    if (!terms.isEmpty()) {
        Term term = terms.poll();
        term.updateOffe(offe);
        return term;
    }

    String temp = br.readLine();
    while (StringUtil.isBlank(temp)) {
        if (temp == null) {
            offe = br.getStart();
            return null;
        }
        temp = br.readLine();
    }
    // Capture the line base only after blank lines have been skipped. Reading it right
    // after the first readLine() left it pointing at the first skipped blank line, so the
    // terms of the line actually segmented were shifted by a stale base.
    // NOTE(review): assumes br.getStart() reports the start of the most recently read
    // line — confirm against the reader implementation.
    offe = br.getStart();

    // Segment the line (including ambiguity handling) into the term buffer.
    fullTerms(temp);

    if (!terms.isEmpty()) {
        Term term = terms.poll();
        term.updateOffe(offe);
        return term;
    }

    return null;
}
 
Example #17
Source File: StopRecognition.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Removes from the result, in place, every term for which {@code filter} returns
 * {@code true}.
 *
 * @param result the result whose term list is pruned
 */
@Override
public void recognition(Result result) {
    for (Iterator<Term> cursor = result.getTerms().iterator(); cursor.hasNext();) {
        if (filter(cursor.next())) {
            // Remove through the iterator so the list is not modified behind its back.
            cursor.remove();
        }
    }
}
 
Example #18
Source File: DicSegment.java    From youkefu with Apache License 2.0 5 votes vote down vote up
/**
 * Segments {@code content} and returns the words whose nature is one of
 * {@code expectedNature}, each formatted as {@code word/nature}.
 *
 * @param content        the text to segment; blank text yields an empty array
 * @param expectedNature the natures to keep; {@code null} or empty yields an empty array
 * @return the matching {@code word/nature} strings in segmentation order
 */
public static String[] byNature(String content , Set<String> expectedNature){
    List<String> matched = new ArrayList<String>();

    boolean hasInput = !StringUtils.isBlank(content)
            && expectedNature != null
            && expectedNature.size() > 0;
    if (hasInput) {
        // Result wraps the segmentation outcome, essentially a List<Term>.
        Result result = NlpAnalysis.parse(content, DicLibrary.gets(librarykeyList));
        for (Term term : result.getTerms()) {
            String word = term.getName();
            String natureStr = term.getNatureStr();
            if (expectedNature.contains(natureStr)) {
                matched.add(word + "/" + natureStr);
            }
        }
    }

    return matched.toArray(new String[matched.size()]);
}
 
Example #19
Source File: Graph.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Adds a term to the graph.
 * <p>
 * Before the insert, the {@code hasNum} and {@code hasPerson} flags are raised (never
 * lowered) when the term's natures report a positive number frequency or the person flag.
 * The term is then inserted into {@code terms} with {@code InsertTermType.REPLACE}.
 *
 * @param term the term to add
 */
public void addTerm(Term term) {
    // Does the graph contain a number?
    if (!hasNum) {
        hasNum = term.termNatures().numAttr.numFreq > 0;
    }
    // Does the graph contain a person name?
    if (!hasPerson) {
        hasPerson = term.termNatures().personAttr.flag;
    }

    TermUtil.insertTerm(terms, term, InsertTermType.REPLACE);
}
 
Example #20
Source File: Graph.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the last term in the {@code next()} chain of slot {@code i}, described by the
 * original author as the right-most term of that row.
 *
 * @param i index into {@code terms}
 * @return the tail of the chain, or {@code null} when the slot is empty
 */
private Term getMaxTerm(int i) {
    Term tail = terms[i];
    if (tail != null) {
        for (Term following = tail.next(); following != null; following = following.next()) {
            tail = following;
        }
    }
    return tail;
}
 
Example #21
Source File: Graph.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Scores the graph using each term's own score via {@code mergerByScore}, then resolves
 * the best path with {@code optimalRoot}.
 */
public void walkPathByScore() {
    // Score from BEGIN first.
    mergerByScore(root, 0);

    // Then score forward, starting with the first word.
    for (int i = 0; i < terms.length; i++) {
        for (Term current = terms[i];
                current != null && current.from() != null && current != end;
                current = current.next()) {
            mergerByScore(current, current.toValue());
        }
    }

    optimalRoot();
}
 
Example #22
Source File: Graph.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Score-based variant of the merge step: every term in the {@code next()} chain of slot
 * {@code to} gets its self path score set from {@code fromTerm}. An empty slot is left
 * untouched.
 *
 * @param fromTerm the term the path comes from
 * @param to       index of the slot in {@code terms} to score
 */
private void mergerByScore(Term fromTerm, int to) {
    for (Term candidate = terms[to]; candidate != null; candidate = candidate.next()) {
        // Relation: to.set(from)
        candidate.setPathSelfScore(fromTerm);
    }
}
 
Example #23
Source File: Graph.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Debugging aid: prints one line per non-empty slot of {@code terms}. The head term is
 * printed by name, the remaining terms of its {@code next()} chain via {@code toString()},
 * each followed by a tab, its score and {@code " ,"}.
 */
public void printGraph() {
    for (Term head : terms) {
        if (head != null) {
            System.out.print(head.getName() + "\t" + head.score() + " ,");
            for (Term sibling = head.next(); sibling != null; sibling = sibling.next()) {
                System.out.print(sibling + "\t" + sibling.score() + " ,");
            }
            System.out.println();
        }
    }
}
 
Example #24
Source File: TermUtil.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Merges two terms into one brand-new term.
 * <p>
 * The new term's name is the concatenation of both names and it sits at {@code from}'s
 * offset. Its natures' {@code numAttr} is overwritten with the one from {@code from}'s
 * natures, and it is linked to the term that followed {@code to}.
 *
 * @param from        the first term
 * @param to          the second term
 * @param termNatures the natures handed to the new term's constructor
 * @return the newly created term
 */
public static Term makeNewTermNum(Term from, Term to, TermNatures termNatures) {
    String mergedName = from.getName() + to.getName();
    Term merged = new Term(mergedName, from.getOffe(), termNatures);

    merged.termNatures().numAttr = from.termNatures().numAttr;

    TermUtil.termLink(merged, to.to());
    // NOTE(review): this links the new term's own from() rather than from.from(); on a
    // freshly constructed term that is presumably still unset — confirm the intent.
    TermUtil.termLink(merged.from(), merged);

    return merged;
}
 
Example #25
Source File: TermUtil.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Refines the nature of a term that is currently tagged {@code Nature.NW}.
 * <p>
 * Terms with another nature, or with a name of three characters or fewer, are left alone.
 * A name recognised by {@code ForeignPersonRecognition.isFName} is tagged {@code "nrf"}.
 * Otherwise the term's sub-terms are looked at: when the {@code companyMap} entry of the
 * last sub-term has a value above 1000 at index 1, the term is tagged {@code "nt"}.
 * The method updates {@code term} in place and returns nothing.
 *
 * @param term the term to refine; modified in place
 */
public static void parseNature(Term term) {
    if (!Nature.NW.equals(term.natrue())) {
        return;
    }

    String name = term.getName();

    if (name.length() <= 3) {
        return;
    }

    // Is it a foreign person name?
    if (ForeignPersonRecognition.isFName(name)) {
        term.setNature(NatureLibrary.getNature("nrf"));
        return;
    }

    List<Term> subTerm = term.getSubTerm();

    // Decide whether it is an organisation name.
    term.setSubTerm(subTerm);
    if (subTerm == null || subTerm.isEmpty()) {
        // Nothing to inspect; avoids an exception on subTerm.get(...).
        return;
    }

    // NOTE(review): the original also looked up companyMap for the FIRST sub-term but
    // overwrote that result before using it, so it never affected the outcome. The dead
    // lookup is removed here; confirm whether a first-sub-term contribution was intended.
    Term last = subTerm.get(subTerm.size() - 1);
    int[] lastStats = companyMap.get(last.getName());
    int all = (lastStats == null) ? 0 : lastStats[1];

    if (all > 1000) {
        term.setNature(NatureLibrary.getNature("nt"));
    }
}
 
Example #26
Source File: TermUtil.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the sub-term list between two terms by following {@code to()} links.
 * <p>
 * The walk starts at the term after {@code from} and stops when it reaches {@code to};
 * neither {@code from} nor {@code to} is included.
 *
 * @param from the term before the first sub-term (exclusive)
 * @param to   the term at which the walk stops (exclusive)
 * @return the terms strictly between {@code from} and {@code to}, in link order
 */
public static List<Term> getSubTerm(Term from, Term to) {
    List<Term> between = new ArrayList<>(3);
    for (Term cursor = from.to(); cursor != to; cursor = cursor.to()) {
        between.add(cursor);
    }
    return between;
}
 
Example #27
Source File: MathUtil.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Computes the path score for moving from one term to the next.
 * <p>
 * When {@code from}'s total frequency plus one is negative, {@code from}'s score is raised
 * by {@code MAX_FREQUENCE}, stored back on {@code from} and returned. Otherwise the score
 * is {@code from.score()} plus the negative log of a smoothed mix of the unigram and bigram
 * frequencies; an optional entry in {@code relationMap} (keyed by both names joined with
 * {@code TAB}) is added to the bigram frequency first.
 *
 * @param from        the preceding term
 * @param to          the following term
 * @param relationMap optional extra bigram weights, may be {@code null}
 * @return the score
 */
public static double compuScore(Term from, Term to, Map<String, Double> relationMap) {
    double frequency = from.termNatures().allFreq + 1;

    if (frequency < 0) {
        double penalised = from.score() + MAX_FREQUENCE;
        from.score(penalised);
        return penalised;
    }

    double nTwoWordsFreq = NgramLibrary.getTwoWordFreq(from, to);

    Double extra = (relationMap == null)
            ? null
            : relationMap.get(from.getName() + TAB + to.getName());
    if (extra != null) {
        nTwoWordsFreq += extra;
    }

    // Kept as one expression so the floating-point evaluation order is unchanged.
    double cost = -Math.log(D_SMOOTHING_PARA * frequency / (MAX_FREQUENCE + 80000)
                    + (1 - D_SMOOTHING_PARA) * ((1 - D_TEMP) * nTwoWordsFreq / frequency + D_TEMP));

    if (cost < 0) {
        cost += frequency;
    }
    return from.score() + cost;
}
 
Example #28
Source File: NameFix.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Person-name disambiguation, e.g. "邓颖超生前" segmented as "邓颖 / 超生 / 前" is corrected
 * to "邓颖超 / 生 / 前".
 * <p>
 * Pass 1: for a two-character term whose natures are {@code TermNatures.NR}, if the term
 * two slots later has a positive {@code personAttr.split}, that term's first character is
 * appended to the name and the rest of it becomes a new term one slot further on.
 * <p>
 * Pass 2 (foreign names): a one-character term that {@code WordAlert.CharCover} maps to
 * '·' and that sits between two terms whose nature strings start with {@code "nr"} is
 * merged with both neighbours into the preceding term.
 * <p>
 * Candidates whose neighbouring slots or links are missing are skipped instead of raising
 * an exception part-way through the rewrite.
 *
 * @param terms   the term array, modified in place
 * @param forests dictionaries used to look up the natures of the split-off remainder
 */
public static void nameAmbiguity(Term[] terms, Forest... forests) {
    for (int i = 0; i < terms.length - 1; i++) {
        Term term = terms[i];
        if (term == null || term.termNatures() != TermNatures.NR || term.getName().length() != 2) {
            continue;
        }
        // The rewrite touches slots i + 2 and i + 3; skip when either is out of range so
        // the array is never left half-modified.
        if (i + 3 >= terms.length) {
            continue;
        }
        Term next = terms[i + 2];
        if (next == null || next.termNatures().personAttr.split <= 0) {
            continue;
        }

        term.setName(term.getName() + next.getName().charAt(0));
        terms[i + 2] = null;

        String name = next.getName().substring(1);
        terms[i + 3] = new Term(name, next.getOffe() + 1,
                        new NatureRecognition(forests).getTermNatures(name));
        TermUtil.termLink(term, terms[i + 3]);
        TermUtil.termLink(terms[i + 3], next.to());
    }

    // Foreign person-name correction.
    for (int i = 0; i < terms.length; i++) {
        Term term = terms[i];
        if (term == null || term.getName().length() != 1 || i <= 0
                        || WordAlert.CharCover(term.getName().charAt(0)) != '·') {
            continue;
        }

        Term from = term.from();
        Term next = term.to();
        // A separator without both neighbours cannot join two names.
        if (from == null || next == null) {
            continue;
        }

        if (from.natrue().natureStr.startsWith("nr") && next.natrue().natureStr.startsWith("nr")) {
            from.setName(from.getName() + term.getName() + next.getName());
            TermUtil.termLink(from, next.to());
            terms[i] = null;
            if (i + 1 < terms.length) {
                terms[i + 1] = null;
            }
        }
    }

}
 
Example #29
Source File: NgramLibrary.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the frequency with which one word is followed by another.
 *
 * @param from the preceding term
 * @param to   the following term
 * @return the frequency stored in {@code from}'s bigram map under {@code to}'s item index,
 *         or 0 when {@code from} has no bigram map or the map has no such entry
 */
public static int getTwoWordFreq(Term from, Term to) {
    if (from.item().bigramEntryMap != null) {
        Integer freq = from.item().bigramEntryMap.get(to.item().getIndex());
        return (freq != null) ? freq : 0;
    }
    return 0;
}
 
Example #30
Source File: NatureLibrary.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the transition frequency between the natures of two terms.
 *
 * @param fromTerm the preceding term
 * @param toTerm   the following term
 * @return the {@code NATURETABLE} entry at {@code [from.index][to.index]}, or 0 when
 *         either nature has a negative index
 */
public static int getTwoTermFreq(Term fromTerm, Term toTerm) {
    Nature rowNature = fromTerm.natrue();
    Nature colNature = toTerm.natrue();

    boolean bothIndexed = rowNature.index >= 0 && colNature.index >= 0;
    return bothIndexed ? NATURETABLE[rowNature.index][colNature.index] : 0;
}