org.ansj.domain.Term Java Examples

Source File: TestAnsj.java From ansj4solr with Apache License 2.0

6 votes

/**
 * Demo entry point: segments a sample sentence with {@code ToAnalysis} and prints the
 * resulting term list, then feeds the same sentence through an {@code AnsjTokenizer}
 * and prints every token as {@code term startOffset-endOffset-positionIncrement/}.
 *
 * @param args unused
 * @throws IOException if the tokenizer fails while reading or closing
 */
public static void main(String[] args) throws IOException {
	String text = "天天向上，媒体打打。《回家真好》";

	List<Term> parse = ToAnalysis.parse(text);
	System.out.println(parse);

	Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text), 0, true);
	try {
		CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
		OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
		PositionIncrementAttribute positionIncrementAtt =
				tokenizer.addAttribute(PositionIncrementAttribute.class);

		// NOTE(review): newer Lucene releases require tokenizer.reset() before the first
		// incrementToken() call — confirm against the Lucene version this project targets.
		while (tokenizer.incrementToken()) {
			System.out.print(termAtt.toString());
			System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
			System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
		}
	} finally {
		// Release the tokenizer even when tokenization throws.
		tokenizer.close();
	}
}

Source File: Graph.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Walks the best path backwards from {@code end} through the {@code from()} links,
 * leaving only the terms on that path in {@code terms}, and returns the root term.
 *
 * @return the {@code root} term
 */
protected Term optimalRoot() {
    Term current = end;
    current.clearScore();
    Term previous = current.from();
    while (previous != null) {
        int start = previous.getOffe();
        int stop = current.getOffe();
        // Drop every slot strictly between the two path nodes.
        for (int idx = start + 1; idx < stop; idx++) {
            terms[idx] = null;
        }
        // A negative offset has no slot in the array (root is created with offset -1).
        if (start >= 0) {
            terms[start] = previous;
        }
        // Cut the sideways linked list to save memory.
        previous.setNext(null);
        previous.setTo(current);
        previous.clearScore();
        current = previous;
        previous = current.from();
    }
    return root;
}

Source File: StopRecognition.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Decides whether a term is a stop word.
 *
 * <p>A term is stopped when its name is in {@code stop}, when its nature string is in
 * {@code natureStop}, or when its name fully matches any pattern in {@code regexList}.
 *
 * @param term the term to test
 * @return {@code true} if the term should be filtered out
 */
public boolean filter(Term term) {
    boolean stoppedByName = !stop.isEmpty() && stop.contains(term.getName());
    if (stoppedByName) {
        return true;
    }

    boolean stoppedByNature = !natureStop.isEmpty() && natureStop.contains(term.natrue().natureStr);
    if (stoppedByNature) {
        return true;
    }

    // Iterating an empty list is a no-op, so no separate emptiness check is needed.
    for (Pattern pattern : regexList) {
        if (pattern.matcher(term.getName()).matches()) {
            return true;
        }
    }
    return false;
}

Source File: BaseAnalysis.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Runs the path walk on the graph and collects the surviving terms in array order.
 *
 * @param graph the segmentation graph to resolve
 * @return the non-null terms of {@code graph.terms}, excluding the final slot
 */
@Override
protected List<Term> getResult(final Graph graph) {
    return new Merger() {
        @Override
        public List<Term> merger() {
            graph.walkPath();

            // The final slot is deliberately skipped (presumably the end sentinel).
            final int limit = graph.terms.length - 1;
            List<Term> collected = new ArrayList<>();
            int pos = 0;
            while (pos < limit) {
                Term candidate = graph.terms[pos++];
                if (candidate != null) {
                    collected.add(candidate);
                }
            }

            setRealName(graph, collected);
            return collected;
        }
    }.merger();
}

Source File: KeyWordComputer.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Computes the keyword weight of a single term.
 *
 * <p>Terms whose trimmed name is shorter than two characters, or whose nature has a
 * configured score of zero in {@code POS_SCORE}, weigh 0. Natures without a configured
 * score use 1.0. A term whose offset lies before {@code titleLength} gets
 * {@code 5 * score}; otherwise the weight shrinks linearly with the offset.
 *
 * @param term        the term to weigh
 * @param length      total length used to normalise the offset
 * @param titleLength offsets below this value are treated as title positions
 * @return the weight of the term
 */
private double getWeight(Term term, int length, int titleLength) {
    if (term.getName().trim().length() < 2) {
        return 0;
    }

    Double configured = POS_SCORE.get(term.natrue().natureStr);
    double posScore;
    if (configured == null) {
        posScore = 1.0;
    } else if (configured == 0) {
        return 0;
    } else {
        posScore = configured;
    }

    int offset = term.getOffe();
    if (titleLength > offset) {
        return 5 * posScore;
    }
    return (length - offset) * posScore / length;
}

Source File: Graph.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Removes meaningless nodes so that the Viterbi search has fewer candidates.
 *
 * <p>For each non-null head term spanning more than one position, every head term
 * inside that span whose name is a single character and which ends no later than the
 * span does is cleared from {@code terms}.
 */
public void rmLittleSinglePath() {
    for (int start = 0; start < terms.length; start++) {
        Term head = terms[start];
        if (head == null) {
            continue;
        }
        int reach = head.toValue();
        boolean singleStep = reach - start == 1;
        boolean lastSlot = start + 1 == terms.length;
        if (singleStep || lastSlot) {
            continue;
        }
        for (int pos = start; pos < reach; pos++) {
            Term candidate = terms[pos];
            if (candidate == null) {
                continue;
            }
            if (candidate.toValue() <= reach && candidate.getName().length() == 1) {
                terms[pos] = null;
            }
        }
    }
}

Source File: Graph.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Scores the graph with caller-supplied relative weights, then resolves the best path.
 *
 * @param relationMap extra weights passed through to {@code merger}; forwarded as-is
 */
public void walkPath(Map<String, Double> relationMap) {
    // Score the transitions out of the BEGIN node first.
    merger(root, 0, relationMap);

    // Then score forward, slot by slot, starting from the first word.
    for (int slot = 0; slot < terms.length; slot++) {
        for (Term node = terms[slot];
                node != null && node.from() != null && node != end;
                node = node.next()) {
            merger(node, node.toValue(), relationMap);
        }
    }
    optimalRoot();
}

Source File: Graph.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Scores every candidate term starting at position {@code to} against {@code fromTerm}.
 *
 * <p>If no term exists at that position, a single-character term is created from
 * {@code chars[to]} and scored instead.
 *
 * @param fromTerm    the preceding term on the path
 * @param to          index in {@code terms} of the candidates to score
 * @param relationMap extra weights forwarded to {@code Term.setPathScore}
 */
private void merger(Term fromTerm, int to, Map<String, Double> relationMap) {
    if (terms[to] == null) {
        char c = chars[to];
        TermNatures natures = DATDictionary.getItem(c).termNatures;
        if (natures == null) {
            natures = TermNatures.NULL;
        }
        terms[to] = new Term(String.valueOf(c), to, natures);
        terms[to].setPathScore(fromTerm, relationMap);
        return;
    }

    for (Term candidate = terms[to]; candidate != null; candidate = candidate.next()) {
        // Relation: to.set(from)
        candidate.setPathScore(fromTerm, relationMap);
    }
}

Source File: Graph.java From deeplearning4j with Apache License 2.0

5 votes

public Graph(String str) {
    realStr = str;
    this.chars = str.toCharArray();
    terms = new Term[chars.length + 1];
    end = new Term(E, chars.length, AnsjItem.END);
    root = new Term(B, -1, AnsjItem.BEGIN);
    terms[chars.length] = end;
}

Source File: TokenizerForSearchEngine.java From LunarBase with GNU General Public License v2.0

5 votes

/**
 * Appends a token to {@code tokens} as a new {@code Term}; {@code null} tokens are ignored.
 *
 * @param token the token text, may be {@code null}
 */
public void add(String token) {
    if (token != null) {
        tokens.add(new Term(token, 0, token, 1));
    }
}

Source File: KeyWordComputer.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Extracts up to {@code nKeyword} keywords from the given content.
 *
 * @param content     the body text
 * @param titleLength forwarded to {@code getWeight} as the title length
 * @return the keywords in {@code TreeSet} order, truncated to at most {@code nKeyword}
 */
private List<Keyword> computeArticleTfidf(String content, int titleLength) {
    List<Term> segmented = analysisType.parseStr(content).getTerms();
    // FIXME: this depends on the natures in the user-defined dictionary, so another method is needed.
    //		segmented = FilterModifWord.updateNature(segmented) ;

    Map<String, Keyword> byName = new HashMap<>();
    for (Term term : segmented) {
        double weight = getWeight(term, content.length(), titleLength);
        if (weight != 0) {
            String name = term.getName();
            Keyword existing = byName.get(name);
            if (existing != null) {
                existing.updateWeight(1);
            } else {
                byName.put(name, new Keyword(name, term.natrue().allFrequency, weight));
            }
        }
    }

    TreeSet<Keyword> ordered = new TreeSet<>(byName.values());
    List<Keyword> ranked = new ArrayList<>(ordered);
    return ranked.size() > nKeyword ? ranked.subList(0, nKeyword) : ranked;
}

Source File: SummaryComputer.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Computes a summary driven by the user's query string.
 *
 * <p>The query is segmented with {@code NlpAnalysis}; terms whose nature string is in
 * {@code FILTER_SET} are skipped, and the rest become keywords with score 1.
 *
 * @param query the user query
 * @return the summary produced by {@code toSummary(List)} for the derived keywords
 */
public Summary toSummary(String query) {
    List<Keyword> keywords = new ArrayList<>();
    for (Term term : NlpAnalysis.parse(query).getTerms()) {
        boolean filtered = FILTER_SET.contains(term.natrue().natureStr);
        if (!filtered) {
            keywords.add(new Keyword(term.getName(), term.termNatures().allFreq, 1));
        }
    }
    return toSummary(keywords);
}

Source File: Analysis.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Drains {@link #next()} until it returns {@code null} and wraps the collected terms
 * in a {@code Result}.
 *
 * @return the segmentation result for the reader supplied at construction
 * @throws IOException if reading fails
 */
public Result parse() throws IOException {
    List<Term> collected = new ArrayList<>();
    for (Term current = next(); current != null; current = next()) {
        collected.add(current);
    }
    return new Result(collected);
}

Source File: Analysis.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Sets each term's real name to the corresponding slice of the graph's original text.
 * Does nothing unless {@code MyStaticValue.isRealName} is set.
 *
 * @param graph  graph whose {@code realStr} supplies the original text
 * @param result terms to update in place
 */
protected void setRealName(Graph graph, List<Term> result) {
    if (MyStaticValue.isRealName) {
        String original = graph.realStr;
        for (Term term : result) {
            int begin = term.getOffe();
            int finish = term.getOffe() + term.getName().length();
            term.setRealName(original.substring(begin, finish));
        }
    }
}

Source File: Analysis.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Segments one whole sentence, giving user-defined ambiguity rules priority.
 *
 * <p>Each match from {@code ambiguityForest} contributes its (word, nature) pairs
 * directly as terms; the text between matches and after the last match goes through
 * {@code analysis}.
 *
 * @param temp the sentence to segment
 * @return the resolved term list
 */
private List<Term> analysisStr(String temp) {
    Graph graph = new Graph(temp);
    int cursor = 0;

    if (this.ambiguityForest != null) {
        GetWord matcher = new GetWord(this.ambiguityForest, graph.chars);
        while (matcher.getFrontWords() != null) {
            // Analyse the gap before this match normally.
            if (cursor < matcher.offe) {
                analysis(graph, cursor, matcher.offe);
            }
            String[] pairs = matcher.getParams();
            cursor = matcher.offe;
            // pairs alternates word, nature, word, nature, ...
            for (int k = 0; k < pairs.length; k += 2) {
                String word = pairs[k];
                TermNatures natures = new TermNatures(new TermNature(pairs[k + 1], 1));
                graph.addTerm(new Term(word, cursor, natures));
                cursor += word.length();
            }
        }
    }

    if (cursor < graph.chars.length) {
        analysis(graph, cursor, graph.chars.length);
    }
    return this.getResult(graph);
}

Source File: Analysis.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Returns the next term; call repeatedly until {@code null} is returned, which marks
 * the end of segmentation.
 *
 * @return the next term, or {@code null} when nothing more is available
 * @throws IOException if reading from the underlying reader fails
 */
public Term next() throws IOException {
    if (terms.isEmpty()) {
        String line = br.readLine();
        // NOTE(review): offe is captured after the first read only; it is not refreshed
        // when blank lines are skipped below — confirm br.getStart() semantics.
        offe = br.getStart();
        while (StringUtil.isBlank(line)) {
            if (line == null) {
                return null;
            }
            line = br.readLine();
        }

        // Segment the line (ambiguity handling included) into the term queue.
        fullTerms(line);

        if (terms.isEmpty()) {
            return null;
        }
    }

    Term head = terms.poll();
    head.updateOffe(offe);
    return head;
}

Source File: StopRecognition.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Removes from the result, in place, every term that {@code filter} accepts.
 *
 * @param result the result whose term list is pruned
 */
@Override
public void recognition(Result result) {
    for (Iterator<Term> it = result.getTerms().iterator(); it.hasNext();) {
        if (filter(it.next())) {
            it.remove();
        }
    }
}

Source File: DicSegment.java From youkefu with Apache License 2.0

5 votes

/**
 * Segments {@code content} and returns the words whose nature is in {@code expectedNature},
 * each formatted as {@code word/nature}.
 *
 * @param content        text to segment; blank input yields an empty array
 * @param expectedNature natures to keep; {@code null} or empty yields an empty array
 * @return the matching {@code word/nature} strings in segmentation order
 */
public static String[] byNature(String content, Set<String> expectedNature) {
	boolean usable = !StringUtils.isBlank(content) && expectedNature != null && expectedNature.size() > 0;
	if (!usable) {
		return new String[0];
	}

	List<String> matches = new ArrayList<String>();
	// Result wraps the segmentation output; its terms are the List<Term> we need.
	Result segmented = NlpAnalysis.parse(content, DicLibrary.gets(librarykeyList));
	for (Term term : segmented.getTerms()) {
		String nature = term.getNatureStr();
		if (expectedNature.contains(nature)) {
			matches.add(term.getName() + "/" + nature);
		}
	}
	return matches.toArray(new String[matches.size()]);
}

Source File: Graph.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Adds a term to the graph, recording whether any number or person-name term has
 * been seen so far.
 *
 * @param term the term to insert (replacing per {@code InsertTermType.REPLACE})
 */
public void addTerm(Term term) {
    // Once a flag is set it stays set, so only probe while it is still false.
    if (!hasNum) {
        hasNum = term.termNatures().numAttr.numFreq > 0;
    }
    if (!hasPerson) {
        hasPerson = term.termNatures().personAttr.flag;
    }
    TermUtil.insertTerm(terms, term, InsertTermType.REPLACE);
}

Source File: Graph.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Returns the last term in the {@code next()} chain that starts at slot {@code i}.
 *
 * @param i index into {@code terms}
 * @return the tail of the chain, or {@code null} if the slot is empty
 */
private Term getMaxTerm(int i) {
    Term tail = terms[i];
    if (tail != null) {
        for (Term node = tail.next(); node != null; node = node.next()) {
            tail = node;
        }
    }
    return tail;
}

Source File: Graph.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Scores the graph using each term's own score via {@code mergerByScore}, then
 * resolves the best path.
 */
public void walkPathByScore() {
    // Score the transitions out of the BEGIN node first.
    mergerByScore(root, 0);

    // Then score forward, slot by slot, starting from the first word.
    for (int slot = 0; slot < terms.length; slot++) {
        for (Term node = terms[slot];
                node != null && node.from() != null && node != end;
                node = node.next()) {
            mergerByScore(node, node.toValue());
        }
    }
    optimalRoot();
}

Source File: Graph.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Applies {@code setPathSelfScore(fromTerm)} to every candidate term chained at
 * position {@code to}; does nothing when that slot is empty.
 *
 * @param fromTerm the preceding term on the path
 * @param to       index in {@code terms} of the candidates to score
 */
private void mergerByScore(Term fromTerm, int to) {
    for (Term candidate = terms[to]; candidate != null; candidate = candidate.next()) {
        // Relation: to.set(from)
        candidate.setPathSelfScore(fromTerm);
    }
}

Source File: Graph.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Debug helper: prints one line per non-empty slot to standard output, listing the
 * head term's name and score followed by each chained term and its score.
 */
public void printGraph() {
    for (Term head : terms) {
        if (head != null) {
            System.out.print(head.getName() + "\t" + head.score() + " ,");
            // Chained terms are printed through their string form, not getName().
            for (Term sibling = head.next(); sibling != null; sibling = sibling.next()) {
                System.out.print(sibling + "\t" + sibling.score() + " ,");
            }
            System.out.println();
        }
    }
}

Source File: TermUtil.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Merges two terms into one brand-new term whose name is the concatenation of both
 * names and whose offset is that of {@code from}.
 *
 * @param from        the left term; its {@code numAttr} is copied onto the merged term's natures
 * @param to          the right term; its successor becomes the merged term's successor
 * @param termNatures natures for the merged term (its {@code numAttr} field is overwritten)
 * @return the merged term
 */
public static Term makeNewTermNum(Term from, Term to, TermNatures termNatures) {
    String mergedName = from.getName() + to.getName();
    Term merged = new Term(mergedName, from.getOffe(), termNatures);
    merged.termNatures().numAttr = from.termNatures().numAttr;

    Term successor = to.to();
    TermUtil.termLink(merged, successor);

    // NOTE(review): merged was just constructed, so merged.from() is presumably still
    // unset here — confirm whether from.from() was the intended predecessor.
    Term predecessor = merged.from();
    TermUtil.termLink(predecessor, merged);
    return merged;
}

Source File: TermUtil.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Refines the nature of a term currently tagged {@code Nature.NW}.
 *
 * <p>Terms with another nature, or whose name has three characters or fewer, are left
 * untouched. A name recognised by {@code ForeignPersonRecognition.isFName} is retagged
 * {@code "nrf"} (foreign person name). Otherwise the last sub-term is looked up in
 * {@code companyMap}, and if its second counter exceeds 1000 the term is retagged
 * {@code "nt"} (organisation name).
 *
 * @param term the term to refine in place
 */
public static void parseNature(Term term) {
    if (!Nature.NW.equals(term.natrue())) {
        return;
    }

    String name = term.getName();
    if (name.length() <= 3) {
        return;
    }

    // Foreign person name?
    if (ForeignPersonRecognition.isFName(name)) {
        term.setNature(NatureLibrary.getNature("nrf"));
        return;
    }

    List<Term> subTerm = term.getSubTerm();
    term.setSubTerm(subTerm);

    // Organisation name? Only the last sub-term contributes to the decision; the
    // former lookup for the first sub-term was overwritten before use and is removed.
    Term last = subTerm.get(subTerm.size() - 1);
    int[] is = companyMap.get(last.getName());
    // NOTE(review): the meaning of index 1 in the companyMap entry is not visible here.
    if (is != null && is[1] > 1000) {
        term.setNature(NatureLibrary.getNature("nt"));
    }
}

Source File: TermUtil.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Collects the terms strictly between {@code from} and {@code to} by following
 * {@code to()} links.
 *
 * @param from the term before the first collected term (exclusive)
 * @param to   the term at which collection stops (exclusive)
 * @return the intermediate terms in link order
 */
public static List<Term> getSubTerm(Term from, Term to) {
    List<Term> between = new ArrayList<>(3);
    for (Term cursor = from.to(); cursor != to; cursor = cursor.to()) {
        between.add(cursor);
    }
    return between;
}

Source File: MathUtil.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Computes the path score of moving from one term to the next.
 *
 * @param from        the preceding term
 * @param to          the following term
 * @param relationMap optional extra pair frequencies keyed by
 *                    {@code fromName + TAB + toName}; may be {@code null}
 * @return the accumulated score, i.e. {@code from.score()} plus the transition cost
 */
public static double compuScore(Term from, Term to, Map<String, Double> relationMap) {
    double frequency = from.termNatures().allFreq + 1;

    // A negative frequency gets a fixed penalty, which is also written back to from.
    if (frequency < 0) {
        double penalised = from.score() + MAX_FREQUENCE;
        from.score(penalised);
        return penalised;
    }

    double pairFreq = NgramLibrary.getTwoWordFreq(from, to);

    Double boost = (relationMap == null) ? null : relationMap.get(from.getName() + TAB + to.getName());
    if (boost != null) {
        pairFreq += boost;
    }

    // Smoothed mix of the unigram part and the bigram part.
    double unigramPart = D_SMOOTHING_PARA * frequency / (MAX_FREQUENCE + 80000);
    double bigramPart = (1 - D_SMOOTHING_PARA) * ((1 - D_TEMP) * pairFreq / frequency + D_TEMP);
    double value = -Math.log(unigramPart + bigramPart);

    if (value < 0) {
        value += frequency;
    }
    return from.score() + value;
}

Source File: NameFix.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Person-name disambiguation over the resolved term array, modifying it in place.
 *
 * <p>Example: "邓颖超生前" segmented as "邓颖 / 超生 / 前" is corrected to "邓颖超 / 生 / 前".
 * A second, rule-based pass merges two person names that are joined by a separator
 * character which {@code WordAlert.CharCover} maps to '·'.
 *
 * @param terms   the term array to fix up in place
 * @param forests dictionaries handed to {@code NatureRecognition} when the remainder
 *                of a split word is re-tagged
 */
public static void nameAmbiguity(Term[] terms, Forest... forests) {
    Term from = null;
    Term term = null;
    Term next = null;
    // Pass 1: a two-character term whose natures are exactly TermNatures.NR may absorb
    // the first character of the term that starts two slots later.
    for (int i = 0; i < terms.length - 1; i++) {
        term = terms[i];
        if (term != null && term.termNatures() == TermNatures.NR && term.getName().length() == 2) {
            // NOTE(review): terms[i + 2] is dereferenced without a null or bounds check;
            // presumably the resolved path guarantees a term there — confirm.
            next = terms[i + 2];
            if (next.termNatures().personAttr.split > 0) {
                // Move the first character of next onto the end of the name.
                term.setName(term.getName() + next.getName().charAt(0));
                terms[i + 2] = null;

                // The rest of next becomes a new term one position later, with freshly
                // recognised natures, linked between the name and next's successor.
                String name = next.getName().substring(1);
                terms[i + 3] = new Term(name, next.getOffe() + 1,
                                new NatureRecognition(forests).getTermNatures(name));
                TermUtil.termLink(term, terms[i + 3]);
                TermUtil.termLink(terms[i + 3], next.to());
            }
        }
    }

    // Pass 2: foreign person name fix-up.
    for (int i = 0; i < terms.length; i++) {
        term = terms[i];
        if (term != null && term.getName().length() == 1 && i > 0
                        && WordAlert.CharCover(term.getName().charAt(0)) == '·') {
            // NOTE(review): from and next are dereferenced without null checks — confirm
            // that a separator term always has both neighbours linked.
            from = term.from();
            next = term.to();

            // Merge "name · name" into the left term when both neighbours have a nature
            // string starting with "nr", then clear the separator and the right name.
            if (from.natrue().natureStr.startsWith("nr") && next.natrue().natureStr.startsWith("nr")) {
                from.setName(from.getName() + term.getName() + next.getName());
                TermUtil.termLink(from, next.to());
                terms[i] = null;
                terms[i + 1] = null;
            }
        }
    }

}

Source File: NgramLibrary.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Looks up the bigram frequency of two consecutive terms.
 *
 * @param from the preceding term
 * @param to   the following term
 * @return the frequency recorded in {@code from}'s bigram map for {@code to}'s item
 *         index, or 0 when there is no map or no entry
 */
public static int getTwoWordFreq(Term from, Term to) {
    if (from.item().bigramEntryMap != null) {
        Integer hit = from.item().bigramEntryMap.get(to.item().getIndex());
        return hit == null ? 0 : hit;
    }
    return 0;
}

Source File: NatureLibrary.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Looks up the transition frequency between the natures of two terms.
 *
 * @param fromTerm the preceding term
 * @param toTerm   the following term
 * @return the {@code NATURETABLE} entry for the two nature indices, or 0 when either
 *         index is negative
 */
public static int getTwoTermFreq(Term fromTerm, Term toTerm) {
    int row = fromTerm.natrue().index;
    int column = toTerm.natrue().index;
    if (row >= 0 && column >= 0) {
        return NATURETABLE[row][column];
    }
    return 0;
}

org.ansj.domain.Term Java Examples