com.hankcs.hanlp.corpus.tag.Nature Java Examples

The following examples show how to use com.hankcs.hanlp.corpus.tag.Nature. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: RemoteMonitor.java    From elasticsearch-analysis-hanlp with Apache License 2.0 6 votes vote down vote up
/**
 * Renders the nature/frequency pairs of a dictionary line as a space-separated string.
 *
 * @param defaultNature nature to fall back on when the line carries no explicit pairs
 * @param param         tokenized line: [word, natureA, freqA, natureB, freqB, ...]
 * @return "natureA freqA natureB freqB ..." or "defaultNature 1000" when no pairs exist
 */
private String analysisNatureWithFrequency(Nature defaultNature, String[] param) {
    int pairCount = (param.length - 1) / 2;
    if (pairCount == 0) {
        // No explicit nature/frequency pairs: fall back to the default nature with a fixed frequency.
        return defaultNature + " " + 1000;
    }
    StringBuilder result = new StringBuilder();
    for (int index = 0; index < pairCount; index++) {
        if (index > 0) {
            result.append(" ");
        }
        Nature nature = LexiconUtility.convertStringToNature(param[1 + 2 * index]);
        int frequency = Integer.parseInt(param[2 + 2 * index]);
        result.append(nature).append(" ").append(frequency);
    }
    return result.toString();
}
 
Example #2
Source File: EmailSegment.java    From hanlp-lucene-plugin with Apache License 2.0 6 votes vote down vote up
/**
 * Emits three overlapping tokens for every e-mail address found in the input:
 * the full address, the local part (group 1) and the domain part (group 2),
 * each tagged {@code Nature.nx} and anchored at the match's start offset.
 */
@Override
protected List<Term> segSentence(char[] chars)
{
    List<Term> terms = new ArrayList<>();
    Matcher matcher = emailPattern.matcher(new String(chars));
    while (matcher.find())
    {
        int matchStart = matcher.start();
        terms.add(termAt(matcher.group(), matchStart));
        terms.add(termAt(matcher.group(1), matchStart));
        terms.add(termAt(matcher.group(2), matchStart));
    }
    return terms;
}

/** Creates an nx-tagged term positioned at the given character offset. */
private static Term termAt(String word, final int start)
{
    return new Term(word, Nature.nx)
    {{
        offset = start;
    }};
}
 
Example #3
Source File: HanLpToken.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Maps this term's HanLP nature to an NLP tag: the begin/end sentinel natures
 * map to X, academic vocabulary ('g'-prefixed natures) maps to N, and every
 * other nature is delegated to the Peking University tagger.
 */
@Override
public NlpTag getTag() {
    Nature nature = term.nature;
    if (nature == Nature.begin || nature == Nature.end) {
        // Sentence boundary sentinels carry no linguistic category.
        return NlpTag.X;
    }
    // Academic words ('g' prefix) are treated as nouns.
    return nature.firstChar() == 'g'
            ? NlpTag.N
            : PekingUniversityTagger.CHINESE_TAGGER.getTag(nature.toString());
}
 
Example #4
Source File: DependencyParser.java    From AHANLP with Apache License 2.0 5 votes vote down vote up
/**
 * Collects the words along a dependency path, walking from the given word
 * towards the root of the parse.
 *
 * @param word      starting word of the path
 * @param maxReturn maximum number of words to collect
 * @return terms on the path, nearest first; empty for ROOT or a non-positive limit
 */
public static List<Term> getWordsInPath(CoNLLWord word, int maxReturn) {
    List<Term> path = new ArrayList<Term>();
    if (word == CoNLLWord.ROOT || maxReturn < 1) {
        return path;
    }
    // Follow HEAD links upward, stopping at ROOT or after maxReturn words.
    for (CoNLLWord current = word; current != CoNLLWord.ROOT && maxReturn-- >= 1; current = current.HEAD) {
        path.add(new Term(current.LEMMA, Nature.fromString(current.POSTAG)));
    }
    return path;
}
 
Example #5
Source File: RemoteMonitor.java    From elasticsearch-analysis-hanlp with Apache License 2.0 5 votes vote down vote up
/**
 * Splits a dictionary location entry into its path and optional default nature.
 * The entry is either "path" alone or "path nature" separated by a space.
 *
 * @param location configured location string
 * @return tuple of (path, default nature); the nature defaults to {@code Nature.n}
 */
private Tuple<String, Nature> analysisDefaultInfo(String location) {
    int separator = location.indexOf(' ');
    if (separator <= 0) {
        // No explicit nature configured: fall back to the noun nature.
        return Tuple.tuple(location, Nature.n);
    }
    String path = location.substring(0, separator);
    Nature defaultNature = LexiconUtility.convertStringToNature(location.substring(separator + 1));
    return Tuple.tuple(path, defaultNature);
}
 
Example #6
Source File: HanLPTokenizer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 5 votes vote down vote up
/**
 * Advances this token stream to the next usable term from the segmenter,
 * skipping blank terms and (optionally) stopwords, and publishes it to the
 * Lucene term/offset/type attributes.
 *
 * @return true if a token was emitted, false when the segment is exhausted
 */
@Override
final public boolean incrementToken() throws IOException {
    clearAttributes();
    int position = 0;
    Term term;
    boolean unIncreased = true;
    do {
        term = segment.next();
        if (term == null) {
            // Stream exhausted: fold the segment's final offset into the running total.
            totalOffset += segment.offset;
            return false;
        }
        if (TextUtility.isBlank(term.word)) {
            // Skip whitespace-only terms but still advance the cumulative offset.
            totalOffset += term.length();
            continue;
        }
        if (configuration.isEnablePorterStemming() && term.nature == Nature.nx) {
            // Only foreign-word terms (nx) are stemmed.
            term.word = stemmer.stem(term.word);
        }
        final Term copyTerm = term;
        // Emit unless stopword filtering is on and the dictionary says to drop this term.
        // doPrivileged: the stopword lookup may touch resources restricted by the ES security manager.
        if ((!this.configuration.isEnableStopDictionary()) || (!AccessController.doPrivileged(
            (PrivilegedAction<Boolean>)() -> CoreStopWordDictionary.shouldRemove(copyTerm)))) {
            position++;
            unIncreased = false;
        } else {
            // Dropped stopword: account for its length so offsets stay aligned.
            totalOffset += term.length();
        }
    }
    while (unIncreased);

    // position counts the skipped stopwords plus this token, preserving gaps.
    positionAttr.setPositionIncrement(position);
    termAtt.setEmpty().append(term.word);
    offsetAtt.setOffset(correctOffset(term.offset), correctOffset(term.offset + term.word.length()));
    typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
    totalOffset += term.length();
    return true;
}
 
Example #7
Source File: HanLPTokenizer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 5 votes vote down vote up
/**
 * Advances this token stream to the next term from the segmenter that is not
 * suppressed by the word filter, and publishes it to the Lucene attributes.
 *
 * @return true if a token was emitted, false when the segment is exhausted
 */
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    int position = 0;
    Term term;
    boolean un_increased = true;
    do {
        term = segment.next();
        if (term == null) {
            // Segmenter exhausted; handled after the loop.
            break;
        }
        if (enablePorterStemming && term.nature == Nature.nx) {
            // Only foreign-word terms (nx) are stemmed.
            term.word = stemmer.stem(term.word);
        }

        if (filter != null && filter.containsKey(term.word)) {
            // Filtered word: skip it but keep counting the position gap.
            continue;
        } else {
            ++position;
            un_increased = false;
        }
    }
    while (un_increased);

    if (term != null) {
        // position counts skipped terms plus this one, preserving position gaps.
        positionAttr.setPositionIncrement(position);
        termAtt.setEmpty().append(term.word);
        // Offsets are relative to the segment, so add the running total.
        offsetAtt.setOffset(correctOffset(totalOffset + term.offset),
                            correctOffset(totalOffset + term.offset + term.word.length()));
        typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
        return true;
    } else {
        // End of stream: fold the segment's final offset into the running total.
        totalOffset += segment.offset;
        return false;
    }
}
 
Example #8
Source File: HanLpToken.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/** Returns the textual form of this term's HanLP nature (part of speech). */
@Override
public String getNature() {
    return term.nature.toString();
}
 
Example #9
Source File: CustomDictionaryUtility.java    From elasticsearch-analysis-hanlp with Apache License 2.0 4 votes vote down vote up
/**
 * Loads a custom dictionary file into the given map (appending).
 * Each line is "word [natureA freqA natureB freqB ...]"; when no pairs are
 * present the word gets the default nature. CSV files use "," as the splitter,
 * everything else splits on whitespace.
 *
 * @param path                  dictionary file path
 * @param defaultNature         nature applied to words without explicit pairs
 * @param map                   target map receiving word -> attribute entries
 * @param customNatureCollector collects natures not already known to the lexicon
 * @return true on success, false if the file could not be read or parsed
 */
private static boolean load(String path, Nature defaultNature, TreeMap<String, CoreDictionary.Attribute> map, LinkedHashSet<Nature> customNatureCollector) {
    String splitter = path.endsWith(".csv") ? "," : "\\s";
    // try-with-resources: the original leaked the reader when an exception
    // (e.g. NumberFormatException) was thrown mid-parse before br.close().
    try (BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"))) {
        String line;
        boolean firstLine = true;
        while ((line = br.readLine()) != null) {
            if (firstLine) {
                // Strip a possible UTF-8 byte-order mark from the first line.
                line = IOUtil.removeUTF8BOM(line);
                firstLine = false;
            }
            String[] param = line.split(splitter);
            // Skip empty lines.
            if (param[0].length() == 0) {
                continue;
            }
            // Normalize characters when globally enabled.
            if (HanLP.Config.Normalization) {
                param[0] = CharTable.convert(param[0]);
            }
            int natureCount = (param.length - 1) / 2;
            CoreDictionary.Attribute attribute;
            if (natureCount == 0) {
                attribute = new CoreDictionary.Attribute(defaultNature);
            } else {
                attribute = new CoreDictionary.Attribute(natureCount);
                for (int i = 0; i < natureCount; ++i) {
                    attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i], customNatureCollector);
                    attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
                    attribute.totalFrequency += attribute.frequency[i];
                }
            }
            map.put(param[0], attribute);
        }
    } catch (Exception e) {
        logger.error("hanlp custom dictionary [{}] read failed!", path, e);
        return false;
    }
    return true;
}
 
Example #10
Source File: HanLPTokenizer.java    From hanlp-lucene-plugin with Apache License 2.0 4 votes vote down vote up
/**
 * Advances this token stream to the next term from the segmenter that is
 * neither blank nor suppressed by the word filter, and publishes it to the
 * Lucene term/offset/type attributes.
 *
 * @return true if a token was emitted, false when the segment is exhausted
 */
@Override
final public boolean incrementToken() throws IOException
{
    clearAttributes();
    int position = 0;
    Term term;
    boolean un_increased = true;
    do
    {
        term = segment.next();
        if (term == null)
        {
            // Segmenter exhausted; handled after the loop.
            break;
        }
        if (TextUtility.isBlank(term.word)) // 过滤掉空白符,提高索引效率
        {
            // Blank terms are dropped entirely (no position gap recorded).
            continue;
        }
        if (enablePorterStemming && term.nature == Nature.nx)
        {
            // Only foreign-word terms (nx) are stemmed.
            term.word = stemmer.stem(term.word);
        }

        if (filter != null && filter.containsKey(term.word))
        {
            // Filtered word: skip it but keep counting the position gap.
            continue;
        }
        else
        {
            ++position;
            un_increased = false;
        }
    }
    while (un_increased);

    if (term != null)
    {
        // position counts skipped terms plus this one, preserving position gaps.
        positionAttr.setPositionIncrement(position);
        termAtt.setEmpty().append(term.word);
        // Offsets are relative to the segment, so add the running total.
        offsetAtt.setOffset(correctOffset(totalOffset + term.offset),
                            correctOffset(totalOffset + term.offset + term.word.length()));
        typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
        return true;
    }
    else
    {
        // End of stream: fold the segment's final offset into the running total.
        totalOffset += segment.offset;
        return false;
    }
}