com.hankcs.hanlp.utility.TextUtility Java Examples

The following examples show how to use com.hankcs.hanlp.utility.TextUtility. Each example is taken from an open-source project; the project name, source file, and license are listed above its code.
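Before the project-specific examples, here is a minimal, self-contained sketch of the two TextUtility methods the examples below rely on: isBlank and exceptionToString. The expected outputs in the comments reflect how the examples use these methods and are an assumption, not a documented contract.

import com.hankcs.hanlp.utility.TextUtility;

public class TextUtilityDemo
{
    public static void main(String[] args)
    {
        // isBlank is used below to skip tokens that are null, empty, or whitespace only
        System.out.println(TextUtility.isBlank("   "));   // expected: true
        System.out.println(TextUtility.isBlank("HanLP")); // expected: false

        // exceptionToString is used below to turn a caught exception into an error message
        try
        {
            throw new IllegalStateException("demo");
        }
        catch (Exception e)
        {
            System.out.println(TextUtility.exceptionToString(e));
        }
    }
}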
Example #1
Source File: Nlputil.java    From dk-fitting with Apache License 2.0
/**
 * Add entries to the custom dictionary.
 *
 * @param filePath path to the new dictionary file, one word per line (separated by CR/LF)
 * @param encoding file encoding
 * @return empty on success; otherwise an error message
 */
public static String addCK(String filePath, String encoding)
{
    if (filePath == null || encoding == null) return String.format("参数错误:addCK(%s, %s)", filePath, encoding); // "参数错误" = invalid arguments
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(filePath), encoding));
        String line;
        synchronized (lockCustomDictionary)
        {
            while ((line = br.readLine()) != null)
            {
                CustomDictionary.insert(line);
            }
        }
        br.close();
    }
    catch (Exception e)
    {
        System.out.println(e);
        return TextUtility.exceptionToString(e);
    }

    return "添加成功";
}
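
A call to the helper above might look like the following sketch; the dictionary path and encoding are illustrative placeholders, not files shipped with dk-fitting.

// Hypothetical usage of Nlputil.addCK; the path and encoding are made-up values
String result = Nlputil.addCK("/data/custom_words.txt", "UTF-8");
System.out.println(result); // "添加成功" (added successfully) or an error message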
 
Example #2
Source File: DKNLPBase.java    From dk-fitting with Apache License 2.0
/**
 * Add entries to the custom dictionary.
 *
 * @param filePath path to the new dictionary file, one word per line (separated by CR/LF)
 * @param encoding file encoding
 * @return null on success; otherwise an error message
 */
public static String addCK(String filePath, String encoding)
{
    if (filePath == null || encoding == null) return String.format("参数错误:addCK(%s, %s)", filePath, encoding); // "参数错误" = invalid arguments
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(filePath), encoding));
        String line;
        synchronized (lockCustomDictionary)
        {
            while ((line = br.readLine()) != null)
            {
                CustomDictionary.insert(line);
            }
        }
        br.close();
    }
    catch (Exception e)
    {
        return TextUtility.exceptionToString(e);
    }

    return null;
}
 
Example #3
Source File: HanLpTokenizer.java    From jstarcraft-nlp with Apache License 2.0
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    int position = 0;
    Term term;
    boolean un_increased = true;
    do {
        term = segment.next();
        if (term == null) {
            break;
        }
        if (TextUtility.isBlank(term.word)) {
            // skip whitespace tokens to improve indexing efficiency
            continue;
        }

        if (filter != null && filter.containsKey(term.word)) {
            continue;
        } else {
            ++position;
            un_increased = false;
        }
    } while (un_increased);

    if (term != null) {
        positionAttribute.setPositionIncrement(position);
        termAttribute.setEmpty().append(term.word);
        offsetAttribute.setOffset(correctOffset(totalOffset + term.offset), correctOffset(totalOffset + term.offset + term.word.length()));
        typeAttribute.setType(term.nature == null ? "null" : term.nature.toString());
        return true;
    } else {
        totalOffset += segment.offset;
        return false;
    }
}
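
The incrementToken() implementation above essentially walks a HanLP segment and drops blank tokens before emitting them. The same filtering idea can be sketched outside Lucene with the plain HanLP API; using HanLP.segment here instead of the tokenizer's own segment field is an assumption made for the sake of a self-contained example.

import java.util.List;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.utility.TextUtility;

public class BlankFilterSketch
{
    public static void main(String[] args)
    {
        // Segment a sentence that contains whitespace between words
        List<Term> terms = HanLP.segment("商品 和 服务");
        for (Term term : terms)
        {
            if (TextUtility.isBlank(term.word))
            {
                continue; // skip whitespace tokens, mirroring the tokenizer above
            }
            System.out.println(term.word + "/" + term.nature);
        }
    }
}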
 
Example #4
Source File: HanLPTokenizer.java    From elasticsearch-analysis-hanlp with Apache License 2.0
@Override
final public boolean incrementToken() throws IOException {
    clearAttributes();
    int position = 0;
    Term term;
    boolean unIncreased = true;
    do {
        term = segment.next();
        if (term == null) {
            totalOffset += segment.offset;
            return false;
        }
        if (TextUtility.isBlank(term.word)) {
            totalOffset += term.length();
            continue;
        }
        if (configuration.isEnablePorterStemming() && term.nature == Nature.nx) {
            term.word = stemmer.stem(term.word);
        }
        final Term copyTerm = term;
        if ((!this.configuration.isEnableStopDictionary()) || (!AccessController.doPrivileged(
            (PrivilegedAction<Boolean>)() -> CoreStopWordDictionary.shouldRemove(copyTerm)))) {
            position++;
            unIncreased = false;
        } else {
            totalOffset += term.length();
        }
    }
    while (unIncreased);

    positionAttr.setPositionIncrement(position);
    termAtt.setEmpty().append(term.word);
    offsetAtt.setOffset(correctOffset(term.offset), correctOffset(term.offset + term.word.length()));
    typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
    totalOffset += term.length();
    return true;
}
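
Compared with Example #3, this tokenizer also applies Porter stemming to English tokens (nature nx) and removes stop words via CoreStopWordDictionary.shouldRemove, wrapped in AccessController.doPrivileged for the Elasticsearch security manager. Outside that sandbox the stop-word check can be sketched directly; whether a given word (such as 的) is actually filtered depends on the stop-word dictionary that is loaded, so the comment below is an assumption.

import java.util.List;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.common.Term;

public class StopWordSketch
{
    public static void main(String[] args)
    {
        List<Term> terms = HanLP.segment("商品和服务的价格");
        for (Term term : terms)
        {
            // shouldRemove consults the configured stop-word dictionary (no doPrivileged needed here)
            if (CoreStopWordDictionary.shouldRemove(term))
            {
                continue; // e.g. "的" is typically a stop word in the default dictionary
            }
            System.out.println(term.word);
        }
    }
}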
 
Example #5
Source File: HanLPTokenizer.java    From hanlp-lucene-plugin with Apache License 2.0
@Override
final public boolean incrementToken() throws IOException
{
    clearAttributes();
    int position = 0;
    Term term;
    boolean un_increased = true;
    do
    {
        term = segment.next();
        if (term == null)
        {
            break;
        }
        if (TextUtility.isBlank(term.word)) // skip whitespace tokens to improve indexing efficiency
        {
            continue;
        }
        if (enablePorterStemming && term.nature == Nature.nx)
        {
            term.word = stemmer.stem(term.word);
        }

        if (filter != null && filter.containsKey(term.word))
        {
            continue;
        }
        else
        {
            ++position;
            un_increased = false;
        }
    }
    while (un_increased);

    if (term != null)
    {
        positionAttr.setPositionIncrement(position);
        termAtt.setEmpty().append(term.word);
        offsetAtt.setOffset(correctOffset(totalOffset + term.offset),
                            correctOffset(totalOffset + term.offset + term.word.length()));
        typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
        return true;
    }
    else
    {
        totalOffset += segment.offset;
        return false;
    }
}