com.hankcs.hanlp.corpus.io.IOUtil Java Examples

The following examples show how to use com.hankcs.hanlp.corpus.io.IOUtil. They are extracted from open source projects; the source file, project, and license are noted above each example.
Example #1
Source File: Nlputil.java    From dk-fitting with Apache License 2.0
/**
 * Add entries to the custom dictionary.
 *
 * @param filePath path of the new dictionary file, one word per line
 * @param encoding file encoding
 * @return a success message on completion, otherwise the error text
 */
public static String addCK(String filePath, String encoding)
{
    if (filePath == null || encoding == null) return String.format("invalid arguments: addCK(%s, %s)", filePath, encoding);
    // try-with-resources closes the reader even if an insert fails
    try (BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(filePath), encoding)))
    {
        String line;
        synchronized (lockCustomDictionary)
        {
            while ((line = br.readLine()) != null)
            {
                CustomDictionary.insert(line);
            }
        }
    }
    catch (Exception e)
    {
        return TextUtility.exceptionToString(e);
    }

    return "added successfully";
}
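
A minimal usage sketch (the file path below is hypothetical); each line of the word list becomes one CustomDictionary entry:

String result = Nlputil.addCK("data/custom_words.txt", "UTF-8");
System.out.println(result);  // "added successfully", or the exception text on failure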
 
Example #2
Source File: Nlputil.java    From dk-fitting with Apache License 2.0
/**
 * Train a classification model.
 *
 * @param corpusPath corpus directory
 * @param modelPath  path to save the model
 * @return "done" on success, otherwise the exception text
 */
public static String trainModel(String corpusPath, String modelPath)
{
    IClassifier classifier = new LinearSVMClassifier();
    try
    {
        IDataSet dataSet = DKNLPBase.configuration == null ? new MemoryDataSet().load(corpusPath) : new HDFSDataSet(DKNLPBase.configuration).load(corpusPath);
        classifier.train(dataSet);
        IOUtil.saveObjectTo(classifier.getModel(), modelPath);
    }
    catch (Exception e)
    {
        e.printStackTrace();
        return e.toString();
    }

    return "完成";
}
 
Example #3
Source File: Nlputil.java    From dk-fitting with Apache License 2.0
/**
 * Classify a text file.
 *
 * @param modelPath path of the saved model
 * @param filePath  path of the text file to classify
 * @return the predicted category, or an error message
 */
public static String classify(String modelPath, String filePath)
{
    if (modelPath == null || filePath == null){
        return "路径为空";
    }
    Object model = GlobalObjectPool.get(modelPath);
    if (model == null){
        model = IOUtil.readObjectFrom(modelPath);
        GlobalObjectPool.put(modelPath, model);
    }
    if (model instanceof AbstractModel){
        return classify((AbstractModel) model, filePath);
    }
    return "成功";
}
 
Example #4
Source File: Nlputil.java    From dk-fitting with Apache License 2.0
/**
 * Classify a text file.
 *
 * @param model    the classification model
 * @param filePath path of the text file to classify
 * @return the predicted category, or null if the text or model type is unusable
 */
public static String classify(AbstractModel model, String filePath)
{
    String txt = IOUtil.readTxt(filePath);
    if (txt == null) return null;

    IClassifier classifier;
    if (model instanceof LinearSVMModel)
    {
        classifier = new LinearSVMClassifier((LinearSVMModel) model);
    }
    else if (model instanceof NaiveBayesModel)
    {
        classifier = new NaiveBayesClassifier((NaiveBayesModel) model);
    }
    else return null;

    return classifier.classify(txt);
}
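
Taken together with Examples #2 and #3, a hedged end-to-end sketch (all paths hypothetical): train and persist a model, then classify a document through the pooled copy.

String trained = Nlputil.trainModel("data/corpus", "data/classifier.model");
if ("done".equals(trained)) {
    String category = Nlputil.classify("data/classifier.model", "data/doc.txt");
    System.out.println(category);
}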
 
Example #5
Source File: DKNLPBase.java    From dk-fitting with Apache License 2.0
/**
 * Add entries to the custom dictionary.
 *
 * @param filePath path of the new dictionary file, one word per line
 * @param encoding file encoding
 * @return null on completion, otherwise the error text
 */
public static String addCK(String filePath, String encoding)
{
    if (filePath == null || encoding == null) return String.format("invalid arguments: addCK(%s, %s)", filePath, encoding);
    // try-with-resources closes the reader even if an insert fails
    try (BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(filePath), encoding)))
    {
        String line;
        synchronized (lockCustomDictionary)
        {
            while ((line = br.readLine()) != null)
            {
                CustomDictionary.insert(line);
            }
        }
    }
    catch (Exception e)
    {
        return TextUtility.exceptionToString(e);
    }

    return null;
}
 
Example #6
Source File: DKNLPClassification.java    From dk-fitting with Apache License 2.0
/**
 * Train a classification model.
 *
 * @param corpusPath corpus directory
 * @param modelPath  path to save the model
 */
public static void trainModel(String corpusPath, String modelPath)
{
    IClassifier classifier = new LinearSVMClassifier();
    try
    {
        IDataSet dataSet = DKNLPBase.configuration == null ? new MemoryDataSet().load(corpusPath) :
                new HDFSDataSet(DKNLPBase.configuration).load(corpusPath);
        classifier.train(dataSet);
        IOUtil.saveObjectTo(classifier.getModel(), modelPath);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
}
 
Example #7
Source File: DKNLPClassification.java    From dk-fitting with Apache License 2.0
/**
 * Classify a text file.
 *
 * @param model    the classification model
 * @param filePath path of the text file to classify
 * @return the predicted category, or null if the text or model type is unusable
 */
public static String classify(AbstractModel model, String filePath)
{
    String txt = IOUtil.readTxt(filePath);
    if (txt == null) return null;

    IClassifier classifier;
    if (model instanceof LinearSVMModel)
    {
        classifier = new LinearSVMClassifier((LinearSVMModel) model);
    }
    else if (model instanceof NaiveBayesModel)
    {
        classifier = new NaiveBayesClassifier((NaiveBayesModel) model);
    }
    else return null;

    return classifier.classify(txt);
}
 
Example #8
Source File: WordFreqStatistics.java    From similarity with Apache License 2.0
public static void statistics(Segment segment, String inputFilePath) {
    try {
        // word frequency statistics
        WordFreqStatistics statistic = new WordFreqStatistics(segment);
        BufferedReader reader = IOUtil.newBufferedReader(inputFilePath);
        String t;
        StringBuilder s = new StringBuilder();
        while ((t = reader.readLine()) != null) {
            s.append(t);
        }
        statistic.seg(s.toString());
        statistic.setResultPath(inputFilePath.replace(".txt", "") + "-WordFrequencyStatistics-Result.txt");
        statistic.dump();
        reader.close();
    } catch (IOException e) {
        logger.error("IO error: " + e.getLocalizedMessage());
    }
}
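
A minimal invocation sketch, assuming a plain-text input file; HanLP.newSegment() is the library's stock segmenter factory:

WordFreqStatistics.statistics(HanLP.newSegment(), "data/article.txt");
// the result is written to data/article-WordFrequencyStatistics-Result.txt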
 
Example #9
Source File: HanLpTokenizerFactory.java    From jstarcraft-nlp with Apache License 2.0
/**
 * Initialize the factory.
 *
 * @param configuration map carrying the settings from the XML configuration
 */
public HanLpTokenizerFactory(Map<String, String> configuration) {
    super(configuration);
    enableIndexMode = getBoolean(configuration, "enableIndexMode", true);
    enableNumberQuantifierRecognize = getBoolean(configuration, "enableNumberQuantifierRecognize", false);
    enableCustomDictionary = getBoolean(configuration, "enableCustomDictionary", true);
    enableCustomDictionaryForcing = getBoolean(configuration, "enableCustomDictionaryForcing", true);
    enableTranslatedNameRecognize = getBoolean(configuration, "enableTranslatedNameRecognize", false);
    enableJapaneseNameRecognize = getBoolean(configuration, "enableJapaneseNameRecognize", false);
    enableOrganizationRecognize = getBoolean(configuration, "enableOrganizationRecognize", false);
    enableNameRecognize = getBoolean(configuration, "enableNameRecognize", false);
    enablePlaceRecognize = getBoolean(configuration, "enablePlaceRecognize", false);
    enableTraditionalChineseMode = getBoolean(configuration, "enableTraditionalChineseMode", false);
    HanLP.Config.Normalization = getBoolean(configuration, "enableNormalization", HanLP.Config.Normalization);
    algorithm = getString(configuration, "algorithm", "viterbi");
    Set<String> customDictionaryPathSet = getSet(configuration, "customDictionaryPath");
    if (customDictionaryPathSet != null) {
        HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]);
    }
    String stopWordDictionaryPath = get(configuration, "stopWordDictionaryPath");
    if (stopWordDictionaryPath != null) {
        stopWordDictionary = new TreeSet<>();
        stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
    }
    if (getBoolean(configuration, "enableDebug", false)) {
        HanLP.Config.enableDebug();
    }
}
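
The factory is normally wired up through an XML analyzer configuration, but the same settings can be passed programmatically; a sketch, assuming the constructor accepts a plain map as shown above (the dictionary path is hypothetical):

Map<String, String> configuration = new HashMap<>();
configuration.put("enableIndexMode", "true");
configuration.put("algorithm", "viterbi");
configuration.put("customDictionaryPath", "data/dictionary/custom/my_dict.txt");
HanLpTokenizerFactory factory = new HanLpTokenizerFactory(configuration);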
 
Example #10
Source File: DKNLPClassification.java    From dk-fitting with Apache License 2.0
/**
 * Classify a text file.
 *
 * @param modelPath path of the saved model
 * @param filePath  path of the text file to classify
 * @return the predicted category, or null
 */
public static String classify(String modelPath, String filePath)
{
    if (modelPath == null || filePath == null) return null;
    Object model = GlobalObjectPool.get(modelPath);
    if (model == null)
    {
        model = IOUtil.readObjectFrom(modelPath);
        GlobalObjectPool.put(modelPath, model);
    }
    if (model instanceof AbstractModel)
        return classify((AbstractModel) model, filePath);
    return null;
}
 
Example #11
Source File: CustomDictionaryUtility.java    From elasticsearch-analysis-hanlp with Apache License 2.0
public static boolean reload() {
    CustomDictionary.dat.getSize();
    String[] paths = HanLP.Config.CustomDictionaryPath;
    if (paths == null || paths.length == 0) {
        return false;
    }
    logger.debug("begin delete hanlp custom dictionary cache");
    IOUtil.deleteFile(paths[0] + Predefine.BIN_EXT);
    logger.debug("delete hanlp custom dictionary cache successfully");
    return loadMainDictionary(paths[0]);
}
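
A hedged usage sketch: point HanLP at a custom dictionary (hypothetical path), then drop the stale .bin cache and re-parse it.

HanLP.Config.CustomDictionaryPath = new String[]{"data/dictionary/custom/my_dict.txt"};
boolean reloaded = CustomDictionaryUtility.reload();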
 
Example #12
Source File: HanLPTokenizerFactory.java    From elasticsearch-analysis-hanlp with Apache License 2.0
/**
 * Initialize the factory.
 *
 * @param args map carrying the settings from the XML configuration
 */
public HanLPTokenizerFactory(Map<String, String> args) {
    super(args);
    enableIndexMode = getBoolean(args, "enableIndexMode", true);
    enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
    enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
    enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
    enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
    enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
    enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
    enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
    enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
    enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
    HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
    Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
    if (customDictionaryPathSet != null) {
        HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]);
    }
    String stopWordDictionaryPath = get(args, "stopWordDictionaryPath");
    if (stopWordDictionaryPath != null) {
        stopWordDictionary = new TreeSet<>();
        stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
    }
    if (getBoolean(args, "enableDebug", false)) {
        HanLP.Config.enableDebug();
    }
}
 
Example #13
Source File: Tokenizer.java    From similarity with Apache License 2.0
public static void fileSegment(Segment segment, String inputFilePath, String outputFilePath) {
    try {
        WordFreqStatistics.statistics(segment, inputFilePath);
        BufferedReader reader = IOUtil.newBufferedReader(inputFilePath);
        long allCount = 0;
        long lexCount = 0;
        long start = System.currentTimeMillis();
        String outPath = inputFilePath.replace(".txt", "") + "-Segment-Result.txt";
        if (outputFilePath != null && outputFilePath.trim().length() > 0) outPath = outputFilePath;
        FileOutputStream fos = new FileOutputStream(new File(outPath));
        String temp;
        while ((temp = reader.readLine()) != null) {
            List<Term> parse = segment.seg(temp);
            StringBuilder sb = new StringBuilder();
            for (Term term : parse) {
                sb.append(term).append('\t');
                if (term.word.trim().length() > 0) {
                    allCount += term.length();
                    lexCount += 1;
                }
            }
            fos.write(sb.toString().trim().getBytes());
            fos.write("\n".getBytes());
        }

        fos.flush();
        fos.close();
        reader.close();
        long end = System.currentTimeMillis();
        System.out.println("segment result save:" + outPath);
        System.out.println("total " + allCount + " chars, " + lexCount + " words, spend" + (end - start) + "ms ");
    } catch (IOException e) {
        logger.error("IO error: " + e.getLocalizedMessage());
    }
}
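
A minimal call sketch (input path hypothetical); passing null or a blank outputFilePath falls back to the derived "-Segment-Result.txt" name:

Tokenizer.fileSegment(HanLP.newSegment(), "data/article.txt", null);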
 
Example #14
Source File: HanLPTokenizerFactory.java    From hanlp-lucene-plugin with Apache License 2.0
/**
 * Initialize the factory.
 *
 * @param args map carrying the settings from the XML configuration
 */
public HanLPTokenizerFactory(Map<String, String> args)
{
    super(args);
    enableIndexMode = getBoolean(args, "enableIndexMode", true);
    enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
    enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
    enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
    enableCustomDictionaryForcing = getBoolean(args, "enableCustomDictionaryForcing", false);
    enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
    enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
    enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
    enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
    enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
    enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
    HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
    algorithm = getString(args, "algorithm", "viterbi");
    Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
    if (customDictionaryPathSet != null)
    {
        HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]);
    }
    String stopWordDictionaryPath = get(args, "stopWordDictionaryPath");
    if (stopWordDictionaryPath != null)
    {
        stopWordDictionary = new TreeSet<>();
        stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
    }
    if (getBoolean(args, "enableDebug", false))
    {
        HanLP.Config.enableDebug();
    }
}
 
Example #15
Source File: HDFSDataSet.java    From dk-fitting with Apache License 2.0
@Override
public IDataSet load(String folderPath, String charsetName, double rate) throws IllegalArgumentException, IOException
{
    if (folderPath == null) throw new IllegalArgumentException("argument folderPath == null");
    Path root = new Path(folderPath);
    if (!fileSystem.exists(root)) throw new IllegalArgumentException(String.format("directory %s does not exist", root.getName()));
    if (!fileSystem.isDirectory(root))
        throw new IllegalArgumentException(String.format("%s is not a directory", root.getName()));
    if (rate > 1.0 || rate < -1.0) throw new IllegalArgumentException("the absolute value of rate must be within [0, 1]");

    FileStatus[] folders = fileSystem.listStatus(root);
    if (folders == null) return null;
    logger.start("mode: %s\nencoding: %s\nroot: %s\nloading...\n", isTestingDataSet() ? "test set" : "training set", charsetName, folderPath);
    for (FileStatus fs : folders)
    {
        if (fs.isFile()) continue;
        Path folder = fs.getPath();
        FileStatus[] files = fileSystem.listStatus(folder);
        if (files == null) continue;
        String category = folder.getName();
        logger.out("[%s]...", category);
        int b, e;
        if (rate > 0)
        {
            // positive rate: take the first rate fraction of each category
            b = 0;
            e = (int) (files.length * rate);
        }
        else
        {
            // negative rate: take the last |rate| fraction of each category
            b = (int) (files.length * (1 + rate));
            e = files.length;
        }

        for (int i = b; i < e; i++)
        {
            String path = files[i].getPath().getName();
            if (!path.startsWith(folderPath))
            {
                path = folderPath + "/" + folder.getName() + "/" + path;
            }
            add(folder.getName(), IOUtil.readTxt(path, charsetName));
            if (i % 100 == 0)
                logger.out("%.2f%%...", MathUtility.percentage(i - b, e - b));
        }
        logger.out(" %d 篇文档\n", e - b);
    }
    logger.finish(" loaded %d categories and %d documents in total\n", getCatalog().size(), size());
    return this;
}
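
The sign of rate picks which end of each category's file list is loaded, which makes cheap train/test splits possible; a sketch, assuming an HDFS corpus at a hypothetical path:

IDataSet trainingSet = new HDFSDataSet(DKNLPBase.configuration).load("/corpus", "UTF-8", 0.9);   // first 90% of each category
IDataSet testingSet = new HDFSDataSet(DKNLPBase.configuration).load("/corpus", "UTF-8", -0.1);   // last 10% of each category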
 
Example #16
Source File: CustomDictionaryUtility.java    From elasticsearch-analysis-hanlp with Apache License 2.0
/**
 * Load a user dictionary (appending to the given map).
 *
 * @param path                  dictionary path
 * @param defaultNature         default part of speech for bare words
 * @param map                   receives the parsed word/attribute entries
 * @param customNatureCollector collects user-defined parts of speech
 * @return whether loading succeeded
 */
private static boolean load(String path, Nature defaultNature, TreeMap<String, CoreDictionary.Attribute> map, LinkedHashSet<Nature> customNatureCollector) {
    try {
        String splitter = "\\s";
        if (path.endsWith(".csv")) {
            splitter = ",";
        }
        BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
        String line;
        boolean firstLine = true;
        while ((line = br.readLine()) != null) {
            if (firstLine) {
                line = IOUtil.removeUTF8BOM(line);
                firstLine = false;
            }
            String[] param = line.split(splitter);
            // skip blank lines
            if (param[0].length() == 0) {
                continue;
            }
            // normalize via the character table if enabled
            if (HanLP.Config.Normalization) {
                param[0] = CharTable.convert(param[0]);
            }
            int natureCount = (param.length - 1) / 2;
            CoreDictionary.Attribute attribute;
            if (natureCount == 0) {
                attribute = new CoreDictionary.Attribute(defaultNature);
            } else {
                attribute = new CoreDictionary.Attribute(natureCount);
                for (int i = 0; i < natureCount; ++i) {
                    attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i], customNatureCollector);
                    attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
                    attribute.totalFrequency += attribute.frequency[i];
                }
            }
            map.put(param[0], attribute);
        }
        br.close();
    } catch (Exception e) {
        logger.error("hanlp custom dictionary [{}] read failed!", path, e);
        return false;
    }
    return true;
}
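
Implied by the parsing logic above, each line is a word optionally followed by nature/frequency pairs, whitespace-separated (comma-separated for .csv files). A hypothetical dictionary file accepted by this loader:

攻城狮 nz 6
单身狗 nz 5 n 2
不入虎穴焉得虎子

The first two lines carry explicit nature/frequency pairs; the bare last line is stored with defaultNature.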