org.nlpcn.commons.lang.util.StringUtil Java Examples

The following examples show how to use org.nlpcn.commons.lang.util.StringUtil. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PinyinFormatter.java    From nlp-lang with Apache License 2.0 6 votes vote down vote up
/**
 * Renders a pinyin string in the requested output format.
 *
 * @param pinyinStr pinyin text; returned unchanged when {@code StringUtil.isBlank} reports it blank
 * @param type      target format
 * @return the tone-mark form (via {@code convertToneNumber2ToneMark}), the text with the digits
 *         1-5 removed, the first character only, or {@code pinyinStr} itself for the default
 *         format and any format not handled here
 */
public static String formatPinyin(String pinyinStr, TYPE type) {
	if (StringUtil.isBlank(pinyinStr)) {
		return pinyinStr;
	}

	switch (type) {
		case UNICODE_PINYIN_FORMAT:
			return convertToneNumber2ToneMark(pinyinStr);
		case WITHOUT_NUM_PINYIN_FORMAT:
			return pinyinStr.replaceAll("[1-5]", "");
		case DEFAULT_PINYIN_FORMAT:
			return pinyinStr;
		case FIRST_CHAR_PINYIN_FORMAT:
			return String.valueOf(pinyinStr.charAt(0));
	}

	// Any format without a dedicated branch leaves the input untouched.
	return pinyinStr;
}
 
Example #2
Source File: MemoryIndex.java    From nlp-lang with Apache License 2.0 6 votes vote down vote up
/**
 * Collects every non-empty prefix of each usable field.
 *
 * Fields that {@code StringUtil.isBlank} reports as blank are skipped; the others are
 * trimmed before their prefixes are generated.
 *
 * @param fields field values to expand
 * @return the set of all prefixes (length 1 up to the full trimmed field)
 */
private Set<String> getPrexSplit(final String[] fields) {
	Set<String> prefixes = new HashSet<String>();
	for (String field : fields) {
		if (!StringUtil.isBlank(field)) {
			final String trimmed = field.trim();
			for (int end = 1; end <= trimmed.length(); end++) {
				prefixes.add(trimmed.substring(0, end));
			}
		}
	}
	return prefixes;
}
 
Example #3
Source File: MemoryIndex.java    From nlp-lang with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the values indexed under the given key.
 *
 * @param key lookup key
 * @return an empty list when the key is blank or unknown, otherwise the values of the
 *         matching entries in the iteration order of their {@code TreeSet}
 */
public List<T> suggest(String key) {
	if (StringUtil.isBlank(key)) {
		return Collections.emptyList();
	}

	// NOTE(review): String.replace matches its first argument literally, so this removes the
	// two-character sequence backslash + 's', not whitespace. It looks like a regex was
	// intended, but index keys may themselves contain spaces, so confirm the intent before
	// switching to replaceAll.
	final String lookupKey = key.replace("\\s", "");

	final List<T> hits = new LinkedList<T>();
	final TreeSet<Entry> entries = index.get(lookupKey);
	if (entries != null) {
		for (Entry entry : entries) {
			hits.add(entry.t);
		}
	}
	return hits;
}
 
Example #4
Source File: DoubleArrayTire.java    From nlp-lang with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the item stored for a word.
 *
 * The first character indexes {@code dat} directly; every following character is reached
 * through the previous item's {@code base} offset and validated against its {@code index}.
 *
 * @param str the word to look up
 * @return the item at the end of the walk, or {@code null} when the word is blank, a slot is
 *         empty, the next slot lies past the end of {@code dat}, or a check value mismatches
 */
@SuppressWarnings("unchecked")
public <T extends Item> T getItem(String str) {
	if (StringUtil.isBlank(str)) {
		return null;
	}

	Item current = dat[str.charAt(0)];
	final int length = str.length();
	for (int pos = 1; pos < length; pos++) {
		if (current == null) {
			return null;
		}
		final int expectedCheck = current.index;
		final int nextSlot = current.base + str.charAt(pos);
		if (nextSlot >= dat.length) {
			return null;
		}
		current = dat[nextSlot];
		// A check of -1 is accepted unconditionally; anything else must point back at the parent.
		if (current != null && current.check != -1 && current.check != expectedCheck) {
			return null;
		}
	}
	return (T) current;
}
 
Example #5
Source File: Term.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Merges another term into this one.
 *
 * The other term's name is appended to this term's name. The real names are concatenated
 * only when both are non-blank. This term's {@code to} link is then replaced with the other
 * term's.
 *
 * @param to the term to absorb into this one
 * @return this term, after the merge
 */
public Term merage(Term to) {
    this.name += to.getName();

    final boolean bothHaveRealName =
                    StringUtil.isNotBlank(this.realName) && StringUtil.isNotBlank(to.getRealName());
    if (bothHaveRealName) {
        this.realName += to.getRealName();
    }

    this.setTo(to.to);
    return this;
}
 
Example #6
Source File: KeyWordComputer.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Computes keywords for an article given as separate title and body.
 *
 * Blank parts are replaced by the empty string, the two parts are joined with a tab, and the
 * work is delegated to the two-argument overload together with the title length.
 *
 * @param title   article title
 * @param content article body
 * @return the keywords produced by the delegate overload
 */
public List<Keyword> computeArticleTfidf(String title, String content) {
    final String safeTitle = StringUtil.isBlank(title) ? "" : title;
    final String safeContent = StringUtil.isBlank(content) ? "" : content;
    return computeArticleTfidf(safeTitle + "\t" + safeContent, safeTitle.length());
}
 
Example #7
Source File: SplitWord.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Splits a line into words according to the tag sequence returned by {@code vterbi}.
 *
 * An element tagged 0 becomes a word on its own. An element tagged 1 opens a word that runs
 * up to and including the next element tagged 3 (presumably single / begin / end tags —
 * confirm against the tagger). Elements with any other tag outside such a run are skipped.
 *
 * @param line text to segment
 * @return the words in order; an empty list when the line is blank
 */
public List<String> cut(String line) {

        if (StringUtil.isBlank(line)) {
            return Collections.emptyList();
        }

        List<Element> elements = vterbi(line);

        List<String> result = new ArrayList<>();

        int begin = 0;
        int end = 0;
        int last = elements.size() - 1;
        for (int i = 0; i < elements.size(); i++) {
            Element e = elements.get(i);
            switch (e.getTag()) {
                case 0:
                    end += e.len;
                    result.add(line.substring(begin, end));
                    begin = end;
                    break;
                case 1:
                    end += e.len;
                    // Consume elements until the closing tag. Each consumed element's length is
                    // counted exactly once, so a run that reaches the end of the list without a
                    // tag-3 element no longer counts the last element twice and overruns the line.
                    while (i < last) {
                        e = elements.get(++i);
                        end += e.len;
                        if (e.getTag() == 3) {
                            break;
                        }
                    }
                    result.add(line.substring(begin, end));
                    begin = end;
                    break;
                default:
                    break;
            }
        }
        return result;
    }
 
Example #8
Source File: CRFppTxtModel.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the tag conversion table from the model text.
 *
 * Reads {@code Config.TAG_NUM} non-blank lines; the first character of each ('S', 'B', 'M'
 * or 'E') selects the matching {@code Config} constant for that slot. The remaining
 * {@code TAG_NUM * TAG_NUM} slots are derived from those first entries.
 *
 * @param br reader positioned at the tag list
 * @return the conversion table, of length {@code TAG_NUM + TAG_NUM * TAG_NUM}
 * @throws Exception if the input ends before all tags are read, a line starts with an
 *                   unknown tag character, or reading fails
 */
private int[] loadTagCoven(BufferedReader br) throws Exception {

    int[] conver = new int[Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM];

    // TODO: this is a hard-coded procedure; if the tag set changes it has to be rewritten
    int i = 0;
    while (i < Config.TAG_NUM) {
        String line = br.readLine();
        if (line == null) {
            // End of input: without this check the blank-line skip below would spin forever.
            throw new Exception("unexpected end of model: read " + i + " of " + Config.TAG_NUM + " tag lines");
        }
        if (StringUtil.isBlank(line)) {
            continue;
        }

        char c = line.charAt(0);
        switch (c) {
            case 'S':
                conver[i] = Config.S;
                break;
            case 'B':
                conver[i] = Config.B;
                break;
            case 'M':
                conver[i] = Config.M;
                break;
            case 'E':
                conver[i] = Config.E;
                break;
            default:
                throw new Exception("err tag named " + c + " in model");
        }
        i++;
    }

    // NOTE(review): the literal 4 below presumably equals Config.TAG_NUM — confirm before
    // changing the tag set.
    for (int j = Config.TAG_NUM; j < conver.length; j++) {
        conver[j] = conver[(j - 4) / Config.TAG_NUM] * Config.TAG_NUM + conver[j % Config.TAG_NUM] + Config.TAG_NUM;
    }

    return conver;
}
 
Example #9
Source File: Analysis.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the next term; call it in a loop until it returns {@code null}, which marks the end
 * of segmentation.
 *
 * Terms still queued are served first. Otherwise the next non-blank line is read, passed to
 * {@code fullTerms}, and the first resulting term is returned.
 *
 * @return the next term, or {@code null} when the input is exhausted or the line read
 *         produced no terms
 * @throws IOException if reading from the underlying reader fails
 */

public Term next() throws IOException {
    if (terms.isEmpty()) {
        String line = br.readLine();
        // NOTE(review): the offset is captured after the first read only; it is not refreshed
        // while blank lines are skipped below — confirm that is intended.
        offe = br.getStart();
        while (StringUtil.isBlank(line)) {
            if (line == null) {
                return null;
            }
            line = br.readLine();
        }

        // Segment the line (including ambiguity handling) into the term queue.
        fullTerms(line);

        if (terms.isEmpty()) {
            return null;
        }
    }

    Term term = terms.poll();
    term.updateOffe(offe);
    return term;
}
 
Example #10
Source File: SmartGetWordTest.java    From nlp-lang with Apache License 2.0 5 votes vote down vote up
@Test
public void test() {
	final long begin = System.currentTimeMillis();

	// Build the dictionary: one word per entry followed by its parameter. The entries could
	// equally be loaded from a file or a reader.
	SmartForest<Integer> forest = new SmartForest<Integer>();
	forest.add("中国", 3);
	forest.add("android", 3);
	forest.add("java", 3);
	forest.add("中国人", 3);

	// Drop one word again before scanning.
	forest.remove("中国人");

	final String content = StringUtil.rmHtmlTag(" Android-java-中国人");

	// Print every word reported by getFrontWords together with its parameter.
	SmartGetWord<Integer> udg = forest.getWord(content.toLowerCase().toCharArray());
	for (String word = udg.getFrontWords(); word != null; word = udg.getFrontWords()) {
		System.out.println(word + "\t" + udg.getParam());
	}

	System.out.println(System.currentTimeMillis() - begin);
}
 
Example #11
Source File: AllWordTest.java    From nlp-lang with Apache License 2.0 5 votes vote down vote up
/**
 * Demo: builds a small dictionary and prints every word reported by {@code getAllWords} for a
 * sample string, followed by the elapsed time in milliseconds.
 */
public static void main(String[] args) {
	final long begin = System.currentTimeMillis();

	// Build the dictionary: one word per entry followed by its parameter. The entries could
	// equally be loaded from a file or a reader.
	SmartForest<Integer> forest = new SmartForest<Integer>();
	forest.add("中国", 3);
	forest.add("android", 3);
	forest.add("java", 3);
	forest.add("jav", 3);
	forest.add("中国人", 3);
	forest.add("国人", 3);
	forest.add("0", 3);
	forest.add("3", 3);

	final String content = StringUtil.rmHtmlTag(" Android-java-中国人00000000000000 1230 013 33333");

	SmartGetWord<Integer> udg = forest.getWord(content.toLowerCase().toCharArray());
	for (String word = udg.getAllWords(); word != null; word = udg.getAllWords()) {
		System.out.println(word + "\t" + udg.getParam());
	}

	System.out.println(System.currentTimeMillis() - begin);
}
 
Example #12
Source File: GetWordTest.java    From nlp-lang with Apache License 2.0 5 votes vote down vote up
/**
 * Demo: builds a forest from an in-memory dictionary, removes and inserts a word, then prints
 * each word reported by {@code getFrontWords} with its first two parameters and its offset,
 * followed by the elapsed time in milliseconds.
 */
public static void main(String[] args) throws Exception {
	final long begin = System.currentTimeMillis();

	// Build the dictionary: one word per line followed by its parameters. The text could
	// equally come from a file or any other reader.
	final String dic = "android\t10\t孙健\nc\t100\nC++\t10\nc++\t5\nc#\t100\nVC++\t100".toLowerCase();
	System.out.println(dic);
	Forest forest = Library.makeForest(new BufferedReader(new StringReader(dic)));

	// Remove a word.
	Library.removeWord(forest, "中国");
	// Add a new word.
	Library.insertWord(forest, "中国人");

	final String content = StringUtil.rmHtmlTag("Android--中国人");

	GetWord udg = forest.getWord(content.toLowerCase().toCharArray());
	for (String word = udg.getFrontWords(); word != null; word = udg.getFrontWords()) {
		System.out.println(word + "\t\t" + udg.getParam()[0] + "\t\t" + udg.getParam()[1]);
		System.out.println(udg.offe);
	}

	System.out.println(System.currentTimeMillis() - begin);
}
 
Example #13
Source File: MemoryIndex.java    From nlp-lang with Apache License 2.0 5 votes vote down vote up
/**
 * Collects every non-empty contiguous substring of each usable field.
 *
 * Fields that {@code StringUtil.isBlank} reports as blank are skipped; the others are
 * trimmed before their substrings are generated.
 *
 * @param fields field values to expand
 * @return the set of all substrings of the trimmed fields
 */
private Set<String> getAllSplit(final String[] fields) {
	Set<String> pieces = new HashSet<String>();
	for (String field : fields) {
		if (!StringUtil.isBlank(field)) {
			final String text = field.trim();
			final int length = text.length();
			for (int from = 0; from < length; from++) {
				for (int to = from + 1; to <= length; to++) {
					pieces.add(text.substring(from, to));
				}
			}
		}
	}
	return pieces;
}
 
Example #14
Source File: MemoryIndex.java    From nlp-lang with Apache License 2.0 5 votes vote down vote up
/**
 * Registers a value in the suggestion index.
 *
 * The fields are expanded into lookup keys according to the configured {@code model}
 * (all substrings or prefixes only). The value is added to each key's sorted set, and when a
 * set grows beyond {@code size} its last entry is dropped.
 *
 * @param value  the content to return for matching keys; its {@code toString()} is used as
 *               the only field when no fields are given
 * @param score  score passed to {@code score(value, score)} to rank the entry
 * @param fields texts from which the lookup keys are generated
 */
public void addItem(T value, Double score, String... fields) {
	if (fields == null || fields.length == 0) {
		fields = new String[] { value.toString() };
	}

	Set<String> keys = null;
	switch (model) {
	case PREX:
		keys = getPrexSplit(fields);
		break;
	case ALL:
		keys = getAllSplit(fields);
		break;
	}

	for (String key : keys) {
		if (!StringUtil.isBlank(key)) {
			TreeSet<Entry> bucket = index.get(key);
			if (bucket == null) {
				bucket = new TreeSet<Entry>();
				index.put(key, bucket);
			}
			bucket.add(new Entry(value, score(value, score)));

			// Keep each bucket bounded by discarding its last entry.
			if (bucket.size() > this.size) {
				bucket.pollLast();
			}
		}
	}
}
 
Example #15
Source File: DATMaker.java    From nlp-lang with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a user-defined DAT from a dictionary file.
 *
 * Each non-blank line is split on tabs; a fresh {@code cla} instance is initialised with the
 * columns and stored in a forest under the first column. The forest is then converted via
 * {@code tree2List} and handed to {@code makeDAT}. Timing for both phases is logged.
 *
 * @param dicPath path of the dictionary file
 * @param cla     item class instantiated once per dictionary line
 * @throws FileNotFoundException  if no iterator could be opened for {@code dicPath}
 * @throws InstantiationException if {@code cla} cannot be instantiated
 * @throws IllegalAccessException if the constructor of {@code cla} is not accessible
 */
public void maker(final String dicPath, final Class<? extends Item> cla) throws FileNotFoundException, InstantiationException, IllegalAccessException {
	long begin = System.currentTimeMillis();
	LOG.info("make basic tire begin !");

	final SmartForest<Item> forest = new SmartForest<Item>();
	final FileIterator lines = IOUtil.instanceFileIterator(dicPath, IOUtil.UTF8);
	if (lines == null) {
		throw new FileNotFoundException();
	}
	try {
		while (lines.hasNext()) {
			final String line = lines.next();
			if (!StringUtil.isBlank(line)) {
				final Item item = cla.newInstance();
				final String[] columns = line.split("\t");
				item.init(columns);
				forest.add(columns[0], item);
			}
		}
	} finally {
		lines.close();
	}
	LOG.info("make basic tire over use time " + (System.currentTimeMillis() - begin) + " ms");

	begin = System.currentTimeMillis();
	LOG.info("make dat tire begin !");
	makeDAT(tree2List(cla, forest));
	LOG.info("make dat tire over use time " + (System.currentTimeMillis() - begin) + " ms! dat len is " + datArrLen() + "! dat size is " + datItemSize());

}
 
Example #16
Source File: CRFppTxtModel.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
/**
 * Loads the feature names, e.g. a line shaped like {@code 11:*6:_x-1/的,}.
 *
 * Lines are read until the first blank line or end of input. The text before the first space
 * is the feature id; the rest is split on ':' into a template key and a name, which is
 * converted with {@code toFeatureName}.
 *
 * @param featureIndex maps a template key to its template index
 * @param br           reader positioned at the feature-name section
 * @return feature id mapped to (template key, feature name); the name is empty for entries
 *         without a ':' separator
 * @throws Exception if reading or parsing fails
 */

private TreeMap<Integer, Pair<String, String>> loadFeatureName(Map<String, Integer> featureIndex, BufferedReader br)
                throws Exception {

    TreeMap<Integer, Pair<String, String>> featureNames = new TreeMap<>();

    for (String line = br.readLine(); StringUtil.isNotBlank(line); line = br.readLine()) {

        int spaceAt = line.indexOf(" ");

        // NOTE(review): spaceAt is used here before it is checked below; a line without a
        // space makes substring throw — confirm whether such lines can occur.
        int id = ObjConver.getIntValue(line.substring(0, spaceAt));

        String rest = spaceAt > 0 ? line.substring(spaceAt) : line;

        String[] parts = rest.split(":");

        if (parts.length == 1) {
            featureNames.put(id, Pair.with(rest.trim(), ""));
            continue;
        }

        // Re-join everything after the first ':' so names that contain ':' stay intact.
        StringBuilder joined = new StringBuilder(parts[1]);
        for (int j = 2; j < parts.length; j++) {
            joined.append(":").append(parts[j]);
        }
        String name = joined.toString();

        int lastFeatureId = featureIndex.get(parts[0].trim());

        // A bare or doubled slash stands for a literal '/' token; protect it with a
        // placeholder before splitting on '/'.
        if ("/".equals(name)) {
            name = "//";
        }
        if (name.contains("//")) {
            name = name.replaceAll("//", "/XIEGANG/");
        }

        String featureName = toFeatureName(name.trim().split("/"), lastFeatureId);

        featureNames.put(id, Pair.with(parts[0].trim(), featureName));
    }

    return featureNames;

}
 
Example #17
Source File: WapitiCRFModel.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
/**
 * Loads the feature names, e.g. a line shaped like {@code 11:*6:_x-1/的,}.
 *
 * The first line carries the number of features; that many lines are then read and split on
 * ':'. The second column is the template key and the remainder is the name, which is
 * converted with {@code toFeatureName}.
 *
 * @param featureIndex maps a template key to its template index
 * @param br           reader positioned at the feature-name section
 * @return one (template key, feature name) pair per line; the name is empty for lines with
 *         exactly two columns
 * @throws Exception if reading or parsing fails
 */

private List<Pair<String, String>> loadFeatureName(Map<String, Integer> featureIndex, BufferedReader br)
                throws Exception {
    String header = br.readLine();// #qrk#num
    int featureNum = ObjConver.getIntValue(StringUtil.matcherFirst("\\d+", header)); // number of features

    List<Pair<String, String>> featureNames = new ArrayList<>();

    for (int i = 0; i < featureNum; i++) {
        String[] parts = br.readLine().split(":");

        if (parts.length == 2) {
            featureNames.add(Pair.with(parts[1], ""));
            continue;
        }

        // Re-join everything after the second ':' so names that contain ':' stay intact.
        StringBuilder joined = new StringBuilder(parts[2]);
        for (int j = 3; j < parts.length; j++) {
            joined.append(":").append(parts[j]);
        }

        // Drop the final character (the original comment says it is a trailing space).
        String name = joined.substring(0, joined.length() - 1);

        int lastFeatureId = featureIndex.get(parts[1]);

        // A bare or doubled slash stands for a literal '/' token; protect it with a
        // placeholder before splitting on '/'.
        if ("/".equals(name)) {
            name = "//";
        }
        if (name.contains("//")) {
            name = name.replaceAll("//", "/XIEGANG/");
        }

        String featureName = toFeatureName(name.trim().split("/"), lastFeatureId);

        featureNames.add(Pair.with(parts[1], featureName));
    }

    return featureNames;

}
 
Example #18
Source File: FingerprintService.java    From nlp-lang with Apache License 2.0 4 votes vote down vote up
/**
 * Computes a fingerprint for an article from its body text.
 *
 * After {@code StringUtil.rmHtmlTag} is applied, every dictionary word found is scored by how
 * close it lies to the middle of the text. The leading entries of the sorted fingerprint set
 * (between one and five, depending on how many distinct words were found) are collected by
 * name and hashed with {@code MD5.code}.
 *
 * @param content article body
 * @return the value returned by {@code MD5.code} for the selected word names
 */
public String fingerprint(String content) {

    content = StringUtil.rmHtmlTag(content);

    GetWord word = new GetWord(forest, content);

    final int middleLength = content.length() / 2;

    Map<String, MyFingerprint> byWord = new HashMap<String, MyFingerprint>();

    for (String token = word.getFrontWords(); token != null; token = word.getFrontWords()) {
        if (token.length() == 0) {
            continue;
        }
        final String lowered = token.toLowerCase();
        // Weight is 1 at the middle of the text and falls off linearly towards both ends.
        final double factory = 1 - (Math.abs(middleLength - word.offe) / (double) middleLength);

        final MyFingerprint existing = byWord.get(lowered);
        if (existing == null) {
            byWord.put(lowered, new MyFingerprint(lowered, Double.parseDouble(word.getParam(1)), factory));
        } else {
            existing.updateScore(factory);
        }
    }

    Set<MyFingerprint> ranked = new TreeSet<MyFingerprint>(byWord.values());

    final int limit = Math.min(ranked.size() / 10, 4) + 1;

    Set<String> names = new TreeSet<String>();
    Iterator<MyFingerprint> it = ranked.iterator();
    for (int taken = 0; taken < limit && it.hasNext(); taken++) {
        names.add(it.next().getName() + " ");
    }

    return MD5.code(names.toString());
}
 
Example #19
Source File: Result.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
/**
 * Renders this result's terms as a single string by delegating to {@code StringUtil.joiner}.
 *
 * @param split passed through to the joiner together with the terms; presumably the
 *              separator placed between them — confirm against {@code StringUtil.joiner}
 * @return the string produced by the joiner
 */
public String toString(String split) {
    return StringUtil.joiner(this.terms, split);
}
 
Example #20
Source File: StopLibrary.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
/**
 * Loads a user-defined stop library.
 *
 * An already loaded instance is returned as is unless {@code reload} is set, in which case it
 * is cleared and filled again. Each non-blank line is trimmed and split on tabs: the first
 * column is the entry, and an optional second column of {@code nature} or {@code regex}
 * selects the kind of stop rule (anything else is treated as a stop word).
 *
 * @param key    key of the library; removed from {@code STOP} when loading fails
 * @param kv     holder whose key is the resource path and whose value is the loaded instance
 * @param reload whether to rebuild an already loaded instance
 * @return the loaded instance, or {@code null} when loading failed
 */
private synchronized static StopRecognition init(String key, KV<String, StopRecognition> kv, boolean reload) {
    StopRecognition stopRecognition = kv.getV();

    if (stopRecognition == null) {
        stopRecognition = new StopRecognition();
    } else if (!reload) {
        return stopRecognition;
    } else {
        stopRecognition.clear();
    }

    try {
        LOG.debug("begin init FILTER !");
        final long begin = System.currentTimeMillis();
        try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "UTF-8")) {
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                if (!StringUtil.isNotBlank(line)) {
                    continue;
                }
                final String[] strs = StringUtil.trim(line).split("\t");

                if (strs.length == 1) {
                    stopRecognition.insertStopWords(strs[0]);
                    continue;
                }

                final String kind = strs[1];
                if ("nature".equals(kind)) {
                    stopRecognition.insertStopNatures(strs[0]);
                } else if ("regex".equals(kind)) {
                    stopRecognition.insertStopRegexes(strs[0]);
                } else {
                    stopRecognition.insertStopWords(strs[0]);
                }
            }
        }
        LOG.info("load stop use time:" + (System.currentTimeMillis() - begin) + " path is : " + kv.getK());
        kv.setV(stopRecognition);
        return stopRecognition;
    } catch (Exception e) {
        LOG.error("Init Stop library error :" + e.getMessage() + ", path: " + kv.getK());
        STOP.remove(key);
        return null;
    }
}
 
Example #21
Source File: SynonymsLibrary.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
/**
 * Loads a synonyms dictionary.
 *
 * An already loaded forest is returned as is unless {@code reload} is set, in which case it
 * is cleared and filled again. Each non-blank line is split on tabs; lines with a single
 * column are logged and skipped. Every column of a line is registered in the forest and
 * mapped to the shared list of that line's non-blank words.
 *
 * @param key    key of the library; removed from {@code SYNONYMS} when loading fails
 * @param kv     holder whose key is the resource path and whose value is the loaded forest
 * @param reload whether to rebuild an already loaded dictionary
 * @return the loaded forest, or {@code null} when loading failed
 */
private static synchronized SmartForest<List<String>> init(String key, KV<String, SmartForest<List<String>>> kv,
                boolean reload) {

    SmartForest<List<String>> forest = kv.getV();

    if (forest == null) {
        forest = new SmartForest<>();
    } else if (!reload) {
        return forest;
    } else {
        forest.clear();
    }

    LOG.debug("begin init synonyms " + kv.getK());
    final long begin = System.currentTimeMillis();

    try (BufferedReader reader = IOUtil.getReader(PathToStream.stream(kv.getK()), IOUtil.UTF8)) {
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            if (StringUtil.isBlank(line)) {
                continue;
            }
            final String[] words = line.split("\t");

            if (words.length <= 1) {
                LOG.warn(line + " in synonymsLibrary not in to library !");
                continue;
            }

            final List<String> synonyms = new ArrayList<>();
            for (String word : words) {
                if (!StringUtil.isBlank(word)) {
                    synonyms.add(word);
                }
            }

            // NOTE(review): blank columns are left out of the synonym list but are still
            // registered as forest keys here — confirm whether that is intended.
            for (String word : words) {
                forest.add(word, synonyms);
            }
        }
        kv.setV(forest);
        LOG.info("load synonyms use time:" + (System.currentTimeMillis() - begin) + " path is : " + kv.getK());
        return forest;
    } catch (Exception e) {
        LOG.error("Init synonyms library error :" + e.getMessage() + ", path: " + kv.getK());
        SYNONYMS.remove(key);
        return null;
    }
}
 
Example #22
Source File: CRFppTxtModel.java    From deeplearning4j with Apache License 2.0 3 votes vote down vote up
/**
 * Loads the feature templates and stores them in {@code config}.
 *
 * The first line is read and discarded. Lines are then processed until the first blank line
 * or end of input; lines without any bracketed group are skipped. For each remaining line the
 * integer inside every {@code [...]} group is collected into one template, and the text
 * before the first ':' (trimmed) is mapped to that template's index.
 *
 * @param br reader positioned at the template section
 * @return template key mapped to its index in the template array
 * @throws IOException if reading fails
 */
private Map<String, Integer> loadConfig(BufferedReader br) throws IOException {

        Map<String, Integer> featureIndex = new HashMap<>();
        List<int[]> templates = new ArrayList<>();

        br.readLine();// #rdr#8/0/0

        for (String line = br.readLine(); StringUtil.isNotBlank(line); line = br.readLine()) {

            List<String> groups = StringUtil.matcherAll("\\[.*?\\]", line);
            if (groups.isEmpty()) {
                continue;
            }

            int[] offsets = new int[groups.size()];
            for (int j = 0; j < offsets.length; j++) {
                offsets[j] = ObjConver.getIntValue(StringUtil.matcherFirst("[-\\d]+", groups.get(j)));
            }

            featureIndex.put(line.split(":")[0].trim(), templates.size());
            templates.add(offsets);
        }

        // Build the feature template array.
        config = new Config(templates.toArray(new int[0][]));

        return featureIndex;
    }
 
Example #23
Source File: WapitiCRFModel.java    From deeplearning4j with Apache License 2.0 3 votes vote down vote up
/**
 * Loads the feature templates and stores them in {@code config}.
 *
 * The first line carries the number of template lines. That many lines are then read; lines
 * without any bracketed group are skipped. For each remaining line the integer inside every
 * {@code [...]} group is collected into one template, and the second ':'-separated column is
 * mapped to that template's index.
 *
 * @param br reader positioned at the template section
 * @return template key mapped to its index in the template array
 * @throws IOException if reading fails
 */
private Map<String, Integer> loadConfig(BufferedReader br) throws IOException {

    Map<String, Integer> featureIndex = new HashMap<>();
    List<int[]> templates = new ArrayList<>();

    String header = br.readLine();// #rdr#8/0/0
    int featureNum = ObjConver.getIntValue(StringUtil.matcherFirst("\\d+", header)); // number of features

    for (int i = 0; i < featureNum; i++) {
        String line = br.readLine();

        List<String> groups = StringUtil.matcherAll("\\[.*?\\]", line);
        if (groups.isEmpty()) {
            continue;
        }

        int[] offsets = new int[groups.size()];
        for (int j = 0; j < offsets.length; j++) {
            offsets[j] = ObjConver.getIntValue(StringUtil.matcherFirst("[-\\d]+", groups.get(j)));
        }

        featureIndex.put(line.split(":")[1], templates.size());
        templates.add(offsets);
    }

    // Build the feature template array.
    config = new Config(templates.toArray(new int[0][]));

    return featureIndex;
}