com.hankcs.hanlp.HanLP Java Examples
The following examples show how to use com.hankcs.hanlp.HanLP. Each example is taken from an open-source project; its source file and license are noted above the code.
Example #1
Source File: SignUpControll.java From rebuild with GNU General Public License v3.0

@RequestMapping("checkout-name")
public void checkoutName(HttpServletRequest request, HttpServletResponse response) throws IOException {
    String fullName = getParameterNotNull(request, "fullName");
    // Keep only ASCII letters, digits, and CJK characters
    fullName = fullName.replaceAll("[^a-zA-Z0-9\u4e00-\u9fa5]", "");
    // Derive a login name from the pinyin of the (possibly Chinese) full name
    String loginName = HanLP.convertToPinyinString(fullName, "", false);
    if (loginName.length() > 20) {
        loginName = loginName.substring(0, 20);
    }
    if (BlackList.isBlack(loginName)) {
        writeSuccess(response);
        return;
    }
    // Append random digits until the name is unique (at most 100 attempts)
    for (int i = 0; i < 100; i++) {
        if (Application.getUserStore().existsName(loginName)) {
            loginName += RandomUtils.nextInt(99);
        } else {
            break;
        }
    }
    loginName = loginName.toLowerCase();
    writeSuccess(response, loginName);
}
Example #2
Source File: HanLPTokenizerTest.java From hanlp-lucene-plugin with Apache License 2.0

public void testMultiText() throws Exception {
    String[] sentences = new String[]{
            "中华人民共和国",
            "地大物博"
    };
    tokenizer = new HanLPTokenizer(HanLP.newSegment()
            .enableJapaneseNameRecognize(true)
            .enableIndexMode(true), null, false);
    for (String sentence : sentences) {
        tokenizer.setReader(new StringReader(sentence));
        tokenizer.reset();
        testIncrementToken();
        tokenizer.close();
    }
}
Example #3
Source File: HanLPNLPAnalyzer.java From elasticsearch-analysis-hanlp with Apache License 2.0

@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    return new Analyzer.TokenStreamComponents(
            TokenizerBuilder.tokenizer(AccessController.doPrivileged((PrivilegedAction<Segment>) () -> {
                try {
                    return new PerceptronLexicalAnalyzer();
                } catch (IOException e) {
                    logger.error("cannot use nlp analyzer, falling back to default segmenter", e);
                    return HanLP.newSegment();
                }
            }), configuration));
}
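The try/recover pattern above also works outside Lucene, since PerceptronLexicalAnalyzer is itself a Segment. A minimal sketch (assumes the perceptron model files are available on the HanLP data path):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer;
import com.hankcs.hanlp.seg.Segment;

import java.io.IOException;

public class AnalyzerFallbackDemo {
    public static void main(String[] args) {
        Segment segment;
        try {
            // Loading the perceptron models can fail with IOException
            segment = new PerceptronLexicalAnalyzer();
        } catch (IOException e) {
            // Fall back to the default viterbi segmenter
            segment = HanLP.newSegment();
        }
        System.out.println(segment.seg("商品和服务"));
    }
}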
Example #4
Source File: Nlputil.java From dk-fitting with Apache License 2.0

/**
 * Pinyin conversion.
 *
 * @param txt the text to convert
 * @return the pinyin, comma-separated
 */
public static String convertToPinyinList(String txt) {
    if (txt == null) {
        return String.valueOf(Collections.emptyList());
    }
    List<Pinyin> pinyinList = HanLP.convertToPinyinList(txt);
    // Join in reading order with comma separators
    StringBuilder pinyinString = new StringBuilder();
    for (Pinyin pinyin : pinyinList) {
        if (pinyinString.length() > 0) {
            pinyinString.append(',');
        }
        pinyinString.append(pinyin);
    }
    return pinyinString.toString();
}
Example #5
Source File: TranslatedNameRecognition.java From danyuan-application with Apache License 2.0

/**
 * Method: TranslatedName
 * Purpose: recognize transliterated (foreign) person names
 *
 * @param str the sentences to scan
 * @return List<String> of recognized names
 * @author Tenghui.Wang
 */
public static List<String> TranslatedName(String[] str) {
    List<String> list = new ArrayList<String>();
    Segment segment = HanLP.newSegment().enableTranslatedNameRecognize(true);
    for (String sentence : str) {
        List<Term> termList = segment.seg(sentence);
        for (Term term : termList) {
            // "nrf" is the part-of-speech tag for transliterated person names
            if (term.toString().contains("nrf")) {
                list.add(term.word);
            }
        }
    }
    return list;
}
Example #6
Source File: DKNLPBase.java From dk-fitting with Apache License 2.0

/**
 * Pinyin conversion.
 *
 * @param txt the text to convert
 * @return the pinyin list
 */
public static List<Pinyin> convertToPinyinList(String txt) {
    if (txt == null) {
        return Collections.emptyList();
    }
    return HanLP.convertToPinyinList(txt);
}
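For reference, each Pinyin element carries the syllable plus its tone. A minimal standalone sketch (the accessor methods are from HanLP's pinyin API; printed values are illustrative):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.py.Pinyin;

import java.util.List;

public class PinyinListDemo {
    public static void main(String[] args) {
        List<Pinyin> pinyinList = HanLP.convertToPinyinList("计划");
        for (Pinyin pinyin : pinyinList) {
            // toString() includes the tone number, e.g. "ji4"
            System.out.println(pinyin + "\t" + pinyin.getPinyinWithoutTone() + "\t" + pinyin.getTone());
        }
    }
}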
Example #7
Source File: Field2Schema.java From rebuild with GNU General Public License v3.0

/**
 * Chinese -> pinyin (keeps letters and digits only).
 * Text that is all English letters and digits is returned directly; unsupported
 * characters fall back to random digits.
 *
 * @param text
 * @return
 */
protected String toPinyinName(final String text) {
    String identifier = text;
    if (text.length() < 4) {
        identifier = "rb" + text + RandomUtils.nextInt(10);
    }
    // All ASCII letters/digits: return directly
    if (identifier.matches("[a-zA-Z0-9]+")) {
        if (!CharSet.ASCII_ALPHA.contains(identifier.charAt(0))
                || BlackList.isBlack(identifier)
                || BlackList.isSQLKeyword(identifier)) {
            identifier = "rb" + identifier;
        }
        return identifier;
    }

    identifier = HanLP.convertToPinyinString(identifier, "", false);
    identifier = identifier.replaceAll("[^a-zA-Z0-9]", "");
    if (StringUtils.isBlank(identifier)) {
        identifier = String.valueOf(System.currentTimeMillis() / 1000);
    }

    char start = identifier.charAt(0);
    if (!CharSet.ASCII_ALPHA.contains(start)) {
        identifier = "rb" + identifier;
    }

    identifier = identifier.toLowerCase();
    if (identifier.length() > 42) {
        identifier = identifier.substring(0, 42);
    }

    if (!StringHelper.isIdentifier(identifier)) {
        throw new ModifiyMetadataException("无效名称 : " + text);
    }
    return identifier;
}
Example #8
Source File: JapaneseNameRecognition.java From danyuan-application with Apache License 2.0

/**
 * Method: JapaneseName
 * Purpose: recognize Japanese person names
 *
 * @param str the sentences to scan
 * @return List<String> of recognized names
 * @author Tenghui.Wang
 */
public static List<String> JapaneseName(String[] str) {
    List<String> list = new ArrayList<String>();
    Segment segment = HanLP.newSegment().enableJapaneseNameRecognize(true);
    for (String sentence : str) {
        List<Term> termList = segment.seg(sentence);
        for (Term term : termList) {
            // "nrj" is the part-of-speech tag for Japanese person names
            if (term.toString().contains("nrj")) {
                list.add(term.word);
            }
        }
    }
    return list;
}
Example #9
Source File: OrganizationRecognition.java From danyuan-application with Apache License 2.0

/**
 * Method: Organization
 * Purpose: recognize organization names
 *
 * @param str the sentences to scan
 * @return List<String> of recognized names
 * @author Tenghui.Wang
 */
public static List<String> Organization(String[] str) {
    List<String> list = new ArrayList<String>();
    Segment segment = HanLP.newSegment().enableOrganizationRecognize(true);
    for (String sentence : str) {
        List<Term> termList = segment.seg(sentence);
        for (Term term : termList) {
            // "nt" is the part-of-speech tag family for organization names
            if (term.toString().contains("nt")) {
                list.add(term.word);
            }
        }
    }
    return list;
}
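Examples #5, #8, and #9 all follow the same pattern: enable a single recognizer on a Segment, then filter terms by part-of-speech tag. A minimal standalone sketch (the sample sentence is illustrative):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

import java.util.List;

public class OrganizationDemo {
    public static void main(String[] args) {
        Segment segment = HanLP.newSegment().enableOrganizationRecognize(true);
        List<Term> termList = segment.seg("我在上海林原科技有限公司兼职工作");
        for (Term term : termList) {
            // Organization names carry the "nt" family of tags
            if (term.nature.toString().startsWith("nt")) {
                System.out.println(term.word);
            }
        }
    }
}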
Example #10
Source File: Nlputil.java From dk-fitting with Apache License 2.0

/**
 * Keyword extraction.
 *
 * @param txt    the text to extract keywords from
 * @param keySum the number of keywords to extract
 * @return the keywords, comma-separated
 */
public static String extractKeyword(String txt, int keySum) {
    if (txt == null || keySum <= 0) {
        return String.valueOf(Collections.emptyList());
    }
    List<String> keyList = HanLP.extractKeyword(txt, keySum);
    // Join in rank order with comma separators
    return String.join(",", keyList);
}
Example #11
Source File: Nlputil.java From dk-fitting with Apache License 2.0

/**
 * Phrase extraction.
 *
 * @param txt   the text
 * @param phSum how many phrases to extract
 * @return the phrases, comma-separated
 */
public static String extractPhrase(String txt, int phSum) {
    if (txt == null || phSum <= 0) {
        return String.valueOf(Collections.emptyList());
    }
    List<String> phraseList = HanLP.extractPhrase(txt, phSum);
    // Join in rank order with comma separators
    return String.join(",", phraseList);
}
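Both helpers wrap one-line HanLP calls. A minimal standalone sketch of the underlying API (sample text and counts are illustrative):

import com.hankcs.hanlp.HanLP;

import java.util.List;

public class ExtractDemo {
    public static void main(String[] args) {
        String text = "程序员是从事程序开发、维护的专业人员";
        // TextRank-based keyword extraction: top 3 keywords
        List<String> keywords = HanLP.extractKeyword(text, 3);
        System.out.println(keywords);
        // Phrase extraction: top 3 phrases
        List<String> phrases = HanLP.extractPhrase(text, 3);
        System.out.println(phrases);
    }
}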
Example #12
Source File: BaseAction.java From o2oa with GNU Affero General Public License v3.0

protected List<String> keys(String key) {
    List<String> os = new ArrayList<>();
    for (Term term : HanLP.segment(key)) {
        /* Keep the field from getting too long */
        if (StringUtils.length(term.word) < 31) {
            os.add(StringUtils.lowerCase(term.word));
        }
    }
    return os;
}
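For reference, HanLP.segment is the plain static entry point used here. A minimal standalone sketch:

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;

public class SegmentDemo {
    public static void main(String[] args) {
        for (Term term : HanLP.segment("你好，欢迎使用HanLP汉语处理包！")) {
            // Each Term carries the token text and its part-of-speech tag
            System.out.println(term.word + "\t" + term.nature);
        }
    }
}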
Example #13
Source File: ExtMonitor.java From elasticsearch-analysis-hanlp with Apache License 2.0

@Override
public void run() {
    List<DictionaryFile> originalDictionaryFileList = DictionaryFileCache.getCustomDictionaryFileList();
    logger.debug("hanlp original custom dictionary: {}", Arrays.toString(originalDictionaryFileList.toArray()));
    reloadProperty();
    List<DictionaryFile> currentDictionaryFileList = getCurrentDictionaryFileList(HanLP.Config.CustomDictionaryPath);
    logger.debug("hanlp current custom dictionary: {}", Arrays.toString(currentDictionaryFileList.toArray()));
    boolean isModified = false;
    for (DictionaryFile currentDictionaryFile : currentDictionaryFileList) {
        if (!originalDictionaryFileList.contains(currentDictionaryFile)) {
            isModified = true;
            break;
        }
    }
    if (isModified) {
        logger.info("reloading hanlp custom dictionary");
        try {
            AccessController.doPrivileged((PrivilegedAction) CustomDictionaryUtility::reload);
        } catch (Exception e) {
            logger.error("cannot reload hanlp custom dictionary", e);
        }
        DictionaryFileCache.setCustomDictionaryFileList(currentDictionaryFileList);
        DictionaryFileCache.writeCache();
        logger.info("finished reloading hanlp custom dictionary");
    } else {
        logger.info("hanlp custom dictionary not modified, no reload needed");
    }
}
Example #14
Source File: ExtMonitor.java From elasticsearch-analysis-hanlp with Apache License 2.0

private void reloadProperty() {
    Properties p = new Properties();
    try {
        ClassLoader loader = AccessController.doPrivileged(
                (PrivilegedAction<ClassLoader>) () -> Thread.currentThread().getContextClassLoader());
        if (loader == null) {
            loader = HanLP.Config.class.getClassLoader();
        }
        p.load(new InputStreamReader(
                Predefine.HANLP_PROPERTIES_PATH == null
                        ? loader.getResourceAsStream("hanlp.properties")
                        : new FileInputStream(Predefine.HANLP_PROPERTIES_PATH),
                "UTF-8"));
        String root = p.getProperty("root", "").replaceAll("\\\\", "/");
        if (root.length() > 0 && !root.endsWith("/")) {
            root += "/";
        }
        String[] pathArray = p.getProperty("CustomDictionaryPath",
                "data/dictionary/custom/CustomDictionary.txt").split(";");
        String prePath = root;
        for (int i = 0; i < pathArray.length; ++i) {
            if (pathArray[i].startsWith(" ")) {
                // A leading space means the entry is relative to the previous entry's directory
                pathArray[i] = prePath + pathArray[i].trim();
            } else {
                pathArray[i] = root + pathArray[i];
                int lastSlash = pathArray[i].lastIndexOf('/');
                if (lastSlash != -1) {
                    prePath = pathArray[i].substring(0, lastSlash + 1);
                }
            }
        }
        AccessController.doPrivileged((PrivilegedAction) () -> HanLP.Config.CustomDictionaryPath = pathArray);
    } catch (Exception e) {
        logger.error("can not find hanlp.properties", e);
    }
}
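The parsing above mirrors how HanLP reads hanlp.properties: CustomDictionaryPath is a semicolon-separated list, and an entry starting with a space is resolved against the previous entry's directory. An illustrative snippet (paths are placeholders):

root=/usr/local/hanlp/
CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns;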
Example #15
Source File: CustomDictionaryUtility.java From elasticsearch-analysis-hanlp with Apache License 2.0

public static boolean reload() {
    CustomDictionary.dat.getSize();
    String[] paths = HanLP.Config.CustomDictionaryPath;
    if (paths == null || paths.length == 0) {
        return false;
    }
    logger.debug("begin deleting hanlp custom dictionary cache");
    IOUtil.deleteFile(paths[0] + Predefine.BIN_EXT);
    logger.debug("deleted hanlp custom dictionary cache successfully");
    return loadMainDictionary(paths[0]);
}
Example #16
Source File: HanLPCRFAnalyzer.java From elasticsearch-analysis-hanlp with Apache License 2.0

@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    return new Analyzer.TokenStreamComponents(
            TokenizerBuilder.tokenizer(AccessController.doPrivileged((PrivilegedAction<Segment>) () -> {
                try {
                    return new CRFLexicalAnalyzer();
                } catch (IOException e) {
                    logger.error("cannot use crf analyzer, falling back to default segmenter", e);
                    return HanLP.newSegment();
                }
            }), configuration));
}
Example #17
Source File: HanLpSegmentFactory.java From jstarcraft-nlp with Apache License 2.0

@Override
public Segment build(Map<String, String> configurations) {
    String algorithm = get(configurations, "algorithm", "viterbi");
    Segment segment = HanLP.newSegment(algorithm);

    // Index mode
    segment.enableIndexMode(getBoolean(configurations, "enableIndexMode", false));
    segment.enableOffset(true);
    // Recognize numbers and quantifiers?
    segment.enableNumberQuantifierRecognize(getBoolean(configurations, "enableNumberQuantifierRecognize", false));
    // Recognize person names?
    segment.enableNameRecognize(getBoolean(configurations, "enableNameRecognize", false));
    // Recognize transliterated names?
    // TODO consider whether this should depend on enableNameRecognize
    segment.enableTranslatedNameRecognize(getBoolean(configurations, "enableTranslatedNameRecognize", false));
    // Recognize Japanese names?
    // TODO consider whether this should depend on enableNameRecognize
    segment.enableJapaneseNameRecognize(getBoolean(configurations, "enableJapaneseNameRecognize", false));
    // Recognize organization names?
    segment.enableOrganizationRecognize(getBoolean(configurations, "enableOrganizationRecognize", false));
    // Recognize place names?
    segment.enablePlaceRecognize(getBoolean(configurations, "enablePlaceRecognize", false));
    return segment;
}
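A minimal usage sketch for this factory (assumes HanLpSegmentFactory has a no-argument constructor; the keys are the ones read above):

Map<String, String> configurations = new HashMap<>();
configurations.put("algorithm", "viterbi");
configurations.put("enableIndexMode", "true");
configurations.put("enableNameRecognize", "true");
Segment segment = new HanLpSegmentFactory().build(configurations);
System.out.println(segment.seg("商品和服务"));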
Example #18
Source File: TestSegmentWrapper.java From elasticsearch-analysis-hanlp with Apache License 2.0

@Test
public void test1() {
    StringReader reader = new StringReader("张三\n\n\n新买的手机");
    SegmentWrapper wrapper = new SegmentWrapper(reader, HanLP.newSegment().enableOffset(true));
    while (true) {
        Term term = wrapper.next();
        if (term == null) {
            break;
        }
        System.out.println(term.word + "\t" + term.nature + "\t" + term.offset + "\t" + term.length());
    }
}
Example #19
Source File: HanLPTokenizerFactory.java From elasticsearch-analysis-hanlp with Apache License 2.0

/**
 * Initialize the factory.
 *
 * @param args the configuration entries from the XML config, passed as a Map
 */
public HanLPTokenizerFactory(Map<String, String> args) {
    super(args);
    enableIndexMode = getBoolean(args, "enableIndexMode", true);
    enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
    enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
    enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
    enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
    enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
    enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
    enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
    enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
    enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
    HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
    Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
    if (customDictionaryPathSet != null) {
        HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]);
    }
    String stopWordDictionaryPath = get(args, "stopWordDictionaryPath");
    if (stopWordDictionaryPath != null) {
        stopWordDictionary = new TreeSet<>();
        stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
    }
    if (getBoolean(args, "enableDebug", false)) {
        HanLP.Config.enableDebug();
    }
}
Example #20
Source File: Tokenizer.java From similarity with Apache License 2.0

public static List<Word> segment(String sentence) {
    List<Word> results = new ArrayList<>();

    /*// ansj_seg
    List<org.xm.ansj.domain.Term> termList = StandardSegmentation.parse(sentence).getTerms();
    results.addAll(termList.stream()
            .map(term -> new Word(term.getName(), term.getNature().natureStr))
            .collect(Collectors.toList()));*/

    /*// Xmnlp
    List<org.xm.xmnlp.seg.domain.Term> termList = Xmnlp.segment(sentence);
    results.addAll(termList.stream()
            .map(term -> new Word(term.word, term.getNature().name()))
            .collect(Collectors.toList()));*/

    // HanLP
    List<Term> termList = HanLP.segment(sentence);
    results.addAll(termList.stream()
            .map(term -> new Word(term.word, term.nature.name()))
            .collect(Collectors.toList()));
    return results;
}
Example #21
Source File: PinyinDictionaryTest.java From similarity with Apache License 2.0

@Test
public void getXmnlpPinyin() throws Exception {
    // 胳臂 (arm)
    String pinyin1 = HanLP.convertToPinyinList("胳臂").toString();
    System.out.println("胳臂:" + pinyin1);
    // 划船 (to row a boat) vs. 计划 (plan): the polyphone 划 reads differently in each
    System.out.println("划船:" + HanLP.convertToPinyinList("划船").toString());
    List<Pinyin> pinyinList = HanLP.convertToPinyinList("计划");
    System.out.println("计划:" + pinyinList.toString());
}
Example #22
Source File: Word2vecTest.java From similarity with Apache License 2.0

@Test
public void trainModel() throws Exception {
    HanLP.Config.ShowTermNature = false; // Disable part-of-speech tags in the segmented output
    Tokenizer.fileSegment(SEGMENT, RAW_CORPUS, RAW_CORPUS_SPLIT);
    String outputModelPath = Word2vec.trainModel(RAW_CORPUS_SPLIT, RAW_CORPUS_SPLIT_MODEL);
    System.out.println("outputModelPath:" + outputModelPath);
}
Example #23
Source File: HanLPTokenizerFactory.java From hanlp-lucene-plugin with Apache License 2.0

/**
 * Initialize the factory.
 *
 * @param args the configuration entries from the XML config, passed as a Map
 */
public HanLPTokenizerFactory(Map<String, String> args) {
    super(args);
    enableIndexMode = getBoolean(args, "enableIndexMode", true);
    enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
    enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
    enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
    enableCustomDictionaryForcing = getBoolean(args, "enableCustomDictionaryForcing", false);
    enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
    enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
    enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
    enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
    enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
    enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
    HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
    algorithm = getString(args, "algorithm", "viterbi");
    Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
    if (customDictionaryPathSet != null) {
        HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]);
    }
    String stopWordDictionaryPath = get(args, "stopWordDictionaryPath");
    if (stopWordDictionaryPath != null) {
        stopWordDictionary = new TreeSet<>();
        stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
    }
    if (getBoolean(args, "enableDebug", false)) {
        HanLP.Config.enableDebug();
    }
}
Example #24
Source File: HanLPAnalyzer.java From hanlp-lucene-plugin with Apache License 2.0

/**
 * Override the Analyzer hook to build the tokenization components.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableOffset(true), filter, enablePorterStemming);
    return new TokenStreamComponents(tokenizer);
}
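A minimal consumption sketch for such an analyzer, using the standard Lucene TokenStream pattern (the no-argument HanLPAnalyzer constructor is assumed from the plugin):

Analyzer analyzer = new HanLPAnalyzer();
try (TokenStream tokenStream = analyzer.tokenStream("content", "中华人民共和国")) {
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        System.out.println(termAtt.toString());
    }
    tokenStream.end();
}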
Example #25
Source File: HanLPTokenizerTest.java From hanlp-lucene-plugin with Apache License 2.0

@Override
public void setUp() throws Exception {
    tokenizer = new HanLPTokenizer(HanLP.newSegment()
            .enableJapaneseNameRecognize(true)
            .enableIndexMode(true), null, false);
    tokenizer.setReader(new StringReader("林志玲亮相网友:确定不是波多野结衣?"));
    tokenizer.reset();
}
Example #26
Source File: HanLpTokenizerTokenizerFactory.java From elasticsearch-analysis-hanlp with Apache License 2.0

@Override
public Tokenizer create() {
    return new HanLPTokenizer(HanLP.newSegment()
            .enableIndexMode(indexMode)
            .enableNameRecognize(nameRecognize)
            .enableTranslatedNameRecognize(translatedNameRecognize)
            .enableJapaneseNameRecognize(japaneseNameRecognize)
            .enablePlaceRecognize(placeRecognize)
            .enableOrganizationRecognize(organizationRecognize)
            .enableCustomDictionary(useCustomDictionary)
            .enablePartOfSpeechTagging(speechTagging)
            .enableOffset(offset)
            .enableNumberQuantifierRecognize(numberQuantifierRecognize)
            .enableMultithreading(threads), null, speechTagging);
}
Example #27
Source File: KeywordServiceImpl.java From onboard with Apache License 2.0

@Override
public void generateOrUpdateKeywordsByIdentifiable(Recommendable identifiable) {
    deleteKeywordsByIdentifiable(identifiable);
    List<String> keywords = HanLP.extractKeyword(identifiable.generateText(), PER_IDENTIFIABLE_KEYWORD_COUNT);
    for (String keyword : keywords) {
        keywordMapper.insert(generateKeywordObjectByIdentifiableAndString(identifiable, keyword));
    }
}
Example #28
Source File: Parser.java From antiplag with Apache License 2.0

public boolean parseFile(File dir, String file) {
    try {
        currentFile = file;
        String[] strs = FileIO.readFile(new File(dir, file), "utf-8");
        for (int line = 0; line < strs.length; line++) {
            // Skip blank lines
            if (strs[line].trim().length() < 1) {
                continue;
            }
            List<Term> tokens = HanLP.segment(strs[line]);
            int col = 1;
            for (int j = 0; j < tokens.size(); j++) {
                Term token = tokens.get(j);
                struct.addToken(new DocToken(token.word, currentFile, line + 1, col, token.length(), this));
                col = col + tokens.get(j).word.length() + 1;
            }
        }
    } catch (Exception e) {
        getProgram().addError("Parsing Error in '" + file + "': " + e.getMessage());
        return false;
    }
    return true;
}
Example #29
Source File: Tokenizer.java From antiplag with Apache License 2.0

public static String segment(String text, String sep) {
    StringBuilder sb = new StringBuilder();
    // Normalize: traditional -> simplified, full-width -> half-width, upper -> lower case
    HanLP.Config.Normalization = true;
    // Segment and drop stop words
    List<Term> tokens = NotionalTokenizer.segment(text);
    for (Term token : tokens) {
        sb.append(token.word).append(sep);
    }
    return sb.toString();
}
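For comparison, NotionalTokenizer drops stop words that plain HanLP.segment keeps. A minimal sketch (sample sentence is illustrative):

// Plain segmentation keeps function words such as "的"
System.out.println(HanLP.segment("小区居民有的反对养狗"));
// NotionalTokenizer removes stop words before returning terms
System.out.println(NotionalTokenizer.segment("小区居民有的反对养狗"));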
Example #30
Source File: TextRankSummary.java From TextRank with Apache License 2.0

/**
 * One-call convenience API.
 *
 * @param document the target document
 * @param size     the number of key sentences to return
 * @return the key sentences
 */
public static List<String> getTopSentenceList(String document, int size) {
    List<String> sentenceList = spiltSentence(document);
    List<List<String>> docs = new ArrayList<List<String>>();
    for (String sentence : sentenceList) {
        List<Term> termList = HanLP.segment(sentence);
        List<String> wordList = new LinkedList<String>();
        for (Term term : termList) {
            if (shouldInclude(term)) {
                wordList.add(term.word);
            }
        }
        docs.add(wordList);
    }
    TextRankSummary textRankSummary = new TextRankSummary(docs);
    int[] topSentence = textRankSummary.getTopSentence(size);
    List<String> resultList = new LinkedList<String>();
    for (int i : topSentence) {
        resultList.add(sentenceList.get(i));
    }
    return resultList;
}
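Current HanLP versions also expose this algorithm behind a one-line API, so the hand-rolled pipeline above can usually be replaced (sentence-count semantics assumed comparable):

// Top 3 key sentences via HanLP's built-in TextRank summarizer
List<String> summary = HanLP.extractSummary(document, 3);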