com.hankcs.hanlp.HanLP Java Examples
The following examples show how to use com.hankcs.hanlp.HanLP. Each example is taken from an open-source project; its source file and license are noted above the code.
Example #1
Source File: SignUpControll.java From rebuild with GNU General Public License v3.0

@RequestMapping("checkout-name")
public void checkoutName(HttpServletRequest request, HttpServletResponse response) throws IOException {
    String fullName = getParameterNotNull(request, "fullName");
    // Keep only ASCII letters, digits, and CJK characters
    fullName = fullName.replaceAll("[^a-zA-Z0-9\u4e00-\u9fa5]", "");
    // Derive a login name from the pinyin of the (possibly Chinese) full name
    String loginName = HanLP.convertToPinyinString(fullName, "", false);
    if (loginName.length() > 20) {
        loginName = loginName.substring(0, 20);
    }
    if (BlackList.isBlack(loginName)) {
        writeSuccess(response);
        return;
    }
    // Append random digits until the name is unique (at most 100 attempts)
    for (int i = 0; i < 100; i++) {
        if (Application.getUserStore().existsName(loginName)) {
            loginName += RandomUtils.nextInt(99);
        } else {
            break;
        }
    }
    loginName = loginName.toLowerCase();
    writeSuccess(response, loginName);
}
Example #2
Source File: HanLPTokenizerTest.java From hanlp-lucene-plugin with Apache License 2.0

public void testMultiText() throws Exception {
    String[] sentences = new String[]{
            "中华人民共和国",
            "地大物博"
    };
    tokenizer = new HanLPTokenizer(HanLP.newSegment()
            .enableJapaneseNameRecognize(true)
            .enableIndexMode(true), null, false);
    for (String sentence : sentences) {
        tokenizer.setReader(new StringReader(sentence));
        tokenizer.reset();
        testIncrementToken();
        tokenizer.close();
    }
}
Example #3
Source File: HanLPNLPAnalyzer.java From elasticsearch-analysis-hanlp with Apache License 2.0

@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    return new Analyzer.TokenStreamComponents(
            TokenizerBuilder.tokenizer(AccessController.doPrivileged((PrivilegedAction<Segment>) () -> {
                try {
                    return new PerceptronLexicalAnalyzer();
                } catch (IOException e) {
                    logger.error("cannot use nlp analyzer, falling back to default segmenter", e);
                    return HanLP.newSegment();
                }
            }), configuration));
}
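The try/recover pattern above also works outside Lucene, since PerceptronLexicalAnalyzer is itself a Segment. A minimal sketch (assumes the perceptron model files are available on the HanLP data path):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer;
import com.hankcs.hanlp.seg.Segment;

import java.io.IOException;

public class AnalyzerFallbackDemo {
    public static void main(String[] args) {
        Segment segment;
        try {
            // Loading the perceptron models can fail with IOException
            segment = new PerceptronLexicalAnalyzer();
        } catch (IOException e) {
            // Fall back to the default viterbi segmenter
            segment = HanLP.newSegment();
        }
        System.out.println(segment.seg("商品和服务"));
    }
}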
Example #4
Source File: Nlputil.java From dk-fitting with Apache License 2.0

/**
 * Pinyin conversion.
 *
 * @param txt the text to convert
 * @return the pinyin, comma-separated
 */
public static String convertToPinyinList(String txt) {
    if (txt == null) {
        return String.valueOf(Collections.emptyList());
    }
    List<Pinyin> pinyinList = HanLP.convertToPinyinList(txt);
    // Join in reading order with comma separators
    StringBuilder pinyinString = new StringBuilder();
    for (Pinyin pinyin : pinyinList) {
        if (pinyinString.length() > 0) {
            pinyinString.append(',');
        }
        pinyinString.append(pinyin);
    }
    return pinyinString.toString();
}
Example #5
Source File: TranslatedNameRecognition.java From danyuan-application with Apache License 2.0

/**
 * Method: TranslatedName
 * Purpose: recognize transliterated (foreign) person names
 *
 * @param str the sentences to scan
 * @return List<String> of recognized names
 * @author Tenghui.Wang
 */
public static List<String> TranslatedName(String[] str) {
    List<String> list = new ArrayList<String>();
    Segment segment = HanLP.newSegment().enableTranslatedNameRecognize(true);
    for (String sentence : str) {
        List<Term> termList = segment.seg(sentence);
        for (Term term : termList) {
            // "nrf" is the part-of-speech tag for transliterated person names
            if (term.toString().contains("nrf")) {
                list.add(term.word);
            }
        }
    }
    return list;
}
Example #6
Source File: DKNLPBase.java From dk-fitting with Apache License 2.0

/**
 * Pinyin conversion.
 *
 * @param txt the text to convert
 * @return the pinyin list
 */
public static List<Pinyin> convertToPinyinList(String txt) {
    if (txt == null) {
        return Collections.emptyList();
    }
    return HanLP.convertToPinyinList(txt);
}
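For reference, each Pinyin element carries the syllable plus its tone. A minimal standalone sketch (the accessor methods are from HanLP's pinyin API; printed values are illustrative):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.py.Pinyin;

import java.util.List;

public class PinyinListDemo {
    public static void main(String[] args) {
        List<Pinyin> pinyinList = HanLP.convertToPinyinList("计划");
        for (Pinyin pinyin : pinyinList) {
            // toString() includes the tone number, e.g. "ji4"
            System.out.println(pinyin + "\t" + pinyin.getPinyinWithoutTone() + "\t" + pinyin.getTone());
        }
    }
}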
Example #7
Source File: Field2Schema.java From rebuild with GNU General Public License v3.0

/**
 * Chinese -> pinyin (keeps letters and digits only).
 * Text that is all English letters and digits is returned directly; unsupported
 * characters fall back to random digits.
 *
 * @param text
 * @return
 */
protected String toPinyinName(final String text) {
    String identifier = text;
    if (text.length() < 4) {
        identifier = "rb" + text + RandomUtils.nextInt(10);
    }
    // All ASCII letters/digits: return directly
    if (identifier.matches("[a-zA-Z0-9]+")) {
        if (!CharSet.ASCII_ALPHA.contains(identifier.charAt(0))
                || BlackList.isBlack(identifier)
                || BlackList.isSQLKeyword(identifier)) {
            identifier = "rb" + identifier;
        }
        return identifier;
    }

    identifier = HanLP.convertToPinyinString(identifier, "", false);
    identifier = identifier.replaceAll("[^a-zA-Z0-9]", "");
    if (StringUtils.isBlank(identifier)) {
        identifier = String.valueOf(System.currentTimeMillis() / 1000);
    }

    char start = identifier.charAt(0);
    if (!CharSet.ASCII_ALPHA.contains(start)) {
        identifier = "rb" + identifier;
    }

    identifier = identifier.toLowerCase();
    if (identifier.length() > 42) {
        identifier = identifier.substring(0, 42);
    }

    if (!StringHelper.isIdentifier(identifier)) {
        throw new ModifiyMetadataException("无效名称 : " + text);
    }
    return identifier;
}
Example #8
Source File: JapaneseNameRecognition.java From danyuan-application with Apache License 2.0

/**
 * Method: JapaneseName
 * Purpose: recognize Japanese person names
 *
 * @param str the sentences to scan
 * @return List<String> of recognized names
 * @author Tenghui.Wang
 */
public static List<String> JapaneseName(String[] str) {
    List<String> list = new ArrayList<String>();
    Segment segment = HanLP.newSegment().enableJapaneseNameRecognize(true);
    for (String sentence : str) {
        List<Term> termList = segment.seg(sentence);
        for (Term term : termList) {
            // "nrj" is the part-of-speech tag for Japanese person names
            if (term.toString().contains("nrj")) {
                list.add(term.word);
            }
        }
    }
    return list;
}
Example #9
Source File: OrganizationRecognition.java From danyuan-application with Apache License 2.0

/**
 * Method: Organization
 * Purpose: recognize organization names
 *
 * @param str the sentences to scan
 * @return List<String> of recognized names
 * @author Tenghui.Wang
 */
public static List<String> Organization(String[] str) {
    List<String> list = new ArrayList<String>();
    Segment segment = HanLP.newSegment().enableOrganizationRecognize(true);
    for (String sentence : str) {
        List<Term> termList = segment.seg(sentence);
        for (Term term : termList) {
            // "nt" is the part-of-speech tag family for organization names
            if (term.toString().contains("nt")) {
                list.add(term.word);
            }
        }
    }
    return list;
}
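Examples #5, #8, and #9 all follow the same pattern: enable a single recognizer on a Segment, then filter terms by part-of-speech tag. A minimal standalone sketch (the sample sentence is illustrative):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

import java.util.List;

public class OrganizationDemo {
    public static void main(String[] args) {
        Segment segment = HanLP.newSegment().enableOrganizationRecognize(true);
        List<Term> termList = segment.seg("我在上海林原科技有限公司兼职工作");
        for (Term term : termList) {
            // Organization names carry the "nt" family of tags
            if (term.nature.toString().startsWith("nt")) {
                System.out.println(term.word);
            }
        }
    }
}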
Example #10
Source File: Nlputil.java From dk-fitting with Apache License 2.0

/**
 * Keyword extraction.
 *
 * @param txt    the text to extract keywords from
 * @param keySum the number of keywords to extract
 * @return the keywords, comma-separated
 */
public static String extractKeyword(String txt, int keySum) {
    if (txt == null || keySum <= 0) {
        return String.valueOf(Collections.emptyList());
    }
    List<String> keyList = HanLP.extractKeyword(txt, keySum);
    // Join in rank order with comma separators
    return String.join(",", keyList);
}
Example #11
Source File: Nlputil.java From dk-fitting with Apache License 2.0

/**
 * Phrase extraction.
 *
 * @param txt   the text
 * @param phSum how many phrases to extract
 * @return the phrases, comma-separated
 */
public static String extractPhrase(String txt, int phSum) {
    if (txt == null || phSum <= 0) {
        return String.valueOf(Collections.emptyList());
    }
    List<String> phraseList = HanLP.extractPhrase(txt, phSum);
    // Join in rank order with comma separators
    return String.join(",", phraseList);
}
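Both helpers wrap one-line HanLP calls. A minimal standalone sketch of the underlying API (sample text and counts are illustrative):

import com.hankcs.hanlp.HanLP;

import java.util.List;

public class ExtractDemo {
    public static void main(String[] args) {
        String text = "程序员是从事程序开发、维护的专业人员";
        // TextRank-based keyword extraction: top 3 keywords
        List<String> keywords = HanLP.extractKeyword(text, 3);
        System.out.println(keywords);
        // Phrase extraction: top 3 phrases
        List<String> phrases = HanLP.extractPhrase(text, 3);
        System.out.println(phrases);
    }
}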
Example #12
Source File: BaseAction.java From o2oa with GNU Affero General Public License v3.0

protected List<String> keys(String key) {
    List<String> os = new ArrayList<>();
    for (Term term : HanLP.segment(key)) {
        /* Keep the field from getting too long */
        if (StringUtils.length(term.word) < 31) {
            os.add(StringUtils.lowerCase(term.word));
        }
    }
    return os;
}
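For reference, HanLP.segment is the plain static entry point used here. A minimal standalone sketch:

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;

public class SegmentDemo {
    public static void main(String[] args) {
        for (Term term : HanLP.segment("你好，欢迎使用HanLP汉语处理包！")) {
            // Each Term carries the token text and its part-of-speech tag
            System.out.println(term.word + "\t" + term.nature);
        }
    }
}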
Example #13
Source File: ExtMonitor.java From elasticsearch-analysis-hanlp with Apache License 2.0

@Override
public void run() {
    List<DictionaryFile> originalDictionaryFileList = DictionaryFileCache.getCustomDictionaryFileList();
    logger.debug("hanlp original custom dictionary: {}", Arrays.toString(originalDictionaryFileList.toArray()));
    reloadProperty();
    List<DictionaryFile> currentDictionaryFileList = getCurrentDictionaryFileList(HanLP.Config.CustomDictionaryPath);
    logger.debug("hanlp current custom dictionary: {}", Arrays.toString(currentDictionaryFileList.toArray()));
    boolean isModified = false;
    for (DictionaryFile currentDictionaryFile : currentDictionaryFileList) {
        if (!originalDictionaryFileList.contains(currentDictionaryFile)) {
            isModified = true;
            break;
        }
    }
    if (isModified) {
        logger.info("reloading hanlp custom dictionary");
        try {
            AccessController.doPrivileged((PrivilegedAction) CustomDictionaryUtility::reload);
        } catch (Exception e) {
            logger.error("cannot reload hanlp custom dictionary", e);
        }
        DictionaryFileCache.setCustomDictionaryFileList(currentDictionaryFileList);
        DictionaryFileCache.writeCache();
        logger.info("finished reloading hanlp custom dictionary");
    } else {
        logger.info("hanlp custom dictionary not modified, no reload needed");
    }
}
Example #14
Source File: ExtMonitor.java From elasticsearch-analysis-hanlp with Apache License 2.0

private void reloadProperty() {
    Properties p = new Properties();
    try {
        ClassLoader loader = AccessController.doPrivileged(
                (PrivilegedAction<ClassLoader>) () -> Thread.currentThread().getContextClassLoader());
        if (loader == null) {
            loader = HanLP.Config.class.getClassLoader();
        }
        p.load(new InputStreamReader(
                Predefine.HANLP_PROPERTIES_PATH == null
                        ? loader.getResourceAsStream("hanlp.properties")
                        : new FileInputStream(Predefine.HANLP_PROPERTIES_PATH),
                "UTF-8"));
        String root = p.getProperty("root", "").replaceAll("\\\\", "/");
        if (root.length() > 0 && !root.endsWith("/")) {
            root += "/";
        }
        String[] pathArray = p.getProperty("CustomDictionaryPath",
                "data/dictionary/custom/CustomDictionary.txt").split(";");
        String prePath = root;
        for (int i = 0; i < pathArray.length; ++i) {
            if (pathArray[i].startsWith(" ")) {
                // A leading space means the entry is relative to the previous entry's directory
                pathArray[i] = prePath + pathArray[i].trim();
            } else {
                pathArray[i] = root + pathArray[i];
                int lastSlash = pathArray[i].lastIndexOf('/');
                if (lastSlash != -1) {
                    prePath = pathArray[i].substring(0, lastSlash + 1);
                }
            }
        }
        AccessController.doPrivileged((PrivilegedAction) () -> HanLP.Config.CustomDictionaryPath = pathArray);
    } catch (Exception e) {
        logger.error("can not find hanlp.properties", e);
    }
}
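The parsing above mirrors how HanLP reads hanlp.properties: CustomDictionaryPath is a semicolon-separated list, and an entry starting with a space is resolved against the previous entry's directory. An illustrative snippet (paths are placeholders):

root=/usr/local/hanlp/
CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns;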
Example #15
Source File: CustomDictionaryUtility.java From elasticsearch-analysis-hanlp with Apache License 2.0

public static boolean reload() {
    CustomDictionary.dat.getSize();
    String[] paths = HanLP.Config.CustomDictionaryPath;
    if (paths == null || paths.length == 0) {
        return false;
    }
    logger.debug("begin deleting hanlp custom dictionary cache");
    IOUtil.deleteFile(paths[0] + Predefine.BIN_EXT);
    logger.debug("deleted hanlp custom dictionary cache successfully");
    return loadMainDictionary(paths[0]);
}
Example #16
Source File: HanLPCRFAnalyzer.java From elasticsearch-analysis-hanlp with Apache License 2.0

@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    return new Analyzer.TokenStreamComponents(
            TokenizerBuilder.tokenizer(AccessController.doPrivileged((PrivilegedAction<Segment>) () -> {
                try {
                    return new CRFLexicalAnalyzer();
                } catch (IOException e) {
                    logger.error("cannot use crf analyzer, falling back to default segmenter", e);
                    return HanLP.newSegment();
                }
            }), configuration));
}
Example #17
Source File: HanLpSegmentFactory.java From jstarcraft-nlp with Apache License 2.0

@Override
public Segment build(Map<String, String> configurations) {
    String algorithm = get(configurations, "algorithm", "viterbi");
    Segment segment = HanLP.newSegment(algorithm);

    // Index mode
    segment.enableIndexMode(getBoolean(configurations, "enableIndexMode", false));
    segment.enableOffset(true);
    // Recognize numbers and quantifiers?
    segment.enableNumberQuantifierRecognize(getBoolean(configurations, "enableNumberQuantifierRecognize", false));
    // Recognize person names?
    segment.enableNameRecognize(getBoolean(configurations, "enableNameRecognize", false));
    // Recognize transliterated names?
    // TODO consider whether this should depend on enableNameRecognize
    segment.enableTranslatedNameRecognize(getBoolean(configurations, "enableTranslatedNameRecognize", false));
    // Recognize Japanese names?
    // TODO consider whether this should depend on enableNameRecognize
    segment.enableJapaneseNameRecognize(getBoolean(configurations, "enableJapaneseNameRecognize", false));
    // Recognize organization names?
    segment.enableOrganizationRecognize(getBoolean(configurations, "enableOrganizationRecognize", false));
    // Recognize place names?
    segment.enablePlaceRecognize(getBoolean(configurations, "enablePlaceRecognize", false));
    return segment;
}
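A minimal usage sketch for this factory (assumes HanLpSegmentFactory has a no-argument constructor; the keys are the ones read above):

Map<String, String> configurations = new HashMap<>();
configurations.put("algorithm", "viterbi");
configurations.put("enableIndexMode", "true");
configurations.put("enableNameRecognize", "true");
Segment segment = new HanLpSegmentFactory().build(configurations);
System.out.println(segment.seg("商品和服务"));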
Example #18
Source File: TestSegmentWrapper.java From elasticsearch-analysis-hanlp with Apache License 2.0

@Test
public void test1() {
    StringReader reader = new StringReader("张三\n\n\n新买的手机");
    SegmentWrapper wrapper = new SegmentWrapper(reader, HanLP.newSegment().enableOffset(true));
    while (true) {
        Term term = wrapper.next();
        if (term == null) {
            break;
        }
        System.out.println(term.word + "\t" + term.nature + "\t" + term.offset + "\t" + term.length());
    }
}
Example #19
Source File: HanLPTokenizerFactory.java From elasticsearch-analysis-hanlp with Apache License 2.0

/**
 * Initialize the factory.
 *
 * @param args the configuration entries from the XML config, passed as a Map
 */
public HanLPTokenizerFactory(Map<String, String> args) {
    super(args);
    enableIndexMode = getBoolean(args, "enableIndexMode", true);
    enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
    enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
    enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
    enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
    enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
    enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
    enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
    enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
    enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
    HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
    Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
    if (customDictionaryPathSet != null) {
        HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]);
    }
    String stopWordDictionaryPath = get(args, "stopWordDictionaryPath");
    if (stopWordDictionaryPath != null) {
        stopWordDictionary = new TreeSet<>();
        stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
    }
    if (getBoolean(args, "enableDebug", false)) {
        HanLP.Config.enableDebug();
    }
}
Example #20
Source File: Tokenizer.java From similarity with Apache License 2.0

public static List<Word> segment(String sentence) {
    List<Word> results = new ArrayList<>();

    /*// ansj_seg
    List<org.xm.ansj.domain.Term> termList = StandardSegmentation.parse(sentence).getTerms();
    results.addAll(termList.stream()
            .map(term -> new Word(term.getName(), term.getNature().natureStr))
            .collect(Collectors.toList()));*/

    /*// Xmnlp
    List<org.xm.xmnlp.seg.domain.Term> termList = Xmnlp.segment(sentence);
    results.addAll(termList.stream()
            .map(term -> new Word(term.word, term.getNature().name()))
            .collect(Collectors.toList()));*/

    // HanLP
    List<Term> termList = HanLP.segment(sentence);
    results.addAll(termList.stream()
            .map(term -> new Word(term.word, term.nature.name()))
            .collect(Collectors.toList()));
    return results;
}
Example #21
Source File: PinyinDictionaryTest.java From similarity with Apache License 2.0

@Test
public void getXmnlpPinyin() throws Exception {
    // 胳臂 (arm)
    String pinyin1 = HanLP.convertToPinyinList("胳臂").toString();
    System.out.println("胳臂:" + pinyin1);
    // 划船 (to row a boat) vs. 计划 (plan): the polyphone 划 reads differently in each
    System.out.println("划船:" + HanLP.convertToPinyinList("划船").toString());
    List<Pinyin> pinyinList = HanLP.convertToPinyinList("计划");
    System.out.println("计划:" + pinyinList.toString());
}
Example #22
Source File: Word2vecTest.java From similarity with Apache License 2.0

@Test
public void trainModel() throws Exception {
    HanLP.Config.ShowTermNature = false; // Disable part-of-speech tags in the segmented output
    Tokenizer.fileSegment(SEGMENT, RAW_CORPUS, RAW_CORPUS_SPLIT);
    String outputModelPath = Word2vec.trainModel(RAW_CORPUS_SPLIT, RAW_CORPUS_SPLIT_MODEL);
    System.out.println("outputModelPath:" + outputModelPath);
}
Example #23
Source File: HanLPTokenizerFactory.java From hanlp-lucene-plugin with Apache License 2.0

/**
 * Initialize the factory.
 *
 * @param args the configuration entries from the XML config, passed as a Map
 */
public HanLPTokenizerFactory(Map<String, String> args) {
    super(args);
    enableIndexMode = getBoolean(args, "enableIndexMode", true);
    enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
    enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
    enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
    enableCustomDictionaryForcing = getBoolean(args, "enableCustomDictionaryForcing", false);
    enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
    enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
    enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
    enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
    enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
    enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
    HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
    algorithm = getString(args, "algorithm", "viterbi");
    Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
    if (customDictionaryPathSet != null) {
        HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]);
    }
    String stopWordDictionaryPath = get(args, "stopWordDictionaryPath");
    if (stopWordDictionaryPath != null) {
        stopWordDictionary = new TreeSet<>();
        stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
    }
    if (getBoolean(args, "enableDebug", false)) {
        HanLP.Config.enableDebug();
    }
}
Example #24
Source File: HanLPAnalyzer.java From hanlp-lucene-plugin with Apache License 2.0

/**
 * Override the Analyzer hook to build the tokenization components.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableOffset(true), filter, enablePorterStemming);
    return new TokenStreamComponents(tokenizer);
}
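A minimal consumption sketch for such an analyzer, using the standard Lucene TokenStream pattern (the no-argument HanLPAnalyzer constructor is assumed from the plugin):

Analyzer analyzer = new HanLPAnalyzer();
try (TokenStream tokenStream = analyzer.tokenStream("content", "中华人民共和国")) {
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        System.out.println(termAtt.toString());
    }
    tokenStream.end();
}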
Example #25
Source File: HanLPTokenizerTest.java From hanlp-lucene-plugin with Apache License 2.0

@Override
public void setUp() throws Exception {
    tokenizer = new HanLPTokenizer(HanLP.newSegment()
            .enableJapaneseNameRecognize(true)
            .enableIndexMode(true), null, false);
    tokenizer.setReader(new StringReader("林志玲亮相网友:确定不是波多野结衣?"));
    tokenizer.reset();
}
Example #26
Source File: HanLpTokenizerTokenizerFactory.java From elasticsearch-analysis-hanlp with Apache License 2.0

@Override
public Tokenizer create() {
    return new HanLPTokenizer(HanLP.newSegment()
            .enableIndexMode(indexMode)
            .enableNameRecognize(nameRecognize)
            .enableTranslatedNameRecognize(translatedNameRecognize)
            .enableJapaneseNameRecognize(japaneseNameRecognize)
            .enablePlaceRecognize(placeRecognize)
            .enableOrganizationRecognize(organizationRecognize)
            .enableCustomDictionary(useCustomDictionary)
            .enablePartOfSpeechTagging(speechTagging)
            .enableOffset(offset)
            .enableNumberQuantifierRecognize(numberQuantifierRecognize)
            .enableMultithreading(threads), null, speechTagging);
}
Example #27
Source File: KeywordServiceImpl.java From onboard with Apache License 2.0

@Override
public void generateOrUpdateKeywordsByIdentifiable(Recommendable identifiable) {
    deleteKeywordsByIdentifiable(identifiable);
    List<String> keywords = HanLP.extractKeyword(identifiable.generateText(), PER_IDENTIFIABLE_KEYWORD_COUNT);
    for (String keyword : keywords) {
        keywordMapper.insert(generateKeywordObjectByIdentifiableAndString(identifiable, keyword));
    }
}
Example #28
Source File: Parser.java From antiplag with Apache License 2.0

public boolean parseFile(File dir, String file) {
    try {
        currentFile = file;
        String[] strs = FileIO.readFile(new File(dir, file), "utf-8");
        for (int line = 0; line < strs.length; line++) {
            // Skip blank lines
            if (strs[line].trim().length() < 1) {
                continue;
            }
            List<Term> tokens = HanLP.segment(strs[line]);
            int col = 1;
            for (int j = 0; j < tokens.size(); j++) {
                Term token = tokens.get(j);
                struct.addToken(new DocToken(token.word, currentFile, line + 1, col, token.length(), this));
                col = col + tokens.get(j).word.length() + 1;
            }
        }
    } catch (Exception e) {
        getProgram().addError("Parsing Error in '" + file + "': " + e.getMessage());
        return false;
    }
    return true;
}
Example #29
Source File: Tokenizer.java From antiplag with Apache License 2.0

public static String segment(String text, String sep) {
    StringBuilder sb = new StringBuilder();
    // Normalize: traditional -> simplified, full-width -> half-width, upper -> lower case
    HanLP.Config.Normalization = true;
    // Segment and drop stop words
    List<Term> tokens = NotionalTokenizer.segment(text);
    for (Term token : tokens) {
        sb.append(token.word).append(sep);
    }
    return sb.toString();
}
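For comparison, NotionalTokenizer drops stop words that plain HanLP.segment keeps. A minimal sketch (sample sentence is illustrative):

// Plain segmentation keeps function words such as "的"
System.out.println(HanLP.segment("小区居民有的反对养狗"));
// NotionalTokenizer removes stop words before returning terms
System.out.println(NotionalTokenizer.segment("小区居民有的反对养狗"));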
Example #30
Source File: TextRankSummary.java From TextRank with Apache License 2.0

/**
 * One-call convenience API.
 *
 * @param document the target document
 * @param size     the number of key sentences to return
 * @return the key sentences
 */
public static List<String> getTopSentenceList(String document, int size) {
    List<String> sentenceList = spiltSentence(document);
    List<List<String>> docs = new ArrayList<List<String>>();
    for (String sentence : sentenceList) {
        List<Term> termList = HanLP.segment(sentence);
        List<String> wordList = new LinkedList<String>();
        for (Term term : termList) {
            if (shouldInclude(term)) {
                wordList.add(term.word);
            }
        }
        docs.add(wordList);
    }
    TextRankSummary textRankSummary = new TextRankSummary(docs);
    int[] topSentence = textRankSummary.getTopSentence(size);
    List<String> resultList = new LinkedList<String>();
    for (int i : topSentence) {
        resultList.add(sentenceList.get(i));
    }
    return resultList;
}
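Current HanLP versions also expose this algorithm behind a one-line API, so the hand-rolled pipeline above can usually be replaced (sentence-count semantics assumed comparable):

// Top 3 key sentences via HanLP's built-in TextRank summarizer
List<String> summary = HanLP.extractSummary(document, 3);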