com.hankcs.hanlp.HanLP Java Examples

The following examples show how to use com.hankcs.hanlp.HanLP. They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
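
Most of the examples below revolve around a handful of static entry points on HanLP. As a quick orientation, here is a minimal, self-contained sketch (not taken from any of the projects below; class name and sample sentence are invented) showing the core calls for segmentation, pinyin conversion, and keyword extraction:

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;

import java.util.List;

public class HanLPQuickTour {
    public static void main(String[] args) {
        String text = "商品和服务";

        // Segmentation: each Term carries the word and its part-of-speech nature
        for (Term term : HanLP.segment(text)) {
            System.out.println(term.word + "\t" + term.nature);
        }

        // Pinyin: syllables joined by the separator; false keeps characters
        // that have no pinyin reading as-is
        System.out.println(HanLP.convertToPinyinString(text, " ", false));

        // Keyword extraction (TextRank-based), top 2
        List<String> keywords = HanLP.extractKeyword(text, 2);
        System.out.println(keywords);
    }
}
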
Example #1
Source File: SignUpControll.java    From rebuild with GNU General Public License v3.0
@RequestMapping("checkout-name")
public void checkoutName(HttpServletRequest request, HttpServletResponse response) throws IOException {
	String fullName = getParameterNotNull(request, "fullName");
	
	fullName = fullName.replaceAll("[^a-zA-Z0-9\u4e00-\u9fa5]", "");
	String loginName = HanLP.convertToPinyinString(fullName, "", false);
	if (loginName.length() > 20) {
		loginName = loginName.substring(0, 20);
	}
	if (BlackList.isBlack(loginName)) {
		writeSuccess(response);
		return;
	}
	
	for (int i = 0; i < 100; i++) {
		if (Application.getUserStore().existsName(loginName)) {
			loginName += RandomUtils.nextInt(99);
		} else {
			break;
		}
	}
	
	loginName = loginName.toLowerCase();
	writeSuccess(response, loginName);
}
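
The conversion step above is what turns a display name into a login name: HanLP.convertToPinyinString with an empty separator yields concatenated toneless pinyin, e.g. "张三丰" becomes "zhangsanfeng". A minimal sketch of just that call (class name and input invented for illustration):

import com.hankcs.hanlp.HanLP;

public class PinyinNameDemo {
    public static void main(String[] args) {
        // Chinese characters -> toneless pinyin, no separator
        System.out.println(HanLP.convertToPinyinString("张三丰", "", false)); // zhangsanfeng
    }
}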
 
Example #2
Source File: HanLPTokenizerTest.java    From hanlp-lucene-plugin with Apache License 2.0
public void testMultiText() throws Exception
{
    String[] sentences = new String[]{
            "中华人民共和国",
            "地大物博"
    };
    tokenizer = new HanLPTokenizer(HanLP.newSegment()
                                           .enableJapaneseNameRecognize(true)
                                           .enableIndexMode(true), null, false);
    for (String sentence : sentences)
    {
        tokenizer.setReader(new StringReader(sentence));
        tokenizer.reset();
        testIncrementToken();
        tokenizer.close();
    }
}
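
testIncrementToken() is a helper defined elsewhere in this test class. To consume the tokenizer directly after setReader(...), the standard Lucene attribute loop looks roughly like this (a sketch assuming Lucene's core token-attribute API; the helper name is invented):

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

static void printTokens(Tokenizer tokenizer) throws Exception {
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
    tokenizer.reset(); // required before the first incrementToken()
    while (tokenizer.incrementToken()) {
        System.out.println(termAtt + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
    }
    tokenizer.end();
}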
 
Example #3
Source File: HanLPNLPAnalyzer.java    From elasticsearch-analysis-hanlp with Apache License 2.0
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    return new Analyzer.TokenStreamComponents(
        TokenizerBuilder.tokenizer(AccessController.doPrivileged((PrivilegedAction<Segment>)() -> {
            try {
                return new PerceptronLexicalAnalyzer();
            } catch (IOException e) {
                logger.error("can not use nlp analyzer, provider default", e);
                return HanLP.newSegment();
            }
        }), configuration));
}
 
Example #4
Source File: Nlputil.java    From dk-fitting with Apache License 2.0
/**
 * Pinyin conversion
 *
 * @param txt the sentence to convert to pinyin
 * @return comma-separated pinyin
 */
public static String convertToPinyinList(String txt){
    if (txt == null){
        return String.valueOf(Collections.emptyList());
    }
    List<Pinyin> pinyinList = HanLP.convertToPinyinList(txt);
    StringBuilder pinyinString = new StringBuilder();
    for (Pinyin s : pinyinList){
        if (pinyinString.length() > 0){
            pinyinString.append(',');
        }
        pinyinString.append(s);
    }
    return pinyinString.toString();
}
 
Example #5
Source File: TranslatedNameRecognition.java    From danyuan-application with Apache License 2.0
/**
 * Method: TranslatedName
 * Purpose: extract transliterated (foreign) person names
 * @param str sentences to scan
 * @return List<String> of recognized names
 * @author Tenghui.Wang
 */
public static List<String> TranslatedName(String[] str) {
	List<String> list = new ArrayList<String>();
	Segment segment = HanLP.newSegment().enableTranslatedNameRecognize(true);
	for (String sentence : str) {
		List<Term> termList = segment.seg(sentence);
		for (Term term : termList) {
			// "nrf" is the part-of-speech tag for transliterated person names
			if ("nrf".equals(term.nature.toString())) {
				list.add(term.word);
			}
		}
	}
	return list;
}
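
A hypothetical invocation (sentences invented; assumes the class above is on the classpath):

import java.util.List;

public class TranslatedNameDemo {
    public static void main(String[] args) {
        String[] sentences = {"米高积逊是一位歌手", "奥巴马曾出访中国"};
        List<String> names = TranslatedNameRecognition.TranslatedName(sentences);
        System.out.println(names); // expected to contain e.g. [米高积逊, 奥巴马]
    }
}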
 
Example #6
Source File: DKNLPBase.java    From dk-fitting with Apache License 2.0
/**
 * Pinyin conversion
 *
 * @param txt the sentence to convert to pinyin
 * @return list of pinyin
 */
public static List<Pinyin> convertToPinyinList(String txt)
{
    if (txt == null) return Collections.emptyList();

    return HanLP.convertToPinyinList(txt);
}
 
Example #7
Source File: Field2Schema.java    From rebuild with GNU General Public License v3.0
/**
 * Chinese -> pinyin (keeps letters and digits only).
 * Pure letters/digits are returned directly; unsupported characters
 * fall back to random or timestamp-based digits.
 * 
 * @param text source text
 * @return a safe lowercase identifier
 */
protected String toPinyinName(final String text) {
	String identifier = text;
	if (text.length() < 4) {
		identifier = "rb" + text + RandomUtils.nextInt(10);
	}
	
	// Pure letters/digits: return directly
	if (identifier.matches("[a-zA-Z0-9]+")) {
		if (!CharSet.ASCII_ALPHA.contains(identifier.charAt(0))
				|| BlackList.isBlack(identifier) || BlackList.isSQLKeyword(identifier)) {
			identifier = "rb" + identifier;
		}
		return identifier;
	}
	
	identifier = HanLP.convertToPinyinString(identifier, "", false);
	identifier = identifier.replaceAll("[^a-zA-Z0-9]", "");
	if (StringUtils.isBlank(identifier)) {
		identifier = String.valueOf(System.currentTimeMillis() / 1000);
	}

	char start = identifier.charAt(0);
	if (!CharSet.ASCII_ALPHA.contains(start)) {
		identifier = "rb" + identifier;
	}
	
	identifier = identifier.toLowerCase();
	if (identifier.length() > 42) {
		identifier = identifier.substring(0, 42);
	}
	
	if (!StringHelper.isIdentifier(identifier)) {
		throw new ModifiyMetadataException("无效名称 : " + text);
	}
	return identifier;
}
 
Example #8
Source File: JapaneseNameRecognition.java    From danyuan-application with Apache License 2.0
/**
 * Method: JapaneseName
 * Purpose: extract Japanese person names
 * @param str sentences to scan
 * @return List<String> of recognized names
 * @author Tenghui.Wang
 */
public static List<String> JapaneseName(String[] str) {
	List<String> list = new ArrayList<String>();
	Segment segment = HanLP.newSegment().enableJapaneseNameRecognize(true);
	for (String sentence : str) {
		List<Term> termList = segment.seg(sentence);
		for (Term term : termList) {
			// "nrj" is the part-of-speech tag for Japanese person names
			if ("nrj".equals(term.nature.toString())) {
				list.add(term.word);
			}
		}
	}
	return list;
}
 
Example #9
Source File: OrganizationRecognition.java    From danyuan-application with Apache License 2.0
/**
 * Method: Organization
 * Purpose: extract organization names
 * @param str sentences to scan
 * @return List<String> of recognized names
 * @author Tenghui.Wang
 */
public static List<String> Organization(String[] str) {
	List<String> list = new ArrayList<String>();
	Segment segment = HanLP.newSegment().enableOrganizationRecognize(true);
	for (String sentence : str) {
		List<Term> termList = segment.seg(sentence);
		for (Term term : termList) {
			// Organization natures all start with "nt" (nt, ntc, ntcb, ...)
			if (term.nature.toString().startsWith("nt")) {
				list.add(term.word);
			}
		}
	}
	return list;
}
 
Example #10
Source File: Nlputil.java    From dk-fitting with Apache License 2.0
/**
 * Keyword extraction
 *
 * @param txt    the text to extract keywords from
 * @param keySum the number of keywords to extract
 * @return comma-separated keywords
 */
public static String extractKeyword(String txt, int keySum){
    if (txt == null || keySum <= 0){
        return String.valueOf(Collections.emptyList());
    }
    List<String> keyList = HanLP.extractKeyword(txt, keySum);
    StringBuilder keyString = new StringBuilder();
    for (String s : keyList){
        if (keyString.length() > 0){
            keyString.append(',');
        }
        keyString.append(s);
    }
    return keyString.toString();
}
 
Example #11
Source File: Nlputil.java    From dk-fitting with Apache License 2.0
/**
 * Phrase extraction
 *
 * @param txt   the text
 * @param phSum the number of phrases wanted
 * @return comma-separated phrases
 */
public static String extractPhrase(String txt, int phSum) {
    if (txt == null || phSum <= 0){
        return String.valueOf(Collections.emptyList());
    }
    List<String> phraseList = HanLP.extractPhrase(txt, phSum);
    StringBuilder phraseString = new StringBuilder();
    for (String s : phraseList){
        if (phraseString.length() > 0){
            phraseString.append(',');
        }
        phraseString.append(s);
    }
    return phraseString.toString();
}
 
Example #12
Source File: BaseAction.java    From o2oa with GNU Affero General Public License v3.0
protected List<String> keys(String key) {
	List<String> os = new ArrayList<>();
	for (Term term : HanLP.segment(key)) {
		/* keep indexed fields from getting too long */
		if (StringUtils.length(term.word) < 31) {
			os.add(StringUtils.lowerCase(term.word));
		}
	}
	return os;
}
 
Example #13
Source File: ExtMonitor.java    From elasticsearch-analysis-hanlp with Apache License 2.0
@Override
public void run() {
    List<DictionaryFile> originalDictionaryFileList = DictionaryFileCache.getCustomDictionaryFileList();
    logger.debug("hanlp original custom dictionary: {}", Arrays.toString(originalDictionaryFileList.toArray()));
    reloadProperty();
    List<DictionaryFile> currentDictionaryFileList = getCurrentDictionaryFileList(HanLP.Config.CustomDictionaryPath);
    logger.debug("hanlp current custom dictionary: {}", Arrays.toString(currentDictionaryFileList.toArray()));
    boolean isModified = false;
    for (DictionaryFile currentDictionaryFile : currentDictionaryFileList) {
        if (!originalDictionaryFileList.contains(currentDictionaryFile)) {
            isModified = true;
            break;
        }
    }
    if (isModified) {
        logger.info("reloading hanlp custom dictionary");
        try {
            AccessController.doPrivileged((PrivilegedAction) CustomDictionaryUtility::reload);
        } catch (Exception e) {
            logger.error("can not reload hanlp custom dictionary", e);
        }
        DictionaryFileCache.setCustomDictionaryFileList(currentDictionaryFileList);
        DictionaryFileCache.writeCache();
        logger.info("finished reloading hanlp custom dictionary");
    } else {
        logger.info("hanlp custom dictionary isn't modified, so there is no need to reload");
    }
}
 
Example #14
Source File: ExtMonitor.java    From elasticsearch-analysis-hanlp with Apache License 2.0
private void reloadProperty() {
    Properties p = new Properties();
    try {
        ClassLoader loader = AccessController.doPrivileged((PrivilegedAction<ClassLoader>) () -> Thread.currentThread().getContextClassLoader());
        if (loader == null) {
            loader = HanLP.Config.class.getClassLoader();
        }
        p.load(new InputStreamReader(Predefine.HANLP_PROPERTIES_PATH == null ? loader.getResourceAsStream("hanlp.properties") : new FileInputStream(Predefine.HANLP_PROPERTIES_PATH), "UTF-8"));
        String root = p.getProperty("root", "").replaceAll("\\\\", "/");
        if (root.length() > 0 && !root.endsWith("/")) {
            root += "/";
        }
        String[] pathArray = p.getProperty("CustomDictionaryPath", "data/dictionary/custom/CustomDictionary.txt").split(";");
        String prePath = root;
        for (int i = 0; i < pathArray.length; ++i) {
            if (pathArray[i].startsWith(" ")) {
                pathArray[i] = prePath + pathArray[i].trim();
            } else {
                pathArray[i] = root + pathArray[i];
                int lastSlash = pathArray[i].lastIndexOf('/');
                if (lastSlash != -1) {
                    prePath = pathArray[i].substring(0, lastSlash + 1);
                }
                }
            }
        }
        AccessController.doPrivileged((PrivilegedAction) () -> HanLP.Config.CustomDictionaryPath = pathArray);
    } catch (Exception e) {
        logger.error("can not find hanlp.properties", e);
    }
}
 
Example #15
Source File: CustomDictionaryUtility.java    From elasticsearch-analysis-hanlp with Apache License 2.0
public static boolean reload() {
    CustomDictionary.dat.getSize();
    String[] paths = HanLP.Config.CustomDictionaryPath;
    if (paths == null || paths.length == 0) {
        return false;
    }
    logger.debug("begin delete hanlp custom dictionary cache");
    IOUtil.deleteFile(paths[0] + Predefine.BIN_EXT);
    logger.debug("delete hanlp custom dictionary cache successfully");
    return loadMainDictionary(paths[0]);
}
 
Example #16
Source File: HanLPCRFAnalyzer.java    From elasticsearch-analysis-hanlp with Apache License 2.0
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    return new Analyzer.TokenStreamComponents(
        TokenizerBuilder.tokenizer(AccessController.doPrivileged((PrivilegedAction<Segment>)() -> {
            try {
                return new CRFLexicalAnalyzer();
            } catch (IOException e) {
                logger.error("can not use crf analyzer, provider default", e);
                return HanLP.newSegment();
            }
        }), configuration));
}
 
Example #17
Source File: HanLpSegmentFactory.java    From jstarcraft-nlp with Apache License 2.0
@Override
public Segment build(Map<String, String> configurations) {
    String algorithm = get(configurations, "algorithm", "viterbi");
    Segment segment = HanLP.newSegment(algorithm);

    // Configure mode
    segment.enableIndexMode(getBoolean(configurations, "enableIndexMode", false));

    segment.enableOffset(true);

    // Whether to recognize numbers and quantifiers
    segment.enableNumberQuantifierRecognize(getBoolean(configurations, "enableNumberQuantifierRecognize", false));

    // Whether to recognize Chinese person names
    segment.enableNameRecognize(getBoolean(configurations, "enableNameRecognize", false));

    // Whether to recognize transliterated names
    // TODO: consider whether this should depend on enableNameRecognize
    segment.enableTranslatedNameRecognize(getBoolean(configurations, "enableTranslatedNameRecognize", false));

    // Whether to recognize Japanese names
    // TODO: consider whether this should depend on enableNameRecognize
    segment.enableJapaneseNameRecognize(getBoolean(configurations, "enableJapaneseNameRecognize", false));

    // Whether to recognize organization names
    segment.enableOrganizationRecognize(getBoolean(configurations, "enableOrganizationRecognize", false));

    // Whether to recognize place names
    segment.enablePlaceRecognize(getBoolean(configurations, "enablePlaceRecognize", false));
    return segment;
}
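
For comparison, assembling an equivalent segment by hand, without the factory's Map indirection, would look roughly like this (a sketch; the values shown are the factory's defaults with index mode switched on, and the class name is invented):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.Segment;

public class SegmentConfigDemo {
    public static void main(String[] args) {
        Segment segment = HanLP.newSegment("viterbi") // the factory's default algorithm
                .enableIndexMode(true)                // index mode emits overlapping tokens
                .enableOffset(true)                   // the factory always enables offsets
                .enableNumberQuantifierRecognize(false)
                .enableNameRecognize(false)
                .enableTranslatedNameRecognize(false)
                .enableJapaneseNameRecognize(false)
                .enableOrganizationRecognize(false)
                .enablePlaceRecognize(false);
        System.out.println(segment.seg("中华人民共和国"));
    }
}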
 
Example #18
Source File: TestSegmentWrapper.java    From elasticsearch-analysis-hanlp with Apache License 2.0
@Test
public void test1() {
    StringReader reader = new StringReader("张三\n\n\n新买的手机");
    SegmentWrapper wrapper = new SegmentWrapper(reader, HanLP.newSegment().enableOffset(true));
    while (true) {
        Term term = wrapper.next();
        if (term == null) {
            break;
        }
        System.out.println(term.word + "\t" + term.nature + "\t" + term.offset + "\t" + term.length());
    }
}
 
Example #19
Source File: HanLPTokenizerFactory.java    From elasticsearch-analysis-hanlp with Apache License 2.0
/**
 * Initialize the factory
 *
 * @param args configuration items from the XML config, passed in as a Map
 */
public HanLPTokenizerFactory(Map<String, String> args) {
    super(args);
    enableIndexMode = getBoolean(args, "enableIndexMode", true);
    enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
    enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
    enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
    enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
    enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
    enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
    enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
    enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
    enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
    HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
    Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
    if (customDictionaryPathSet != null) {
        HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]);
    }
    String stopWordDictionaryPath = get(args, "stopWordDictionaryPath");
    if (stopWordDictionaryPath != null) {
        stopWordDictionary = new TreeSet<>();
        stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
    }
    if (getBoolean(args, "enableDebug", false)) {
        HanLP.Config.enableDebug();
    }
}
 
Example #20
Source File: Tokenizer.java    From similarity with Apache License 2.0
public static List<Word> segment(String sentence) {
    List<Word> results = new ArrayList<>();
    /*// ansj_seg
    List<org.xm.ansj.domain.Term> termList = StandardSegmentation.parse(sentence).getTerms();//ansj
    results.addAll(termList
            .stream()
            .map(term -> new Word(term.getName(), term.getNature().natureStr))
            .collect(Collectors.toList())
    );*/

    /*//Xmnlp
    List<org.xm.xmnlp.seg.domain.Term> termList = Xmnlp.segment(sentence);
    results.addAll(termList
            .stream()
            .map(term -> new Word(term.word, term.getNature().name()))
            .collect(Collectors.toList())
    );*/

    // HanLP
    List<Term> termList = HanLP.segment(sentence);
    results.addAll(termList
            .stream()
            .map(term -> new Word(term.word, term.nature.name()))
            .collect(Collectors.toList())
    );

    return results;
}
 
Example #21
Source File: PinyinDictionaryTest.java    From similarity with Apache License 2.0
@Test
public void getXmnlpPinyin() throws Exception {
    // 胳臂 (arm)
    String pinyin1 = HanLP.convertToPinyinList("胳臂").toString();
    System.out.println("胳臂:" + pinyin1);

    // 划船 (row a boat) vs. 计划 (plan): 划 is polyphonic
    System.out.println("划船:" + HanLP.convertToPinyinList("划船").toString());
    List<Pinyin> pinyinList = HanLP.convertToPinyinList("计划");
    System.out.println("计划:" + pinyinList.toString());
}
 
Example #22
Source File: Word2vecTest.java    From similarity with Apache License 2.0
@Test
public void trainModel() throws Exception {
    HanLP.Config.ShowTermNature = false; // turn off part-of-speech tags in the output
    Tokenizer.fileSegment(SEGMENT, RAW_CORPUS, RAW_CORPUS_SPLIT);
    String outputModelPath = Word2vec.trainModel(RAW_CORPUS_SPLIT, RAW_CORPUS_SPLIT_MODEL);
    System.out.println("outputModelPath:" + outputModelPath);
}
 
Example #23
Source File: HanLPTokenizerFactory.java    From hanlp-lucene-plugin with Apache License 2.0
/**
 * Initialize the factory
 *
 * @param args configuration items from the XML config, passed in as a Map
 */
public HanLPTokenizerFactory(Map<String, String> args)
{
    super(args);
    enableIndexMode = getBoolean(args, "enableIndexMode", true);
    enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
    enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
    enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
    enableCustomDictionaryForcing = getBoolean(args, "enableCustomDictionaryForcing", false);
    enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
    enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
    enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
    enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
    enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
    enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
    HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
    algorithm = getString(args, "algorithm", "viterbi");
    Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
    if (customDictionaryPathSet != null)
    {
        HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]);
    }
    String stopWordDictionaryPath = get(args, "stopWordDictionaryPath");
    if (stopWordDictionaryPath != null)
    {
        stopWordDictionary = new TreeSet<>();
        stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
    }
    if (getBoolean(args, "enableDebug", false))
    {
        HanLP.Config.enableDebug();
    }
}
 
Example #24
Source File: HanLPAnalyzer.java    From hanlp-lucene-plugin with Apache License 2.0
/**
 * Override the Analyzer hook to build the tokenization components
 */
@Override
protected TokenStreamComponents createComponents(String fieldName)
{
    Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableOffset(true), filter, enablePorterStemming);
    return new TokenStreamComponents(tokenizer);
}
 
Example #25
Source File: HanLPTokenizerTest.java    From hanlp-lucene-plugin with Apache License 2.0
@Override
public void setUp() throws Exception
{
    tokenizer = new HanLPTokenizer(HanLP.newSegment()
                                           .enableJapaneseNameRecognize(true)
                                           .enableIndexMode(true), null, false);
    tokenizer.setReader(new StringReader("林志玲亮相网友:确定不是波多野结衣?"));
    tokenizer.reset();
}
 
Example #26
Source File: HanLpTokenizerTokenizerFactory.java    From elasticsearch-analysis-hanlp with Apache License 2.0
@Override
public Tokenizer create() {
    return new HanLPTokenizer(HanLP.newSegment()
                                   .enableIndexMode(indexMode)
                                   .enableNameRecognize(nameRecognize)
                                   .enableTranslatedNameRecognize(translatedNameRecognize)
                                   .enableJapaneseNameRecognize(japaneseNameRecognize)
                                   .enablePlaceRecognize(placeRecognize)
                                   .enableOrganizationRecognize(organizationRecognize)
                                   .enableCustomDictionary(useCustomDictionary)
                                   .enablePartOfSpeechTagging(speechTagging)
                                   .enableOffset(offset)
                                   .enableNumberQuantifierRecognize(numberQuantifierRecognize)
                                   .enableMultithreading(threads), null, speechTagging);
}
 
Example #27
Source File: KeywordServiceImpl.java    From onboard with Apache License 2.0
@Override
public void generateOrUpdateKeywordsByIdentifiable(Recommendable identifiable) {
    deleteKeywordsByIdentifiable(identifiable);
    List<String> keywords = HanLP.extractKeyword(identifiable.generateText(), PER_IDENTIFIABLE_KEYWORD_COUNT);
    for (String keyword : keywords) {
        keywordMapper.insert(generateKeywordObjectByIdentifiableAndString(identifiable, keyword));
    }
}
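
HanLP.extractKeyword ranks candidate words with TextRank. A standalone sketch of the call used above (class name, sample text, and count invented):

import com.hankcs.hanlp.HanLP;

import java.util.List;

public class KeywordDemo {
    public static void main(String[] args) {
        String text = "程序员是从事程序开发、维护的专业人员";
        List<String> keywords = HanLP.extractKeyword(text, 3); // top 3 keywords
        System.out.println(keywords);
    }
}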
 
Example #28
Source File: Parser.java    From antiplag with Apache License 2.0
public boolean parseFile(File dir, String file) {
	try {
		currentFile = file;
		String[] strs = FileIO.readFile(new File(dir, file), "utf-8");
		for (int line = 0; line < strs.length; line++) {
			if (strs[line].trim().length() < 1) {   // skip blank lines
				continue;
			}
			List<Term> tokens = HanLP.segment(strs[line]);
			int col = 1;
			for (int j = 0; j < tokens.size(); j++) {
				Term token = tokens.get(j);
				struct.addToken(new DocToken(token.word, currentFile,
						line + 1, col, token.length(), this));
				col = col + token.word.length() + 1;
			}
		}
	} catch (Exception e) {
		getProgram().addError("Parsing error in '" + file + "': " + e.getMessage());
		return false;
	}
	return true;
}
 
Example #29
Source File: Tokenizer.java    From antiplag with Apache License 2.0
public static String segment(String text, String sep) {
	StringBuilder sb = new StringBuilder();
	HanLP.Config.Normalization = true; // traditional -> simplified, full-width -> half-width, upper -> lower case
	List<Term> tokens = NotionalTokenizer.segment(text); // segment and drop stopwords
	for (Term token : tokens) {
		sb.append(token.word).append(sep);
	}
	return sb.toString();
}
 
Example #30
Source File: TextRankSummary.java    From TextRank with Apache License 2.0
/**
 * One-call convenience API
 * @param document the target document
 * @param size the number of key sentences wanted
 * @return the list of key sentences
 */
public static List<String> getTopSentenceList(String document, int size)
{
    List<String> sentenceList = spiltSentence(document);
    List<List<String>> docs = new ArrayList<List<String>>();
    for (String sentence : sentenceList)
    {
        List<Term> termList = HanLP.segment(sentence);
        List<String> wordList = new LinkedList<String>();
        for (Term term : termList)
        {
            if (shouldInclude(term))
            {
                wordList.add(term.word);
            }
        }
        docs.add(wordList);
    }
    TextRankSummary textRankSummary = new TextRankSummary(docs);
    int[] topSentence = textRankSummary.getTopSentence(size);
    List<String> resultList = new LinkedList<String>();
    for (int i : topSentence)
    {
        resultList.add(sentenceList.get(i));
    }
    return resultList;
}
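
Recent HanLP releases expose this whole routine as a built-in one-liner; if your version provides it, the call below should be equivalent in spirit (hedged: extractSummary availability depends on the HanLP release, and the class name is invented):

import com.hankcs.hanlp.HanLP;

import java.util.List;

public class SummaryDemo {
    public static void main(String[] args) {
        String document = "..."; // the target document
        List<String> summary = HanLP.extractSummary(document, 3); // top 3 key sentences
        System.out.println(summary);
    }
}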