com.hankcs.hanlp.seg.Segment Java Examples

The following examples show how to use com.hankcs.hanlp.seg.Segment. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: WordFreqStatistics.java    From similarity with Apache License 2.0 6 votes vote down vote up
/**
 * Runs word-frequency statistics over the whole contents of a text file.
 *
 * @param segment       the HanLP segmenter used for tokenization
 * @param inputFilePath path of the input text file; the result is written next to it
 *                      as "&lt;name&gt;-WordFrequencyStatistics-Result.txt"
 */
public static void statistics(Segment segment, String inputFilePath) {
    // try-with-resources guarantees the reader is closed even when seg()/dump()
    // throws; the original closed it only on the success path (resource leak).
    try (BufferedReader reader = IOUtil.newBufferedReader(inputFilePath)) {
        // 词频统计 (word-frequency statistics)
        WordFreqStatistics statistic = new WordFreqStatistics(segment);
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            // NOTE(review): lines are concatenated without a separator, as in the
            // original; word counts are unaffected for CJK text, but confirm this
            // is intended for space-delimited languages.
            content.append(line);
        }
        statistic.seg(content.toString());
        statistic.setResultPath(inputFilePath.replace(".txt", "") + "-WordFrequencyStatistics-Result.txt");
        statistic.dump();
    } catch (IOException e) {
        logger.error("IO error: " + e.getLocalizedMessage());
    }
}
 
Example #2
Source File: TranslatedNameRecognition.java    From danyuan-application with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts transliterated (foreign) person names from the given sentences.
 *
 * @param str sentences to scan
 * @return words whose part-of-speech nature is "nrf" (transliterated person name)
 */
public static List<String> TranslatedName(String[] str) {
	List<String> list = new ArrayList<String>();
	Segment segment = HanLP.newSegment().enableTranslatedNameRecognize(true);
	for (String sentence : str) {
		List<Term> termList = segment.seg(sentence);
		for (Term term : termList) {
			// Compare the nature tag directly. The original tested
			// term.toString().contains("nrf"), and toString() is "word/nature",
			// so it also fired when the word text itself contained "nrf".
			if (term.nature != null && "nrf".equals(term.nature.toString())) {
				list.add(term.word);
			}
		}
	}
	return list;
}
 
Example #3
Source File: HanLpTokenizer.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a tokenizer wrapping a HanLP segmenter.
 *
 * @param segment a HanLP segmenter (HanLP中的某个分词器)
 * @param filter  stop words to suppress in the token stream; may be null or empty,
 *                in which case no filtering trie is built
 */
public HanLpTokenizer(Segment segment, Set<String> filter) {
    super();
    this.segment = new SegmentWrapper(this.input, segment);
    if (filter != null && filter.size() > 0) {
        // Store stop words in a BinTrie for fast lookup during tokenization.
        this.filter = new BinTrie<String>();
        for (String stopWord : filter) {
            this.filter.put(stopWord, null);
        }
    }
}
 
Example #4
Source File: HanLPNLPAnalyzer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the token-stream components using HanLP's perceptron lexical analyzer,
 * falling back to the default segmenter when the model cannot be loaded.
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    PrivilegedAction<Segment> newSegment = () -> {
        try {
            return new PerceptronLexicalAnalyzer();
        } catch (IOException e) {
            logger.error("can not use nlp analyzer, provider default", e);
            return HanLP.newSegment();
        }
    };
    Segment segment = AccessController.doPrivileged(newSegment);
    return new Analyzer.TokenStreamComponents(TokenizerBuilder.tokenizer(segment, configuration));
}
 
Example #5
Source File: HanLPTokenizer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tokenizer backed by the given HanLP segmenter.
 *
 * @param segment              the HanLP segmenter to delegate to
 * @param filter               stop words; ignored when null or empty
 * @param enablePorterStemming whether English tokens are reduced to their stems
 */
public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming) {
    super();
    this.enablePorterStemming = enablePorterStemming;
    this.segment = new SegmentWrapper(input, segment);
    if (filter == null || filter.isEmpty()) {
        return; // no stop-word trie needed
    }
    this.filter = new BinTrie<String>();
    for (String stopWord : filter) {
        this.filter.put(stopWord, null);
    }
}
 
Example #6
Source File: HanLPCRFAnalyzer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the token-stream components using HanLP's CRF lexical analyzer,
 * falling back to the default segmenter when the model cannot be loaded.
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    PrivilegedAction<Segment> newSegment = () -> {
        try {
            return new CRFLexicalAnalyzer();
        } catch (IOException e) {
            logger.error("can not use crf analyzer, provider default", e);
            return HanLP.newSegment();
        }
    };
    Segment segment = AccessController.doPrivileged(newSegment);
    return new Analyzer.TokenStreamComponents(TokenizerBuilder.tokenizer(segment, configuration));
}
 
Example #7
Source File: HanLPTokenizerFactory.java    From elasticsearch-analysis-hanlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tokenizer factory backed by HanLP's CRF segmenter with
 * part-of-speech tagging enabled.
 */
public static HanLPTokenizerFactory createCRF(IndexSettings indexSettings,
                                              Environment environment,
                                              String name,
                                              Settings settings) {
    return new HanLPTokenizerFactory(indexSettings, environment, name, settings) {
        @Override
        public Tokenizer create() {
            Segment segment = new CRFSegment().enablePartOfSpeechTagging(true);
            return new HanLPTokenizer(segment, defaultStopWordDictionary, enablePorterStemming);
        }
    };
}
 
Example #8
Source File: HanLPTokenizerFactory.java    From elasticsearch-analysis-hanlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tokenizer factory backed by HanLP's Dijkstra (shortest-path)
 * segmenter, with place and organization recognition on and the custom
 * dictionary off.
 */
public static HanLPTokenizerFactory createShortest(IndexSettings indexSettings,
                                                   Environment environment,
                                                   String name,
                                                   Settings settings) {
    return new HanLPTokenizerFactory(indexSettings, environment, name, settings) {
        @Override
        public Tokenizer create() {
            Segment segment = new DijkstraSegment()
                    .enableCustomDictionary(false)
                    .enablePlaceRecognize(true)
                    .enableOrganizationRecognize(true);
            return new HanLPTokenizer(segment, defaultStopWordDictionary, enablePorterStemming);
        }
    };
}
 
Example #9
Source File: ChineseNameRecognition.java    From danyuan-application with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts Chinese person names from the given sentences.
 *
 * @param str sentences to scan
 * @return words whose part-of-speech nature is "nr" (Chinese person name)
 */
public static List<String> ChineseName(String[] str) {
	List<String> list = new ArrayList<String>();
	Segment segment = HanLP.newSegment().enableNameRecognize(true);
	for (String sentence : str) {
		List<Term> termList = segment.seg(sentence);
		for (Term term : termList) {
			// Compare the nature tag directly. The original tested
			// term.toString().contains("nr") ("word/nature" format), which also
			// matched "nrf"/"nrj" (foreign/Japanese names) and any word text
			// containing "nr".
			// NOTE(review): exact match on "nr" — confirm subtype tags such as
			// nr1/nr2 never need to be collected here.
			if (term.nature != null && "nr".equals(term.nature.toString())) {
				list.add(term.word);
			}
		}
	}
	return list;
}
 
Example #10
Source File: OrganizationRecognition.java    From danyuan-application with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts organization names from the given sentences.
 *
 * @param str sentences to scan
 * @return words whose part-of-speech nature starts with "nt" (organization)
 */
public static List<String> Organization(String[] str) {
	List<String> list = new ArrayList<String>();
	Segment segment = HanLP.newSegment().enableOrganizationRecognize(true);
	for (String sentence : str) {
		List<Term> termList = segment.seg(sentence);
		for (Term term : termList) {
			// Check the nature tag, not term.toString() ("word/nature"), which
			// also matched when the word text itself contained "nt".
			// Prefix match keeps organization subtypes (e.g. "ntc") included.
			if (term.nature != null && term.nature.toString().startsWith("nt")) {
				list.add(term.word);
			}
		}
	}
	return list;
}
 
Example #11
Source File: JapaneseNameRecognition.java    From danyuan-application with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts Japanese person names from the given sentences.
 *
 * @param str sentences to scan
 * @return words whose part-of-speech nature is "nrj" (Japanese person name)
 */
public static List<String> JapaneseName(String[] str) {
	List<String> list = new ArrayList<String>();
	Segment segment = HanLP.newSegment().enableJapaneseNameRecognize(true);
	for (String sentence : str) {
		List<Term> termList = segment.seg(sentence);
		for (Term term : termList) {
			// Compare the nature tag directly. The original tested
			// term.toString().contains("nrj"), and toString() is "word/nature",
			// so it also fired when the word text itself contained "nrj".
			if (term.nature != null && "nrj".equals(term.nature.toString())) {
				list.add(term.word);
			}
		}
	}
	return list;
}
 
Example #12
Source File: Tokenizer.java    From similarity with Apache License 2.0 5 votes vote down vote up
/**
 * Segments a text file line by line and writes the tab-separated result to a file.
 * Also runs word-frequency statistics over the input as a side effect.
 *
 * @param segment        the HanLP segmenter to use
 * @param inputFilePath  path of the input text file
 * @param outputFilePath destination path; when null/blank the result is written
 *                       next to the input as "&lt;name&gt;-Segment-Result.txt"
 */
public static void fileSegment(Segment segment, String inputFilePath, String outputFilePath) {
    String outPath = inputFilePath.replace(".txt", "") + "-Segment-Result.txt";
    if (outputFilePath != null && outputFilePath.trim().length() > 0) {
        outPath = outputFilePath;
    }
    // try-with-resources closes both streams even when segmentation throws;
    // the original leaked them on any exception before the explicit close() calls.
    try (BufferedReader reader = IOUtil.newBufferedReader(inputFilePath);
         FileOutputStream fos = new FileOutputStream(new File(outPath))) {
        WordFreqStatistics.statistics(segment, inputFilePath);
        long allCount = 0; // total character count of non-blank terms
        long lexCount = 0; // total term count
        long start = System.currentTimeMillis();
        String temp;
        while ((temp = reader.readLine()) != null) {
            StringBuilder sb = new StringBuilder();
            for (Term term : segment.seg(temp)) {
                sb.append(term.toString()).append('\t');
                if (term.word.trim().length() > 0) {
                    allCount += term.length();
                    lexCount += 1;
                }
            }
            // NOTE(review): getBytes() uses the platform default charset, as in
            // the original — confirm whether UTF-8 should be forced here.
            fos.write(sb.toString().trim().getBytes());
            fos.write("\n".getBytes());
        }
        fos.flush();
        long end = System.currentTimeMillis();
        System.out.println("segment result save:" + outPath);
        System.out.println("total " + allCount + " chars, " + lexCount + " words, spend" + (end - start) + "ms ");
    } catch (IOException e) {
        logger.error("IO error: " + e.getLocalizedMessage());
    }
}
 
Example #13
Source File: HanlpSegmenterTestCase.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a HanLP-backed tokenizer with character-offset tracking enabled.
 */
@Override
protected Tokenizer getSegmenter() {
    Segment segment = HanLP.newSegment();
    segment.enableOffset(true);
    return new HanLpTokenizer(segment, Collections.EMPTY_SET);
}
 
Example #14
Source File: HanLPTokenizer.java    From hanlp-lucene-plugin with Apache License 2.0 5 votes vote down vote up
/**
 * @param segment              HanLP中的某个分词器
 * @param filter               停用词
 * @param enablePorterStemming 英文原型转换
 */
public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming)
{
    super();
    this.segment = new SegmentWrapper(input, segment);
    if (filter != null && filter.size() > 0)
    {
        this.filter = new BinTrie<String>();
        for (String stopWord : filter)
        {
            this.filter.put(stopWord, null);
        }
    }
    this.enablePorterStemming = enablePorterStemming;
}
 
Example #15
Source File: HanLpSegmentFactory.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
@Override
public Segment build(Map<String, String> configurations) {
    String algorithm = get(configurations, "algorithm", "viterbi");
    Segment segment = HanLP.newSegment(algorithm);

    // 设置模式
    segment.enableIndexMode(getBoolean(configurations, "enableIndexMode", false));

    segment.enableOffset(true);

    // 是否识别数词和量词
    segment.enableNumberQuantifierRecognize(getBoolean(configurations, "enableNumberQuantifierRecognize", false));

    // 是否识别人名
    segment.enableNameRecognize(getBoolean(configurations, "enableNameRecognize", false));

    // 是否识别音译名
    // TODO 考虑是否依赖enableNameRecognize
    segment.enableTranslatedNameRecognize(getBoolean(configurations, "enableTranslatedNameRecognize", false));

    // 是否识别日本名?
    // TODO 考虑是否依赖enableNameRecognize
    segment.enableJapaneseNameRecognize(getBoolean(configurations, "enableJapaneseNameRecognize", false));

    // 是否识别组织名
    segment.enableOrganizationRecognize(getBoolean(configurations, "enableOrganizationRecognize", false));

    // 是否识别地名
    segment.enablePlaceRecognize(getBoolean(configurations, "enablePlaceRecognize", false));
    return segment;
}
 
Example #16
Source File: HanLPTokenizerFactory.java    From elasticsearch-analysis-hanlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tokenizer factory backed by HanLP's N-shortest-path segmenter,
 * with place and organization recognition on and the custom dictionary off.
 */
public static HanLPTokenizerFactory createNShort(IndexSettings indexSettings,
                                                 Environment environment,
                                                 String name,
                                                 Settings settings) {
    return new HanLPTokenizerFactory(indexSettings, environment, name, settings) {
        @Override
        public Tokenizer create() {
            Segment segment = new NShortSegment()
                    .enableCustomDictionary(false)
                    .enablePlaceRecognize(true)
                    .enableOrganizationRecognize(true);
            return new HanLPTokenizer(segment, defaultStopWordDictionary, enablePorterStemming);
        }
    };
}
 
Example #17
Source File: WordFreqStatistics.java    From similarity with Apache License 2.0 4 votes vote down vote up
/**
 * @param resultPath path where the statistics result will be written
 * @param segment    the HanLP segmenter used for tokenization
 */
public WordFreqStatistics(String resultPath, Segment segment) {
    this.segment = segment;
    this.resultPath = resultPath;
}
 
Example #18
Source File: SegmentWrapper.java    From elasticsearch-analysis-hanlp with Apache License 2.0 4 votes vote down vote up
/**
 * Wraps a HanLP segmenter around a character stream.
 *
 * @param reader  source of the text to be segmented
 * @param segment the HanLP segmenter to delegate to
 */
public SegmentWrapper(Reader reader, Segment segment) {
    this.segment = segment;
    scanner = createScanner(reader);
}
 
Example #19
Source File: WordFreqStatistics.java    From similarity with Apache License 2.0 4 votes vote down vote up
/**
 * @return the HanLP segmenter used for tokenization
 */
public Segment getSegment() {
    return this.segment;
}
 
Example #20
Source File: WordFreqStatistics.java    From similarity with Apache License 2.0 4 votes vote down vote up
/**
 * @param segment the HanLP segmenter to use for tokenization
 */
public void setSegment(Segment segment) {
    this.segment = segment;
}
 
Example #21
Source File: WordFreqStatistics.java    From similarity with Apache License 2.0 4 votes vote down vote up
/**
 * @param segment the HanLP segmenter used for tokenization
 */
public WordFreqStatistics(Segment segment) {
    this.segment = segment;
}
 
Example #22
Source File: HanLPSpeedAnalyzer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the token-stream components using HanLP's double-array-trie segmenter
 * (dictionary-based, speed-oriented) with the custom dictionary disabled.
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    PrivilegedAction<Segment> newSegment = () -> new DoubleArrayTrieSegment().enableCustomDictionary(false);
    Segment segment = AccessController.doPrivileged(newSegment);
    return new Analyzer.TokenStreamComponents(TokenizerBuilder.tokenizer(segment, configuration));
}
 
Example #23
Source File: SegmentWrapper.java    From hanlp-lucene-plugin with Apache License 2.0 4 votes vote down vote up
/**
 * Wraps a HanLP segmenter around a character stream.
 *
 * @param reader  source of the text to be segmented
 * @param segment the HanLP segmenter to delegate to
 */
public SegmentWrapper(Reader reader, Segment segment)
{
    this.segment = segment;
    this.input = reader;
}
 
Example #24
Source File: HanLPAnalyzer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the token-stream components using HanLP's default segmenter.
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    Segment segment = AccessController.doPrivileged((PrivilegedAction<Segment>) HanLP::newSegment);
    return new Analyzer.TokenStreamComponents(TokenizerBuilder.tokenizer(segment, configuration));
}
 
Example #25
Source File: HanLPNShortAnalyzer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the token-stream components using HanLP's N-shortest-path segmenter,
 * with place and organization recognition on and the custom dictionary off.
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    PrivilegedAction<Segment> newSegment = () -> new NShortSegment()
            .enableCustomDictionary(false)
            .enablePlaceRecognize(true)
            .enableOrganizationRecognize(true);
    Segment segment = AccessController.doPrivileged(newSegment);
    return new Analyzer.TokenStreamComponents(TokenizerBuilder.tokenizer(segment, configuration));
}
 
Example #26
Source File: HanLPStandardAnalyzer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the token-stream components using the standard HanLP segmenter.
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    PrivilegedAction<Segment> newSegment = HanLP::newSegment;
    Segment segment = AccessController.doPrivileged(newSegment);
    return new Analyzer.TokenStreamComponents(TokenizerBuilder.tokenizer(segment, configuration));
}
 
Example #27
Source File: HanLPIndexAnalyzer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the token-stream components using HanLP's default segmenter in index
 * mode (long words are additionally split into overlapping shorter terms).
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
    PrivilegedAction<Segment> newSegment = () -> HanLP.newSegment().enableIndexMode(true);
    Segment segment = AccessController.doPrivileged(newSegment);
    return new Analyzer.TokenStreamComponents(TokenizerBuilder.tokenizer(segment, configuration));
}
 
Example #28
Source File: SegmentWrapper.java    From elasticsearch-analysis-hanlp with Apache License 2.0 4 votes vote down vote up
/**
 * Wraps a HanLP segmenter around a character stream.
 *
 * @param reader  source of the text to be segmented
 * @param segment the HanLP segmenter to delegate to
 */
public SegmentWrapper(Reader reader, Segment segment) {
    this.segment = segment;
    scanner = createScanner(reader);
}
 
Example #29
Source File: SegmentWrapper.java    From elasticsearch-analysis-hanlp with Apache License 2.0 4 votes vote down vote up
/**
 * Wraps a HanLP segmenter around a character stream with an explicit
 * analysis configuration.
 *
 * @param reader        source of the text to be segmented
 * @param segment       the HanLP segmenter to delegate to
 * @param configuration analysis configuration
 */
public SegmentWrapper(Reader reader, Segment segment, Configuration configuration) {
    this.configuration = configuration;
    this.segment = segment;
    scanner = createScanner(reader);
}
 
Example #30
Source File: HanLPTokenizer.java    From elasticsearch-analysis-hanlp with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a tokenizer that delegates segmentation to the given HanLP segmenter.
 *
 * @param segment       a HanLP segmenter
 * @param configuration analysis configuration
 */
public HanLPTokenizer(Segment segment, Configuration configuration) {
    this.segment = new SegmentWrapper(this.input, segment, configuration);
    this.configuration = configuration;
}