Python jieba.posseg Examples

The following are 30 code examples of jieba.posseg, jieba's part-of-speech tagging module, drawn from open-source projects. Each example notes its source file, project, and license. You may also want to check out the other available functions and classes of the jieba module.
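Before working through the examples, here is a minimal sketch of the module's basic API: pseg.cut() lazily yields pair objects carrying a word and its part-of-speech flag, while pseg.lcut() returns the same results as a list. The sample sentence is only an illustration.

import jieba.posseg as pseg

# Each pair exposes .word and .flag attributes
for pair in pseg.cut("我爱北京天安门"):
    print(pair.word, pair.flag)  # e.g. 我 r, 爱 v, 北京 ns, 天安门 ns

words = pseg.lcut("我爱北京天安门")  # same results, eagerly, as a list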
Example #1
Source File: jieba_pseg_extractor.py    From rasa_nlu with Apache License 2.0
def posseg(text):
        # type: (Text) -> List[Token]

        import jieba
        import jieba.posseg as pseg

        # POS-tag every token from jieba.tokenize, keeping its character offsets
        result = []
        for (word, start, end) in jieba.tokenize(text):
            pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
            result.append((pseg_data, start, end))

        return result
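For reference, a hedged sketch of the output shape this helper produces for a short input (the exact segmentation depends on jieba's dictionary):

print(posseg("我爱北京"))
# e.g. [([('我', 'r')], 0, 1), ([('爱', 'v')], 1, 2), ([('北京', 'ns')], 2, 4)]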
Example #2
Source File: semantic.py    From chat with MIT License
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.
    将句子切分为同义词向量标签。

    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with default value of the word segmentation tool.
    如果同义词词典中没有则标注为切词工具默认的词性。

    Args:
        pattern: 'w'-分词, 'k'-唯一关键词,'t'-关键词列表, 'wf'-分词标签, 'tf-关键词标签'。
    """
    # 句尾标点符号过滤
    sentence = sentence.rstrip(''.join(punctuation_all))
    # 句尾语气词过滤
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        synonym_vector = [item for item in jieba.cut(sentence) if item not in filter_characters]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # Before the 2017-04-27 change this was simply:
        # synonym_vector = [(item.word, item.flag) for item in result
        #                   if item.word not in filter_characters]
        for item in result:
            if item.word not in filter_characters:
                # Flags shorter than 4 characters are plain POS tags rather
                # than synonym codes; fall back to the segmenter's default flag.
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector 
Example #3
Source File: jieba_test.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def testPosseg_NOHMM(self):
        import jieba.posseg as pseg
        for content in test_contents:
            result = pseg.cut(content, HMM=False)
            assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Posseg error on content: %s" % content
            print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
        print("testPosseg_NOHMM", file=sys.stderr) 
Example #4
Source File: jieba_test.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def testPosseg(self):
        import jieba.posseg as pseg
        for content in test_contents:
            result = pseg.cut(content)
            assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Posseg error on content: %s" % content
            print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
        print("testPosseg", file=sys.stderr) 
Example #5
Source File: tfidf.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def __init__(self, idf_path=None):
        self.tokenizer = jieba.dt
        self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf() 
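Here jieba.dt is the library's default Tokenizer instance and jieba.posseg.dt the default POSTokenizer, so this class reuses the globally loaded dictionaries. A small sketch of calling them directly:

import jieba
import jieba.posseg

words = list(jieba.dt.cut("我爱北京天安门"))         # plain segmentation
pairs = list(jieba.posseg.dt.cut("我爱北京天安门"))  # pair(word, flag) objects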
Example #6
Source File: __main__.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def cutfunc(sentence, _, HMM=True):
        for w, f in jieba.posseg.cut(sentence, HMM):
            yield w + posdelim + f 
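This cutfunc drives jieba's command-line interface, joining each word to its POS flag with posdelim, a module-level delimiter taken from a CLI option. A standalone sketch, with posdelim assumed to be '_':

import jieba.posseg

posdelim = '_'  # assumed value; the real script reads the delimiter from a CLI flag

def cutfunc(sentence, _, HMM=True):
    for w, f in jieba.posseg.cut(sentence, HMM):
        yield w + posdelim + f

print(' '.join(cutfunc("我爱北京天安门", None)))  # e.g. 我_r 爱_v 北京_ns 天安门_ns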
Example #7
Source File: jieba_test.py    From annotated_jieba with MIT License
def testPosseg_NOHMM(self):
        import jieba.posseg as pseg
        for content in test_contents:
            result = pseg.cut(content, HMM=False)
            assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Posseg error on content: %s" % content
            print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
        print("testPosseg_NOHMM", file=sys.stderr) 
Example #8
Source File: jieba_test.py    From annotated_jieba with MIT License
def testPosseg(self):
        import jieba.posseg as pseg
        for content in test_contents:
            result = pseg.cut(content)
            assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Posseg error on content: %s" % content
            print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
        print("testPosseg", file=sys.stderr) 
Example #9
Source File: tfidf.py    From annotated_jieba with MIT License
def __init__(self, idf_path=None):
        self.tokenizer = jieba.dt
        self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf() 
Example #10
Source File: harvesttext.py    From HarvestText with MIT License
def posseg(self, sent, standard_name=False, stopwords=None):
        if self.language == 'en':
            from nltk import word_tokenize, pos_tag
            stopwords = set() if stopwords is None else stopwords
            tokens = [word for word in word_tokenize(sent) if word not in stopwords]
            return pos_tag(tokens, tagset='universal')
        else:
            self.standard_name = standard_name
            entities_info = self.entity_linking(sent)
            sent2 = self.decoref(sent, entities_info)
            result = []
            i = 0
            for word, flag in pseg.cut(sent2):
                if word in self.entity_types:
                    if self.standard_name:
                        word = entities_info[i][1][0]  # use the linked entity's standard name
                    else:
                        l, r = entities_info[i][0]  # or keep the original text span
                        word = sent[l:r]
                    flag = entities_info[i][1][1][1:-1]
                    i += 1
                else:
                    if stopwords and word in stopwords:
                        continue
                result.append((word, flag))
            return result 
Example #11
Source File: predict.py    From chinese_reading_comprehension with Apache License 2.0
def get_n(sentence):
    words = jieba.posseg.cut(sentence)
    word_list = []
    for word, flag in words:
        # Keep noun-like words: any flag containing 'n' (n, nr, ns, vn, ...).
        # Note that 'vn' already contains 'n', so the second check is redundant,
        # and flags such as 'eng' (English words) also match.
        if 'n' in flag or flag in ['vn']:
            word_list.append(word)
    return set(word_list)
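A quick usage sketch for get_n; the sentence and the printed set are only illustrative, since the result depends on jieba's dictionary:

print(get_n("我们在北京的公园里散步"))
# might print something like {'北京', '公园'}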
Example #12
Source File: tfidf.py    From python-girlfriend-mood with MIT License
def __init__(self, idf_path=None):
        self.tokenizer = jieba.dt
        self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf() 
Example #13
Source File: __main__.py    From python-girlfriend-mood with MIT License
def cutfunc(sentence, _, HMM=True):
        for w, f in jieba.posseg.cut(sentence, HMM):
            yield w + posdelim + f 
Example #14
Source File: tfidf.py    From QAbot_by_base_KG with MIT License
def __init__(self, idf_path=None):
        self.tokenizer = jieba.dt
        self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf() 
Example #15
Source File: __main__.py    From QAbot_by_base_KG with MIT License
def cutfunc(sentence, _, HMM=True):
        for w, f in jieba.posseg.cut(sentence, HMM):
            yield w + posdelim + f 
Example #16
Source File: __main__.py    From jieba_fast with MIT License
def cutfunc(sentence, _, HMM=True):
        for w, f in jieba.posseg.cut(sentence, HMM):
            yield w + posdelim + f 
Example #17
Source File: jieba_pseg_extractor.py    From rasa_nlu_gq with Apache License 2.0
def posseg(text):
        # type: (Text) -> List[Token]
        import jieba
        import jieba.posseg as pseg

        result = []
        for (word, start, end) in jieba.tokenize(text):
            pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
            result.append((pseg_data, start, end))

        return result
Example #18
Source File: tfidf.py    From jieba_fast with MIT License
def __init__(self, idf_path=None):
        self.tokenizer = jieba.dt
        self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf() 
Example #19
Source File: jieba_test.py    From jieba_fast with MIT License
def testPosseg(self):
        import jieba.posseg as pseg
        for content in test_contents:
            result = pseg.cut(content)
            assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Posseg error on content: %s" % content
            print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
        print("testPosseg", file=sys.stderr) 
Example #20
Source File: jieba_test.py    From jieba_fast with MIT License
def testPosseg_NOHMM(self):
        import jieba.posseg as pseg
        for content in test_contents:
            result = pseg.cut(content, HMM=False)
            assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Posseg error on content: %s" % content
            print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
        print("testPosseg_NOHMM", file=sys.stderr) 
Example #21
Source File: __main__.py    From chinese-support-redux with GNU General Public License v3.0
def cutfunc(sentence, _, HMM=True):
        for w, f in jieba.posseg.cut(sentence, HMM):
            yield w + posdelim + f 
Example #22
Source File: tfidf.py    From chinese-support-redux with GNU General Public License v3.0
def __init__(self, idf_path=None):
        self.tokenizer = jieba.dt
        self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf() 
Example #23
Source File: predata.py    From medical-entity-recognition with Apache License 2.0
def text2ner(text):
    seq, pos, label = [], [], []
    segment = jieba.posseg.cut(text)
    words, flags = [], []
    for seg in segment:
        words.append(seg.word)
        flags.append(seg.flag)
    i = 0
    tag = 'O'
    pre = 0  # nesting depth of the enclosing <...> tags
    sign = 0  # set once a 'B' tag has been emitted for the current entity
    while i < len(words):
        if words[i] != '<':
            seq.append(words[i])
            pos.append(flags[i])
            label.append(tag)
            if tag == 'B':
                tag = 'I'
                sign = 1
            i += 1
        else:
            if words[i+1] == '/':
                pre -= 1
                if pre == 0:
                    tag = 'O'
                else:
                    tag = 'I'
            else:
                pre += 1
                if pre == 1:
                    tag = 'B'
                    sign = 0
                elif sign == 1:
                    tag = 'I'
            while i < len(words) and words[i] != '>':
                i += 1
            i += 1
    return seq, pos, label 
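text2ner assumes entities are wrapped in angle-bracket markup, with closing tags starting with '/'. A hedged usage sketch; the '<dis>' tag name is made up for illustration:

seq, pos, label = text2ner("患者患有<dis>糖尿病</dis>多年")
for word, flag, tag in zip(seq, pos, label):
    print(word, flag, tag)  # words inside the markup come out tagged B/I, the rest O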
Example #24
Source File: tokenizer_test.py    From pycorrector with Apache License 2.0
def test_segment():
    """测试疾病名纠错"""
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
    import jieba.posseg as pseg
    words = pseg.lcut("我爱北京天安门")  # jieba默认模式
    print('old:', words)
    # jieba.enable_paddle()  # 启动paddle模式。 0.40版之后开始支持,早期版本不支持
    # words = pseg.cut("我爱北京天安门", use_paddle=True)  # paddle模式
    # for word, flag in words:
    #     print('new:','%s %s' % (word, flag)) 
Example #25
Source File: __main__.py    From Synonyms with MIT License
def cutfunc(sentence, _, HMM=True):
        for w, f in jieba.posseg.cut(sentence, HMM):
            yield w + posdelim + f 
Example #26
Source File: tfidf.py    From Synonyms with MIT License
def __init__(self, idf_path=None):
        self.tokenizer = jieba.dt
        self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf() 
Example #27
Source File: jieba_pseg_extractor.py    From rasa_nlu_gq with Apache License 2.0
def posseg_cut_examples(self, example):
        raw_entities = example.get("entities", [])
        example_posseg = self.posseg(example.text)

        for (item_posseg, start, end) in example_posseg:
            part_of_speech = self.component_config["part_of_speech"]
            for (word_posseg, flag_posseg) in item_posseg:
                if flag_posseg in part_of_speech:
                    raw_entities.append({
                        'start': start,
                        'end': end,
                        'value': word_posseg,
                        'entity': flag_posseg
                    })
        return raw_entities 
Example #28
Source File: jieba_pseg_extractor.py    From rasa_nlu with Apache License 2.0
def posseg_cut_examples(self, example):
        raw_entities = example.get("entities", [])
        example_posseg = self.posseg(example.text)
        for (item_posseg, start, end) in example_posseg:
            part_of_speech = self.component_config["part_of_speech"]
            for (word_posseg, flag_posseg) in item_posseg:
                if flag_posseg in part_of_speech:
                    raw_entities.append({
                        'start': start,
                        'end': end,
                        'value': word_posseg,
                        'entity': flag_posseg
                    })
        return raw_entities 
Example #29
Source File: tfidf.py    From python-girlfriend-mood with MIT License
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
        """
        Extract keywords from sentence using TF-IDF algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
                        if the POS of w is not in this list,it will be filtered.
            - withFlag: only work with allowPOS is not empty.
                        if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        if allowPOS:
            allowPOS = frozenset(allowPOS)
            words = self.postokenizer.cut(sentence)
        else:
            words = self.tokenizer.cut(sentence)
        freq = {}
        for w in words:
            if allowPOS:
                if w.flag not in allowPOS:
                    continue
                elif not withFlag:
                    w = w.word
            wc = w.word if allowPOS and withFlag else w
            if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
                continue
            freq[w] = freq.get(w, 0.0) + 1.0
        total = sum(freq.values())
        for k in freq:
            kw = k.word if allowPOS and withFlag else k
            freq[k] *= self.idf_freq.get(kw, self.median_idf) / total

        if withWeight:
            tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(freq, key=freq.__getitem__, reverse=True)
        if topK:
            return tags[:topK]
        else:
            return tags 
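In jieba itself this method backs jieba.analyse.extract_tags, so the same parameters can be exercised through the top-level interface; a short sketch (the sentence is only an example):

import jieba.analyse

# keep nouns, place names and verb-nouns; withWeight yields (word, weight) pairs
for word, weight in jieba.analyse.extract_tags(
        "今天天气很好，我们去颐和园划船", topK=5,
        withWeight=True, allowPOS=('n', 'ns', 'vn')):
    print(word, weight)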
Example #30
Source File: reader.py    From medical-entity-recognition with Apache License 2.0
def text2word_ner_bio_format(text):
    """
    将标签数据集转换成word_ner_BIO格式的标准数据集
    :param text:
    :return:
    """
    segment = jieba.posseg.cut(text)
    # 采用BIOSE方式
    # B: 开始,I:中间,O:无关词,S:单个词,E:结尾
    # 将训练数据转换为标准的ner格式的数据
    start = 0
    type = ''
    stack = []
    flag = 0
    features = []
    pieces = split(text)
    pre = 0
    for seg in segment:
        if seg.word == '<':
            flag = 1
            pre = 0
            continue
        elif seg.word == '>':
            flag = 0
            pre = 0
            continue

        if flag == 0:
            while start < len(pieces) and getType(pieces[start]) != 'OTHER':
                stack.append(getType(pieces[start]))
                start += 1
            while start < len(pieces) and getType(pieces[start][1:]) != 'OTHER':
                stack.pop()
                start += 1
            while start < len(pieces) and getType(pieces[start]) != 'OTHER':
                stack.append(getType(pieces[start]))
                start += 1
            if start < len(pieces):
                index = pieces[start].find(seg.word, pre)
                pre = index + 1
                if len(stack) == 0:
                    type = 'O'
                    if start < len(pieces) and index + len(seg.word) == len(pieces[start]):
                        start += 1
                else:
                    if start < len(pieces):
                        if index == 0:
                            type = 'B-' + stack[-1]
                        elif index != -1:
                            type = 'I-' + stack[-1]
                        if len(pieces[start]) - index == len(seg.word):
                            start += 1
                features.append([seg.word, seg.flag, type])
    return features