Python jieba.posseg Examples
The following are 30 code examples of the module jieba.posseg.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module jieba,
or try the search function.
Example #1
Source File: jieba_pseg_extractor.py From rasa_nlu with Apache License 2.0 | 5 votes |
def posseg(text):
    # type: (Text) -> List[Token]
    """Tokenize *text* with jieba and POS-tag every token.

    Returns a list of ``(pairs, start, end)`` tuples, where ``pairs`` is the
    list of ``(word, flag)`` tags that ``pseg.cut`` produced for the token
    spanning ``text[start:end]``.
    """
    import jieba
    import jieba.posseg as pseg

    tagged_tokens = []
    for word, start, end in jieba.tokenize(text):
        pairs = [(w, f) for w, f in pseg.cut(word)]
        tagged_tokens.append((pairs, start, end))
    return tagged_tokens
Example #2
Source File: semantic.py From chat with MIT License | 5 votes |
def synonym_cut(sentence, pattern="wf"): """Cut the sentence into a synonym vector tag. 将句子切分为同义词向量标签。 If a word in this sentence was not found in the synonym dictionary, it will be marked with default value of the word segmentation tool. 如果同义词词典中没有则标注为切词工具默认的词性。 Args: pattern: 'w'-分词, 'k'-唯一关键词,'t'-关键词列表, 'wf'-分词标签, 'tf-关键词标签'。 """ # 句尾标点符号过滤 sentence = sentence.rstrip(''.join(punctuation_all)) # 句尾语气词过滤 sentence = sentence.rstrip(tone_words) synonym_vector = [] if pattern == "w": synonym_vector = [item for item in jieba.cut(sentence) if item not in filter_characters] elif pattern == "k": synonym_vector = analyse.extract_tags(sentence, topK=1) elif pattern == "t": synonym_vector = analyse.extract_tags(sentence, topK=10) elif pattern == "wf": result = posseg.cut(sentence) # synonym_vector = [(item.word, item.flag) for item in result \ # if item.word not in filter_characters] # Modify in 2017.4.27 for item in result: if item.word not in filter_characters: if len(item.flag) < 4: item.flag = list(posseg.cut(item.word))[0].flag synonym_vector.append((item.word, item.flag)) elif pattern == "tf": result = posseg.cut(sentence) tags = analyse.extract_tags(sentence, topK=10) for item in result: if item.word in tags: synonym_vector.append((item.word, item.flag)) return synonym_vector
Example #3
Source File: jieba_test.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def testPosseg_NOHMM(self):
    """POS tagging with HMM disabled: cut() must yield a generator of tagged words."""
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content,HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        rendered = [w.word + " / " + w.flag for w in result]
        print(" , ".join(rendered), file=sys.stderr)
    print("testPosseg_NOHMM", file=sys.stderr)
Example #4
Source File: jieba_test.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def testPosseg(self):
    """POS tagging with the default HMM: cut() must yield a generator of tagged words."""
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        rendered = [w.word + " / " + w.flag for w in result]
        print(" , ".join(rendered), file=sys.stderr)
    print("testPosseg", file=sys.stderr)
Example #5
Source File: tfidf.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def __init__(self, idf_path=None):
    """Set up the TF-IDF keyword extractor.

    idf_path: optional path to a custom IDF frequency file; falls back to
    DEFAULT_IDF when not given.
    """
    self.tokenizer = jieba.dt
    self.postokenizer = jieba.posseg.dt
    self.stop_words = self.STOP_WORDS.copy()
    loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_loader = loader
    self.idf_freq, self.median_idf = loader.get_idf()
Example #6
Source File: __main__.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def cutfunc(sentence, _, HMM=True):
    """Yield each POS-tagged token of *sentence* as a "word<posdelim>flag" string."""
    pairs = jieba.posseg.cut(sentence, HMM)
    for word, flag in pairs:
        yield word + posdelim + flag
Example #7
Source File: jieba_test.py From annotated_jieba with MIT License | 5 votes |
def testPosseg_NOHMM(self):
    """POS tagging with HMM disabled: cut() must yield a generator of tagged words."""
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content,HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        rendered = [w.word + " / " + w.flag for w in result]
        print(" , ".join(rendered), file=sys.stderr)
    print("testPosseg_NOHMM", file=sys.stderr)
Example #8
Source File: jieba_test.py From annotated_jieba with MIT License | 5 votes |
def testPosseg(self):
    """POS tagging with the default HMM: cut() must yield a generator of tagged words."""
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        rendered = [w.word + " / " + w.flag for w in result]
        print(" , ".join(rendered), file=sys.stderr)
    print("testPosseg", file=sys.stderr)
Example #9
Source File: tfidf.py From annotated_jieba with MIT License | 5 votes |
def __init__(self, idf_path=None):
    """Set up the TF-IDF keyword extractor.

    idf_path: optional path to a custom IDF frequency file; falls back to
    DEFAULT_IDF when not given.
    """
    self.tokenizer = jieba.dt
    self.postokenizer = jieba.posseg.dt
    self.stop_words = self.STOP_WORDS.copy()
    loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_loader = loader
    self.idf_freq, self.median_idf = loader.get_idf()
Example #10
Source File: harvesttext.py From HarvestText with MIT License | 5 votes |
def posseg(self, sent, standard_name=False, stopwords=None):
    """POS-tag *sent*, replacing linked entity mentions with their entity type.

    English text is delegated to NLTK; otherwise jieba's pseg is used on a
    de-coreferenced copy of the sentence, and words matching an entity type
    are rewritten to either the linked standard name or the original surface
    form, with the entity type (brackets stripped) as the flag.

    Returns a list of (word, flag) pairs.
    """
    if self.language == 'en':
        from nltk import word_tokenize, pos_tag
        stopwords = set() if stopwords is None else stopwords
        tokens = [word for word in word_tokenize(sent) if word not in stopwords]
        return pos_tag(tokens, tagset='universal')
    else:
        self.standard_name = standard_name
        entities_info = self.entity_linking(sent)
        sent2 = self.decoref(sent, entities_info)
        result = []
        i = 0
        for word, flag in pseg.cut(sent2):
            if word in self.entity_types:
                if self.standard_name:
                    word = entities_info[i][1][0]  # use the linked entity's standard name
                else:
                    l, r = entities_info[i][0]  # or keep the original surface text
                    word = sent[l:r]
                # entity type string with the surrounding brackets stripped
                flag = entities_info[i][1][1][1:-1]
                i += 1
            else:
                if stopwords and word in stopwords:
                    continue
            result.append((word, flag))
        return result
Example #11
Source File: predict.py From chinese_reading_comprehension with Apache License 2.0 | 5 votes |
def get_n(sentence):
    """Return the set of noun-like words found in *sentence*.

    A word is kept when its jieba POS flag contains 'n', which covers the
    plain noun flags ('n', 'nr', 'ns', ...) as well as 'vn'.
    """
    # NOTE: the original condition was `'n' in flag or flag in ['vn']`;
    # the second clause was dead code because 'vn' already contains 'n'.
    # NOTE(review): the substring test also matches flags such as 'eng'
    # (English tokens) — confirm that this is intended.
    nouns = set()
    for word, flag in jieba.posseg.cut(sentence):
        if 'n' in flag:
            nouns.add(word)
    return nouns
Example #12
Source File: tfidf.py From python-girlfriend-mood with MIT License | 5 votes |
def __init__(self, idf_path=None):
    """Set up the TF-IDF keyword extractor.

    idf_path: optional path to a custom IDF frequency file; falls back to
    DEFAULT_IDF when not given.
    """
    self.tokenizer = jieba.dt
    self.postokenizer = jieba.posseg.dt
    self.stop_words = self.STOP_WORDS.copy()
    loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_loader = loader
    self.idf_freq, self.median_idf = loader.get_idf()
Example #13
Source File: __main__.py From python-girlfriend-mood with MIT License | 5 votes |
def cutfunc(sentence, _, HMM=True):
    """Yield each POS-tagged token of *sentence* as a "word<posdelim>flag" string."""
    pairs = jieba.posseg.cut(sentence, HMM)
    for word, flag in pairs:
        yield word + posdelim + flag
Example #14
Source File: tfidf.py From QAbot_by_base_KG with MIT License | 5 votes |
def __init__(self, idf_path=None):
    """Set up the TF-IDF keyword extractor.

    idf_path: optional path to a custom IDF frequency file; falls back to
    DEFAULT_IDF when not given.
    """
    self.tokenizer = jieba.dt
    self.postokenizer = jieba.posseg.dt
    self.stop_words = self.STOP_WORDS.copy()
    loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_loader = loader
    self.idf_freq, self.median_idf = loader.get_idf()
Example #15
Source File: __main__.py From QAbot_by_base_KG with MIT License | 5 votes |
def cutfunc(sentence, _, HMM=True):
    """Yield each POS-tagged token of *sentence* as a "word<posdelim>flag" string."""
    pairs = jieba.posseg.cut(sentence, HMM)
    for word, flag in pairs:
        yield word + posdelim + flag
Example #16
Source File: __main__.py From jieba_fast with MIT License | 5 votes |
def cutfunc(sentence, _, HMM=True):
    """Yield each POS-tagged token of *sentence* as a "word<posdelim>flag" string."""
    pairs = jieba.posseg.cut(sentence, HMM)
    for word, flag in pairs:
        yield word + posdelim + flag
Example #17
Source File: jieba_pseg_extractor.py From rasa_nlu_gq with Apache License 2.0 | 5 votes |
def posseg(text):
    # type: (Text) -> List[Token]
    """Tokenize *text* with jieba and POS-tag every token, keeping offsets."""
    return [
        ([(w, f) for (w, f) in pseg.cut(word)], start, end)
        for (word, start, end) in jieba.tokenize(text)
    ]
Example #18
Source File: tfidf.py From jieba_fast with MIT License | 5 votes |
def __init__(self, idf_path=None):
    """Set up the TF-IDF keyword extractor.

    idf_path: optional path to a custom IDF frequency file; falls back to
    DEFAULT_IDF when not given.
    """
    self.tokenizer = jieba.dt
    self.postokenizer = jieba.posseg.dt
    self.stop_words = self.STOP_WORDS.copy()
    loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_loader = loader
    self.idf_freq, self.median_idf = loader.get_idf()
Example #19
Source File: jieba_test.py From jieba_fast with MIT License | 5 votes |
def testPosseg(self):
    """POS tagging with the default HMM: cut() must yield a generator of tagged words."""
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        rendered = [w.word + " / " + w.flag for w in result]
        print(" , ".join(rendered), file=sys.stderr)
    print("testPosseg", file=sys.stderr)
Example #20
Source File: jieba_test.py From jieba_fast with MIT License | 5 votes |
def testPosseg_NOHMM(self):
    """POS tagging with HMM disabled: cut() must yield a generator of tagged words."""
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content,HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        rendered = [w.word + " / " + w.flag for w in result]
        print(" , ".join(rendered), file=sys.stderr)
    print("testPosseg_NOHMM", file=sys.stderr)
Example #21
Source File: __main__.py From chinese-support-redux with GNU General Public License v3.0 | 5 votes |
def cutfunc(sentence, _, HMM=True):
    """Yield each POS-tagged token of *sentence* as a "word<posdelim>flag" string."""
    pairs = jieba.posseg.cut(sentence, HMM)
    for word, flag in pairs:
        yield word + posdelim + flag
Example #22
Source File: tfidf.py From chinese-support-redux with GNU General Public License v3.0 | 5 votes |
def __init__(self, idf_path=None):
    """Set up the TF-IDF keyword extractor.

    idf_path: optional path to a custom IDF frequency file; falls back to
    DEFAULT_IDF when not given.
    """
    self.tokenizer = jieba.dt
    self.postokenizer = jieba.posseg.dt
    self.stop_words = self.STOP_WORDS.copy()
    loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_loader = loader
    self.idf_freq, self.median_idf = loader.get_idf()
Example #23
Source File: predata.py From medical-entity-recognition with Apache License 2.0 | 5 votes |
def text2ner(text):
    """Convert inline ``<tag>...</tag>``-annotated text into parallel NER lists.

    Returns (seq, pos, label): the words, their jieba POS flags, and a
    B/I/O label per word derived from the surrounding tag markup.
    """
    seq, pos, label = [], [], []
    segment = jieba.posseg.cut(text)
    words, flags = [], []
    for seg in segment:
        words.append(seg.word)
        flags.append(seg.flag)
    i = 0
    tag = 'O'
    pre = 0  # nesting depth of currently-open <...> tags
    sign = 0  # set once a word has been emitted inside consecutive <...> tags
    while i < len(words):
        if words[i] != '<':
            seq.append(words[i])
            pos.append(flags[i])
            label.append(tag)
            if tag == 'B':
                tag = 'I'
            sign = 1
            i += 1
        else:
            # NOTE(review): words[i+1] assumes '<' is never the last token;
            # confirm the input guarantees a matching closing marker.
            if words[i+1] == '/':
                # closing tag: pop one nesting level
                pre -= 1
                if pre == 0:
                    tag = 'O'
                else:
                    tag = 'I'
            else:
                # opening tag: push one nesting level
                pre += 1
                if pre == 1:
                    tag = 'B'
                    sign = 0
                elif sign == 1:
                    tag = 'I'
            # skip the rest of the tag up to and including '>'
            while i < len(words) and words[i] != '>':
                i += 1
            i += 1
    return seq, pos, label
Example #24
Source File: tokenizer_test.py From pycorrector with Apache License 2.0 | 5 votes |
def test_segment():
    """Smoke-test disease-name correction: print segmentations of a misspelled drug name."""
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # correct drug name: 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
    import jieba.posseg as pseg
    words = pseg.lcut("我爱北京天安门")  # jieba default mode
    print('old:', words)
    # jieba.enable_paddle()  # enable paddle mode; supported from v0.40, not in earlier versions
    # words = pseg.cut("我爱北京天安门", use_paddle=True)  # paddle mode
    # for word, flag in words:
    #     print('new:','%s %s' % (word, flag))
Example #25
Source File: __main__.py From Synonyms with MIT License | 5 votes |
def cutfunc(sentence, _, HMM=True):
    """Yield each POS-tagged token of *sentence* as a "word<posdelim>flag" string."""
    pairs = jieba.posseg.cut(sentence, HMM)
    for word, flag in pairs:
        yield word + posdelim + flag
Example #26
Source File: tfidf.py From Synonyms with MIT License | 5 votes |
def __init__(self, idf_path=None):
    """Set up the TF-IDF keyword extractor.

    idf_path: optional path to a custom IDF frequency file; falls back to
    DEFAULT_IDF when not given.
    """
    self.tokenizer = jieba.dt
    self.postokenizer = jieba.posseg.dt
    self.stop_words = self.STOP_WORDS.copy()
    loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_loader = loader
    self.idf_freq, self.median_idf = loader.get_idf()
Example #27
Source File: jieba_pseg_extractor.py From rasa_nlu_gq with Apache License 2.0 | 5 votes |
def posseg_cut_examples(self, example):
    """Append POS-matched words from example.text to its entity list and return it."""
    raw_entities = example.get("entities", [])
    wanted_flags = self.component_config["part_of_speech"]
    for item_posseg, start, end in self.posseg(example.text):
        for word, flag in item_posseg:
            if flag in wanted_flags:
                raw_entities.append({
                    'start': start,
                    'end': end,
                    'value': word,
                    'entity': flag
                })
    return raw_entities
Example #28
Source File: jieba_pseg_extractor.py From rasa_nlu with Apache License 2.0 | 5 votes |
def posseg_cut_examples(self, example):
    """Append POS-matched words from example.text to its entity list and return it."""
    raw_entities = example.get("entities", [])
    wanted_flags = self.component_config["part_of_speech"]
    for item_posseg, start, end in self.posseg(example.text):
        for word, flag in item_posseg:
            if flag in wanted_flags:
                raw_entities.append({
                    'start': start,
                    'end': end,
                    'value': word,
                    'entity': flag
                })
    return raw_entities
Example #29
Source File: tfidf.py From python-girlfriend-mood with MIT License | 4 votes |
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
    """
    Extract keywords from sentence using TF-IDF algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v', 'nr'].
                    if the POS of w is not in this list, it will be filtered.
        - withFlag: only work with allowPOS is not empty.
                    if True, return a list of pair(word, weight) like posseg.cut
                    if False, return a list of words
    """
    if allowPOS:
        # POS filtering requires the POS-tagging tokenizer.
        allowPOS = frozenset(allowPOS)
        words = self.postokenizer.cut(sentence)
    else:
        words = self.tokenizer.cut(sentence)
    freq = {}
    for w in words:
        if allowPOS:
            if w.flag not in allowPOS:
                continue
            elif not withFlag:
                # Drop the flag unless the caller asked to keep pairs.
                w = w.word
        # wc is always the bare word used for stop-word / length checks.
        wc = w.word if allowPOS and withFlag else w
        if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    # Convert raw counts to TF-IDF scores; unknown words get the median IDF.
    for k in freq:
        kw = k.word if allowPOS and withFlag else k
        freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
    if withWeight:
        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(freq, key=freq.__getitem__, reverse=True)
    if topK:
        return tags[:topK]
    else:
        return tags
Example #30
Source File: reader.py From medical-entity-recognition with Apache License 2.0 | 4 votes |
def text2word_ner_bio_format(text):
    """
    Convert the tagged dataset text into word-level NER data in BIO format.
    :param text: annotated input text containing <tag>...</tag> markers
    :return: list of [word, pos_flag, bio_label] triples
    """
    segment = jieba.posseg.cut(text)
    # BIOSE scheme:
    # B: begin, I: inside, O: outside, S: single word, E: end
    # Convert the training data into standard NER-format data.
    start = 0
    type = ''  # NOTE(review): shadows the builtin `type`
    stack = []  # currently-open entity type tags
    flag = 0  # 1 while inside a <...> marker
    features = []
    pieces = split(text)
    pre = 0  # search offset within the current piece
    for seg in segment:
        if seg.word == '<':
            flag = 1
            pre = 0
            continue
        elif seg.word == '>':
            flag = 0
            pre = 0
            continue
        if flag == 0:
            # Push opening tags / pop closing tags until a plain piece is reached.
            while start < len(pieces) and getType(pieces[start]) != 'OTHER':
                stack.append(getType(pieces[start]))
                start += 1
            while start < len(pieces) and getType(pieces[start][1:]) != 'OTHER':
                stack.pop()
                start += 1
            while start < len(pieces) and getType(pieces[start]) != 'OTHER':
                stack.append(getType(pieces[start]))
                start += 1
            if start < len(pieces):
                index = pieces[start].find(seg.word, pre)
                pre = index + 1
            # NOTE(review): `index` is only assigned when start < len(pieces);
            # later reads assume that held at least once — confirm on real data.
            if len(stack) == 0:
                type = 'O'
                if start < len(pieces) and index + len(seg.word) == len(pieces[start]):
                    start += 1
            else:
                if start < len(pieces):
                    if index == 0:
                        type = 'B-' + stack[-1]
                    elif index != -1:
                        type = 'I-' + stack[-1]
                    if len(pieces[start]) - index == len(seg.word):
                        start += 1
            features.append([seg.word, seg.flag, type])
    return features