Python jieba.add_word() Examples

The following are 14 code examples of jieba.add_word(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module jieba, or try the search function.
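As a quick orientation before the project examples, here is a minimal, hypothetical sketch of what jieba.add_word() does: it adds a word to jieba's in-memory dictionary, optionally with a frequency and a part-of-speech tag, so the tokenizer keeps that word as a single token. The sample text and the added words below are illustrative only and are not taken from any of the projects that follow.

import jieba

text = "他正在研究石墨烯材料"
print(jieba.lcut(text))                 # with the default dictionary, "石墨烯" is typically split apart

jieba.add_word("石墨烯")                 # plain user word
jieba.add_word("云计算", freq=100000)    # an explicit frequency helps a word win over competing splits
jieba.add_word("创新办", tag="nt")       # an optional POS tag, used by jieba.posseg

print(jieba.lcut(text))                 # "石墨烯" is now kept as a single token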
Example #1
Source File: nlp.py    From open-entity-relation-extraction with MIT License 6 votes
def segment(self, sentence, entity_postag=dict()):
        """Segment a sentence (the NLPIR calls are kept as comments; jieba is used instead).
        Args:
            sentence: string, the sentence to segment
            entity_postag: dict, entity-to-POS dictionary; empty by default, built when analyzing each case's structured text
        Returns:
            lemmas: list, the segmentation result
        """
        # Add the entity dictionary
        if entity_postag:
            for entity in entity_postag:
                # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
                jieba.add_word(entity)
        # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))  # example of adding a single user word
        # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))  # example of adding a single user word
        # Segment without POS tagging
        # lemmas = pynlpir.segment(sentence, pos_tagging=False)
        lemmas = jieba.lcut(sentence)
        # pynlpir.close()  # release resources
        return lemmas
Example #2
Source File: matcher.py    From Chatbot with GNU General Public License v3.0 5 votes
def jiebaCustomSetting(self, dict_path, usr_dict_path):

        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, 'r', encoding='utf-8') as dic:
            for word in dic:
                jieba.add_word(word.strip('\n')) 
Example #3
Source File: nlp.py    From open-entity-relation-extraction with MIT License 5 votes
def __init__(self, user_dict_dir=default_user_dict_dir, model_dir=default_model_dir):
        self.default_user_dict_dir = user_dict_dir
        self.default_model_dir = model_dir
        # Initialize the segmenter
        # pynlpir.open()  # initialize the segmenter
        # Add user dictionaries (a legal-document dictionary and the Tsinghua University legal dictionary); loading them into memory this way is faster
        files = os.listdir(user_dict_dir)
        for file in files:
            file_path = os.path.join(user_dict_dir, file)
            # Skip directories
            if os.path.isdir(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                line = f.readline()
                while line:
                    word = line.strip('\n').strip()
                    jieba.add_word(word)
                    # print(c_char_p(word.encode()))
                    # pynlpir.nlpir.AddUserWord(c_char_p(word.encode()))
                    line = f.readline()

        # Load the LTP models
        # POS tagging model
        self.postagger = Postagger()
        postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model'))
        # Named entity recognition model
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model'))
        # Dependency parsing model
        self.parser = Parser()
        parse_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model'))

        if postag_flag or ner_flag or parse_flag:
            print('load model failed!') 
Example #4
Source File: xxx.py    From Medical-Named-Entity-Rec-Based-on-Dilated-CNN with GNU General Public License v3.0 5 votes
def load_dict():
    dics = csv.reader(open("DICT_NOW.csv", 'r', encoding='utf8'))
    flag = 0
    fuhao = [';', '。', '?', '?', '!', '!', ';']

    for row in dics:
        if flag == 0:
            # Skip the header row
            flag = 1
            continue
        if len(row) == 2:
            jieba.add_word(row[0].strip(), tag=row[1].strip())
Example #5
Source File: harvesttext.py    From HarvestText with MIT License 5 votes
def prepare(self):
        self.prepared = True
        for type0 in self.entity_types:
            tag0 = "n"
            if "人名" in type0:        # person name
                tag0 = "nr"
            elif "地名" in type0:      # place name
                tag0 = "ns"
            elif "机构" in type0:      # organization
                tag0 = "nt"
            elif "其他专名" in type0:  # other proper noun
                tag0 = "nz"
            jieba.add_word(type0, freq=10000, tag=tag0)
Example #6
Source File: data_preprocess.py    From Neural-Headline-Generator-CN with GNU General Public License v3.0 5 votes
def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
Example #7
Source File: spm_preprocessor.py    From fancy-nlp with GNU General Public License v3.0 5 votes
def load_word_dict(self) -> None:
        if self.external_word_dict:
            for word in self.external_word_dict:
                jieba.add_word(word, freq=1000000) 
Example #8
Source File: text_classification_preprocessor.py    From fancy-nlp with GNU General Public License v3.0 5 votes
def load_word_dict(self):
        if self.external_word_dict:
            for word in self.external_word_dict:
                jieba.add_word(word, freq=1000000) 
Example #9
Source File: ner_preprocessor.py    From fancy-nlp with GNU General Public License v3.0 5 votes
def load_word_dict(self):
        """Load external word dictionary in jieba"""
        if self.external_word_dict:
            for word in self.external_word_dict:
                jieba.add_word(word, freq=1000000) 
Example #10
Source File: text_Emotion.py    From AiLearning with GNU General Public License v3.0 5 votes
def load_word2jieba(self):
        vocab_list = load_pkl(self.vocab_list)
        if vocab_list != []:
            print("加载词的总量: ", len(vocab_list))
            for word in vocab_list:
                jieba.add_word(word) 
Example #11
Source File: text_Emotion.py    From AiLearning with GNU General Public License v3.0 5 votes
def load_data(self, word_index, vocab_list, test_size=0.25):
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        if vocab_list != []:
            for word in vocab_list:
                jieba.add_word(word)

        def func(line):
            # Split the text ['1, 2, 3', '1, 2, .., n'] into: [[1, 2, 3], [1, 2, .., n]]
            words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
            indexs = [word_index.get(word, 0) for word in words]
            return indexs

        df = pd.read_excel(self.data_file, header=0)
        x = df["comment"].apply(lambda line: func(line)).tolist()
        x = pad_sequences(x, maxlen=self.MAX_SEQUENCE_LENGTH)
        y = df["label"].tolist()
        # Generate labels (natural numbers 0, 1, 2, ...) according to value and order
        """
        In [7]: to_categorical(np.asarray([1,1,0,1,3]))
        Out[7]:
        array([[0., 1., 0., 0.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 0., 1.]], dtype=float32)
        """
        y = to_categorical(np.asarray(y))
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=10000)
        return (x_train, y_train), (x_test, y_test) 
Example #12
Source File: matcher.py    From PTT-Chat-Generator with MIT License 5 votes
def jiebaCustomSetting(self, dict_path, usr_dict_path):

        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, 'r', encoding='utf-8') as dic:
            for word in dic:
                jieba.add_word(word.strip('\n')) 
Example #13
Source File: matcher.py    From PTT-Chat-Generator with MIT License 5 votes
def TaibaCustomSetting(self, usr_dict):

        with open(usr_dict, 'r', encoding='utf-8') as dic:
            for word in dic:
                Taiba.add_word(word.strip('\n')) 
Example #14
Source File: data_loader.py    From Agriculture_KnowledgeGraph with GNU General Public License v3.0 5 votes
def sentence_segmentation(self, sentence, entity1, entity2):
        jieba.add_word(entity1, freq=999999)
        jieba.add_word(entity2, freq=999999)

        seglist = list(jieba.cut(sentence, cut_all=False, HMM=False))
        jieba.del_word(entity1)
        jieba.del_word(entity2)
        return seglist