Python gensim.models.word2vec() Examples

The following are 16 code examples of gensim.models.word2vec(), drawn from open-source projects. The source file, project, and license are noted above each example.
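All of the examples below assume a model trained with gensim's word2vec module. As a quick, minimal sketch of that workflow (the toy corpus and file names here are placeholders, and the keyword arguments follow the gensim 3.x API used throughout these examples; gensim 4.x renamed size to vector_size):

from gensim.models import word2vec

# Toy corpus: a list of tokenized sentences (placeholder data).
sentences = [["machine", "learning", "is", "fun"],
             ["word2vec", "learns", "word", "vectors"]]

# Train a small model and persist it in gensim's native format.
model = word2vec.Word2Vec(sentences, size=100, min_count=1)
model.save("word2vec.model")

# Reload and query it.
reloaded = word2vec.Word2Vec.load("word2vec.model")
print(reloaded.wv.most_similar("word2vec", topn=2))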
Example #1
Source File: rulebase.py    From Chatbot with GNU General Public License v3.0
def load_model(self,path):

        """
        Load a trained word2vec model: try gensim's native format first, then fall back to the binary word2vec format.

        Args:
            path: the path of the model.
        """
        try:
            self.model = models.Word2Vec.load(path)  # current loading method
        except FileNotFoundError as file_not_found_err:
            print("[Gensim] FileNotFoundError", file_not_found_err)
            exit()
        except UnicodeDecodeError as unicode_decode_err:
            print("[Gensim] UnicodeDecodeError", unicode_decode_err)
            self.model = models.KeyedVectors.load_word2vec_format(path, binary=True)  # old loading method
        except Exception as ex:
            print("[Gensim] Exception", ex)
            exit() 
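The fallback above hinges on gensim's two persistence formats: Word2Vec.load() reads models saved with model.save() (gensim's native format), while KeyedVectors.load_word2vec_format(..., binary=True) reads the original C word2vec binary format. A minimal sketch of the distinction, with placeholder file names:

from gensim import models

# Native gensim format, written by model.save(...)
model = models.Word2Vec.load("my_model.model")

# Original C word2vec binary format, e.g. pretrained GoogleNews vectors
vectors = models.KeyedVectors.load_word2vec_format("vectors.bin", binary=True)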
Example #2
Source File: data_helpers.py    From Text-Pairs-Relation-Classification with Apache License 2.0
def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (used for the embedding visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")

    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]

    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n') 
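The metadata file written above is the word list that TensorBoard's Embedding Projector displays next to an embedding tensor. A hedged sketch of wiring it up with the tensorboard projector plugin (the log directory and tensor name are placeholders, and the API shown is the TF2-era tensorboard.plugins.projector; earlier versions pass a summary writer instead of a log directory):

from tensorboard.plugins import projector

log_dir = "logs/embedding"                 # placeholder log directory
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = "word_embedding"   # placeholder tensor name
embedding.metadata_path = "metadata.tsv"   # file written by create_metadata_file
projector.visualize_embeddings(log_dir, config)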
Example #3
Source File: data_helpers.py    From Text-Pairs-Relation-Classification with Apache License 2.0
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model.wv[key]  # index via KeyedVectors; model[key] is deprecated in gensim 3.x
    return vocab_size, embedding_size, embedding_matrix 
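The returned matrix is typically used to initialize a frozen embedding layer. A minimal sketch with tf.keras (the file name is a placeholder, and the Constant initializer is one common way to inject pretrained weights):

import tensorflow as tf

vocab_size, embedding_size, embedding_matrix = load_word2vec_matrix("word2vec.model")
embedding_layer = tf.keras.layers.Embedding(
    vocab_size, embedding_size,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)  # keep the pretrained vectors frozen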
Example #4
Source File: data_helpers.py    From Text-Pairs-Relation-Classification with Apache License 2.0
def load_data_and_labels(data_file, word2vec_file):
    """
    Load research data from files, split the data into words, and generate labels.
    Return the split sentences, labels, and the max sentence length of the research data.

    Args:
        data_file: The research data
        word2vec_file: The word2vec model file
    Returns:
        The class Data
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = word2vec.Word2Vec.load(word2vec_file)

    # Load data from files and split by words
    data = data_word2vec(input_file=data_file, word2vec_model=model)
    # plot_seq_len(data_file, data)

    return data 
Example #5
Source File: data_helpers.py    From Multi-Label-Text-Classification with Apache License 2.0
def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (used for the embedding visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")

    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]

    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n') 
Example #6
Source File: data_helpers.py    From Multi-Label-Text-Classification with Apache License 2.0
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model.wv[key]  # index via KeyedVectors; model[key] is deprecated in gensim 3.x
    return vocab_size, embedding_size, embedding_matrix 
Example #7
Source File: nlp_chinese.py    From simple_nlp_chinese with MIT License
def train_model(file_input, file_output):
    file_intermediate = os.path.join(
        os.path.dirname(file_input),
        os.path.splitext(file_input)[0])
    process_corpus_extraction(
        file_input, file_intermediate + '.extracted')
    process_chinese_filtering(
        file_intermediate + '.extracted',
        file_intermediate + '.filtered')
    process_chinese_transformation(
        file_intermediate + '.filtered',
        file_intermediate + '.transformed')
    # NOTE: this second call looks like a copy-paste slip; it presumably
    # should be a segmentation step (e.g. a hypothetical
    # process_chinese_segmentation helper), since it turns the
    # '.transformed' file into '.segmented'.
    process_chinese_transformation(
        file_intermediate + '.transformed',
        file_intermediate + '.segmented')
    # we can train for either word2vec or doc2vec
    # process_word_training(
    #     file_intermediate + '.segmented', file_output)
    process_doc_training(
        file_intermediate + '.segmented', file_output) 
Example #8
Source File: data_helpers.py    From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0
def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (used for the embedding visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")

    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]

    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n') 
Example #9
Source File: data_helpers.py    From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model.wv[key]  # index via KeyedVectors; model[key] is deprecated in gensim 3.x
    return vocab_size, embedding_size, embedding_matrix 
Example #10
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def load_model(self, save_model_name):
        """
        load model into the object(self.model)
        """
        self.model=word2vec.Word2Vec.load(save_model_name)
        self.len_vector=self.model.trainables.layer1_size
        try:
            self.renew_label_vec()
        except:
            self.safe_renew_label_vec() 
Example #11
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def safe_nlp_vector(self, words):
        """
        Parameters
            ----------
            words : list of str/str 
                wordbag
        Returns
            ----------
            ndarray(float)
                the corresponding vectors of words in wordbag.
                a vector contains the similarities calculated by word2vec and wordnet.
        """
        if isinstance(words, string_types):
            synonym=self.synonym_label(words)
            similarity=self.similarity_label(words)
        else:
            synonym=np.empty((len(self.Label_index),len(words)))
            similarity=np.empty((len(self.Label_index),len(words)))
            for i in range(len(words)):
                try:
                    synonym[:,i]=self.synonym_label(words[i])
                except:
                    synonym[:,i]=np.zeros((len(self.Label_index),1))[:,0]
                try:    
                    similarity[:,i]=self.similarity_label(words[i])[:,0]
                except:
                    similarity[:,i]=np.zeros((len(self.Label_index),1))[:,0]
        vector=np.concatenate((similarity, synonym))
        return vector 
Example #12
Source File: data_helpers.py    From Multi-Label-Text-Classification with Apache License 2.0
def load_data_and_labels(data_file, num_labels, word2vec_file, data_aug_flag):
    """
    Load research data from files, split the data into words, and generate labels.
    Return the split sentences, labels, and the max sentence length of the research data.

    Args:
        data_file: The research data
        num_labels: The number of classes
        word2vec_file: The word2vec model file
        data_aug_flag: The flag of data augmented
    Returns:
        The class _Data()
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = word2vec.Word2Vec.load(word2vec_file)

    # Load data from files and split by words
    data = data_word2vec(input_file=data_file, num_labels=num_labels, word2vec_model=model)
    if data_aug_flag:
        data = data_augmented(data)

    # plot_seq_len(data_file, data)

    return data 
Example #13
Source File: nlp_chinese.py    From simple_nlp_chinese with MIT License
def process_word_training(file_input, file_output):
    model = gensim.models.Word2Vec(
        gensim.models.word2vec.LineSentence(file_input),
        size=400, workers=multiprocessing.cpu_count())
    # trim unneeded model state to use (much) less RAM
    model.init_sims(replace=True)
    model.save(file_output) 
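A version note: the call above uses the gensim 3.x API. Under gensim 4.x, size was renamed vector_size and init_sims() was deprecated (normalized vectors are computed on demand), so a rough 4.x equivalent of this training step would be:

import multiprocessing
import gensim

model = gensim.models.Word2Vec(
    gensim.models.word2vec.LineSentence("corpus.txt"),  # placeholder corpus path
    vector_size=400,
    workers=multiprocessing.cpu_count())
model.save("word2vec_model")  # placeholder output path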
Example #14
Source File: data_helpers.py    From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0
def load_data_and_labels(data_file, num_classes_list, total_classes, word2vec_file, data_aug_flag):
    """
    Load research data from files, split the data into words, and generate labels.
    Return the split sentences, labels, and the max sentence length of the research data.

    Args:
        data_file: The research data
        num_classes_list: <list> The number of classes
        total_classes: The total number of classes
        word2vec_file: The word2vec file
        data_aug_flag: The flag of data augmented
    Returns:
        The class _Data()
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = word2vec.Word2Vec.load(word2vec_file)

    # Load data from files and split by words
    data = data_word2vec(data_file, num_classes_list, total_classes, word2vec_model=model)
    if data_aug_flag:
        data = data_augmented(data)

    # plot_seq_len(data_file, data)

    return data 
Example #15
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def train_Word2Vec(self, train_corpus, saveflag=False, save_model_name='NLP_model', Size=100, Min_count=5):#, show_process=True):
        """
        train the word2vec model with the processing file.
        Parameters
            ----------
            train_corpus : str/list of lists
                name(absolute path) of train_corpus.
                of a list of sentences(a sentence is a list of words).
            saveflag : bool
                save trained model locally?
            save_model_name : str
                the model name(absolute path)
                default: 'NLP_model'
            Size : int
                length of the word vector
            Min_count : int
                minimum frequence can a word record on dictionary.
        Returns
            Nothing
        """
        print('start training...')
        prev_time = datetime.datetime.now()  # current time
        
        self.len_vector=Size
        #if show_process==True:
        #    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)   
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)  
        if isinstance(train_corpus, string_types):
            sentences=self.txt2sentence(train_corpus)
        else:    
            sentences=train_corpus
        self.model = gensim.models.Word2Vec(sentences, size=Size, min_count=Min_count)  # word -> vector in R^Size
        if saveflag:
            self.save_model(save_model_name) # save model locally
        try:
            self.renew_label_vec()
        except:
            self.safe_renew_label_vec()
        
        cur_time = datetime.datetime.now()  # time after training
        h, remainder = divmod((cur_time - prev_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        print('done.')
        print("It costs %02d:%02d:%02d to train word2vec model." % (h, m, s))
        # model.wv.save_word2vec_format(save_model_name+".bin",binary=True) 
Example #16
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def show_Word2Vec(self, s, k=1, mode='topk'):
        """
        not often use now.
        Parameters
            ----------
            save_model_name : str
                the name of saved model
            s : str
            k : int/str
                if mode='similarity', it's a string.
                if mode='topk', it's a number, and defaultly 1.
            mode : str
                'similarity' : calculate the similarity between s and k, and note that k is a string.
                'topk' (default): find top k similar words of s, and note that k is a integer.
        Returns
            ----------
            float
                if mode='similarity', this is the similarity between s and k.
                if mode='return_topk', it'll not return a number but a iterator.
                if mode='topk', it'll print the most similar k words.
        """
        if self.model is None:
            raise Exception("no model")
            #model=word2vec.Word2Vec.load(save_model_name)
        if mode=='topk':
            y = self.model.wv.most_similar(s, topn=k)
            print('The words most related to "%s" are:\n' % s)
            for item in y:
                print(item[0], item[1])
        
        elif mode=='return_topk':
            return self.model.wv.most_similar(s,topn=k)
            #return model.most_similar(s,topn=k)
                
        elif mode=='similarity':
            y = self.model.wv.similarity(s, k)
            # cosine similarity: normalize the two vectors v1 and v2, then take their inner product.
            print('The similarity between "%s" and "%s" is: %f%%' % (s, k, y*100))
            return y
        
        elif mode=='vector':
            print(self.model.wv[s])  # index via KeyedVectors; model[s] is deprecated in gensim 3.x