Python gensim.models() Examples
The following are 30 code examples of gensim.models().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module gensim, or try the search function.
Example #1
Source File: data_helpers.py From Multi-Label-Text-Classification with Apache License 2.0 | 6 votes |
def create_metadata_file(word2vec_file, output_file):
    """
    Write the vocabulary of a saved word2vec model to a metadata file,
    one word per line, ordered by vocabulary index (used for the embedding
    visualization later).

    Args:
        word2vec_file: Path of the saved word2vec model.
        output_file: Path of the metadata file to write.
    Raises:
        IOError: If the word2vec model file doesn't exist.
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")

    w2v_model = gensim.models.Word2Vec.load(word2vec_file)
    index_of = {token: entry.index for token, entry in w2v_model.wv.vocab.items()}
    # Sort the (word, index) pairs by ascending index.
    ordered_pairs = sorted(index_of.items(), key=lambda pair: pair[1])

    with open(output_file, 'w+') as fout:
        for token, _ in ordered_pairs:
            if token is None:
                # A missing word still needs a placeholder row in the file.
                print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
                fout.write('<Empty Line>' + '\n')
                continue
            fout.write(token + '\n')
Example #2
Source File: build_models.py From smappPy with GNU General Public License v2.0 | 6 votes |
def online_lda(corpus, dictionary, k=25, alpha="symmetric", chunk_size=10000, update_every=1, passes=1):
    """
    Build the standard online LDA topic model (see gensim:
    http://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation).

    The corpus is split into chunks of 'chunk_size' documents, the model is
    updated every 'update_every' chunks, and 'passes' full passes are made
    over the corpus.

    Example: chunk_size=100, update_every=1, passes=1 on 500 documents does
    one pass over 5 chunks and updates the model after every chunk.

    Alpha values can be "symmetric", "asymmetric", and "auto".
    See: http://radimrehurek.com/gensim/models/ldamodel.html
    """
    lda_options = {
        "corpus": corpus,
        "id2word": dictionary,
        "num_topics": k,
        "alpha": alpha,
        "chunksize": chunk_size,
        "update_every": update_every,
        "passes": passes,
    }
    return models.ldamodel.LdaModel(**lda_options)
Example #3
Source File: build_models.py From smappPy with GNU General Public License v2.0 | 6 votes |
def batch_lda(corpus, dictionary, k=25, alpha="symmetric", passes=20):
    """
    Build a basic batch LDA topic model (see gensim:
    http://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation).

    Makes 'passes' passes over the whole corpus without chunked updates
    (update_every=0), so the model is updated at the end of every full pass.

    Alpha values can be "symmetric", "asymmetric", and "auto".
    See: http://radimrehurek.com/gensim/models/ldamodel.html
    """
    lda_options = {
        "corpus": corpus,
        "id2word": dictionary,
        "num_topics": k,
        "alpha": alpha,
        "update_every": 0,  # batch mode: no per-chunk updates
        "passes": passes,
    }
    return models.ldamodel.LdaModel(**lda_options)
Example #4
Source File: textAnalysis.py From deep_learning with MIT License | 6 votes |
def predictData():
    """
    Run the trained model on a few hard-coded sample sentences and print the
    predicted label next to each input text.
    """
    input_texts = ["很好很满意","不好不满意","质量有问题","商家态度很差","售后很渣,渣渣"]

    # Vectorise the raw sentences, then load the trained classifier.
    # (An earlier variant built the inputs from a saved Word2Vec model via
    # create_dictionaries; it is no longer used here.)
    texts = predict_wordtoVect(input_texts)
    model = get_model()

    # Predict
    pred_result = model.predict_classes(texts)
    print(pred_result)

    label2word = {1: '正面', 0: '负面'}
    labels = [int(round(row[0])) for row in pred_result]
    for position, label in enumerate(labels):
        print('{0} -------- {1}'.format(label2word[label], input_texts[position]))
Example #5
Source File: get_indices.py From NETL-Automatic-Topic-Labelling- with Apache License 2.0 | 6 votes |
def get_word(word):
    """
    Decide whether an underscore-separated title should be kept.

    Returns a ``(keep, word)`` tuple; ``word`` is always returned unchanged.
    A title without a trailing ``_(qualifier)`` is kept when it has fewer
    than five underscore-separated parts. A title with a qualifier is
    rejected when the qualifier is ``disambiguation``.
    """
    qualifier = re.search(r"_\(([A-Za-z0-9_]+)\)", word)
    if qualifier is None:
        keep = len(word.split("_")) < 5
    elif qualifier.group(1) == "disambiguation":
        keep = False
    else:
        stripped = re.sub(r'_\(.+\)', '', word)
        # NOTE(review): this splits on a space while the branch above splits
        # on "_"; for underscore-separated titles the check always passes.
        # Confirm whether "_" was intended.
        keep = len(stripped.split(" ")) < 5
    return keep, word
# Load the trained doc2vec and word2vec models.
Example #6
Source File: rock_gensim.py From MusicTaster with MIT License | 6 votes |
def train_artistsong2vec_model(fout_path, input_datas=None, data_path=None, min_count=5, sorted_vocab=1, window=10,
                               size=250, iter_n=50):
    """
    Train a Word2Vec model on flattened item sequences and pickle it to disk.

    Each input sequence is a list of items; fields 0 and 1 of every item are
    appended, in order, to one flat token list per sequence, and those token
    lists are the training sentences.

    Args:
        fout_path: Path the trained model is pickled to.
        input_datas: Iterable of sequences of indexable items (fields 0 and 1
            are used). If falsy, the data is loaded from ``data_path``.
        data_path: Path of a pickle file holding the input data; only read
            when ``input_datas`` is falsy.
        min_count, sorted_vocab, window, size: Forwarded to
            ``gensim.models.Word2Vec``.
        iter_n: Forwarded to ``gensim.models.Word2Vec`` as ``iter``.
    """
    if not input_datas and data_path:
        # NOTE: unpickling runs arbitrary code; only load trusted files.
        # The context manager closes the handle, which the bare
        # pickle.load(open(...)) call never did.
        with open(data_path, 'rb') as fin:
            input_datas = pickle.load(fin)

    full_data = []
    for sequence in input_datas:
        flattened = []
        for item in sequence:
            flattened.extend((item[0], item[1]))
        full_data.append(flattened)

    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(full_data, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print('model saved')
Example #7
Source File: predict_phrase.py From EARL with GNU General Public License v3.0 | 6 votes |
def predict_phrase(phrase):
    """
    Classify a phrase with the module-level ``model`` and return 'R' or 'E'.

    Every character is mapped to its id through the saved character
    dictionary, the id sequence is left-padded with zeros to length 270 and
    passed to ``model.predict`` as a single row.

    Args:
        phrase: The phrase to classify; used as-is, without cleaning.
    Returns:
        'R' if the highest-scoring output is at index 0, otherwise 'E'.
    Raises:
        KeyError: If a character is not in the character dictionary.
        ValueError: If the phrase is longer than 270 characters (the padding
            length would be negative).
    """
    # The .npy file stores a pickled dict inside a 0-d object array. NumPy
    # >= 1.16.3 refuses to load it unless allow_pickle=True is passed.
    # NOTE: unpickling runs arbitrary code; the file must be trusted.
    char_dict = np.load('EARL/models/char_dict.npy', allow_pickle=True).item()

    encoded = [char_dict[char] for char in phrase]
    padding = np.zeros(270 - len(encoded))
    prediction = model.predict(np.concatenate((padding, encoded)).reshape(1, 270))
    # Parenthesised form prints the same value on Python 2 and Python 3.
    print(prediction[0])
    pred = np.argmax(prediction[0])
    return 'R' if pred == 0 else 'E'
Example #8
Source File: data_helpers.py From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0 | 6 votes |
def create_metadata_file(word2vec_file, output_file):
    """
    Dump the vocabulary of a saved word2vec model into a metadata file,
    one word per line in ascending vocabulary-index order (used for the
    embedding visualization later).

    Args:
        word2vec_file: The word2vec model file.
        output_file: The metadata file path.
    Raises:
        IOError: If the word2vec model file doesn't exist.
    """
    model_exists = os.path.isfile(word2vec_file)
    if not model_exists:
        raise IOError("[Error] The word2vec file doesn't exist.")

    loaded = gensim.models.Word2Vec.load(word2vec_file)
    positions = {}
    for word, info in loaded.wv.vocab.items():
        positions[word] = info.index
    # Words ordered by their index in the model vocabulary.
    words_in_order = sorted(positions, key=positions.get)

    with open(output_file, 'w+') as fout:
        for word in words_in_order:
            if word is not None:
                fout.write(word + '\n')
            else:
                print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
                fout.write('<Empty Line>' + '\n')
Example #9
Source File: utils.py From mat2vec with MIT License | 6 votes |
def compute_epoch_accuracies(root, prefix, analogy_file):
    """
    Evaluate every per-epoch model checkpoint on an analogy file and save
    the per-section accuracies and the per-epoch losses.

    Checkpoints are the files matching ``<root>/<prefix>_epoch*.model``; the
    epoch number is the run of digits immediately before ``.model`` and is
    used as an index into the result lists.

    Args:
        root: Directory that holds the checkpoint files.
        prefix: Filename prefix of the checkpoints; also used to name the
            saved result objects.
        analogy_file: Analogy file passed to ``wv.accuracy``.

    Side effects:
        Saves, through ``save_obj``, ``models/<prefix>_accuracies`` (section
        name -> list of ``(correct, incorrect, accuracy)`` per epoch) and
        ``models/<prefix>_loss`` (first loss followed by the successive
        differences between epochs).
    """
    filenames = glob.glob(os.path.join(root, prefix + "_epoch*.model"))
    nr_epochs = len(filenames)
    accuracies = dict()
    losses = [0] * nr_epochs
    for filename in filenames:
        # Raw string avoids the invalid-escape warning of "\d".
        epoch = int(re.search(r"(\d+)\.model", filename).group(1))
        m = Word2Vec.load(filename)
        losses[epoch] = m.get_latest_training_loss()
        sections = m.wv.accuracy(analogy_file)
        for sec in sections:
            if sec["section"] not in accuracies:
                accuracies[sec["section"]] = [0] * nr_epochs
            correct, incorrect = len(sec["correct"]), len(sec["incorrect"])
            # Guard on the total, not on `incorrect`: a section answered
            # entirely correctly must score 1, not 0.
            total = correct + incorrect
            accuracy = correct / total if total > 0 else 0
            accuracies[sec["section"]][epoch] = (correct, incorrect, accuracy)
    save_obj(accuracies, os.path.join("models", prefix + "_accuracies"))
    save_obj(np.concatenate([np.array([losses[0]]), np.diff(losses)]), os.path.join("models", prefix + "_loss"))
Example #10
Source File: data_helpers.py From Multi-Label-Text-Classification with Apache License 2.0 | 6 votes |
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        A ``(vocab_size, embedding_size, embedding_matrix)`` tuple, where
        row ``i`` of ``embedding_matrix`` is the vector of the word whose
        vocabulary index is ``i``. Rows of ``None`` keys stay all-zero.
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    keyed_vectors = model.wv

    vocab_size = keyed_vectors.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = {word: entry.index for word, entry in keyed_vectors.vocab.items()}
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            # Look vectors up through `wv`: indexing the Word2Vec object
            # itself (`model[key]`) is deprecated.
            embedding_matrix[value] = keyed_vectors[key]
    return vocab_size, embedding_size, embedding_matrix
Example #11
Source File: data_helpers.py From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0 | 6 votes |
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        A ``(vocab_size, embedding_size, embedding_matrix)`` tuple, where
        row ``i`` of ``embedding_matrix`` is the vector of the word whose
        vocabulary index is ``i``. Rows of ``None`` keys stay all-zero.
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    keyed_vectors = model.wv

    vocab_size = keyed_vectors.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = {word: entry.index for word, entry in keyed_vectors.vocab.items()}
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            # Look vectors up through `wv`: indexing the Word2Vec object
            # itself (`model[key]`) is deprecated.
            embedding_matrix[value] = keyed_vectors[key]
    return vocab_size, embedding_size, embedding_matrix
Example #12
Source File: data_helpers.py From Text-Pairs-Relation-Classification with Apache License 2.0 | 6 votes |
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        A ``(vocab_size, embedding_size, embedding_matrix)`` tuple, where
        row ``i`` of ``embedding_matrix`` is the vector of the word whose
        vocabulary index is ``i``. Rows of ``None`` keys stay all-zero.
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    keyed_vectors = model.wv

    vocab_size = keyed_vectors.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = {word: entry.index for word, entry in keyed_vectors.vocab.items()}
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            # Look vectors up through `wv`: indexing the Word2Vec object
            # itself (`model[key]`) is deprecated.
            embedding_matrix[value] = keyed_vectors[key]
    return vocab_size, embedding_size, embedding_matrix
Example #13
Source File: data_helpers.py From Text-Pairs-Relation-Classification with Apache License 2.0 | 6 votes |
def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file for a saved word2vec model: one vocabulary
    word per line, sorted by ascending vocabulary index (used for the
    embedding visualization later).

    Args:
        word2vec_file: The word2vec model file.
        output_file: The metadata file path.
    Raises:
        IOError: If the word2vec model file doesn't exist.
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")

    vocabulary = gensim.models.Word2Vec.load(word2vec_file).wv.vocab
    word_to_index = {word: record.index for word, record in vocabulary.items()}
    by_index = sorted(word_to_index.items(), key=lambda item: item[1])

    with open(output_file, 'w+') as fout:
        for word, _index in by_index:
            if word is None:
                # Keep a placeholder row so the file still has one line per word.
                print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word + '\n')
Example #14
Source File: wordembed.py From PyShortTextCategorization with MIT License | 6 votes |
def shorttext_to_avgvec(shorttext, wvmodel):
    """ Convert the short text into an averaged embedded vector representation.

    Given a short sentence, it converts all the tokens into embedded vectors
    according to the given word-embedding model, sums them up, and normalizes
    the resulting vector. Tokens that are not in the model are skipped. If no
    token is in the model, a zero vector of length `wvmodel.vector_size` is
    returned.

    :param shorttext: a short sentence
    :param wvmodel: word-embedding model
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :rtype: numpy.ndarray
    """
    token_vectors = [wvmodel[token] for token in tokenize(shorttext) if token in wvmodel]
    if not token_vectors:
        # np.sum([], axis=0) collapses to the scalar 0.0; return a proper
        # zero vector so callers always get the documented ndarray.
        return np.zeros(wvmodel.vector_size)

    vec = np.sum(token_vectors, axis=0)

    # normalize
    norm = np.linalg.norm(vec)
    if norm != 0:
        vec /= norm

    return vec
Example #15
Source File: main.py From nonce2vec with MIT License | 5 votes |
def _train(args):
    """
    Train a gensim word2vec model on the 'wiki' samples and save it.

    Reads from ``args``: datadir, outputdir, train_mode, alpha, neg, window,
    sample, epochs, min_count, size and num_threads. ``train_mode`` selects
    the architecture ('cbow' sets sg=0, 'skipgram' sets sg=1).

    Raises:
        Exception: If ``args.train_mode`` is falsy.
    """
    logger.info('Training word2vec model with gensim')
    sentences = Samples(source='wiki', shuffle=False, input_data=args.datadir)
    if not args.train_mode:
        raise Exception('Unspecified train mode')

    output_model_filepath = futils.get_model_path(
        args.datadir, args.outputdir, args.train_mode, args.alpha, args.neg,
        args.window, args.sample, args.epochs, args.min_count, args.size)
    logger.info('Saving output w2v model to {}'.format(output_model_filepath))

    hyperparams = dict(
        min_count=args.min_count,
        alpha=args.alpha,
        negative=args.neg,
        window=args.window,
        sample=args.sample,
        iter=args.epochs,
        size=args.size,
        workers=args.num_threads,
    )
    model = gensim.models.Word2Vec(**hyperparams)

    # Map the requested training mode onto gensim's `sg` flag.
    for mode_name, sg_flag in (('cbow', 0), ('skipgram', 1)):
        if args.train_mode == mode_name:
            model.sg = sg_flag

    logger.info('Building vocabulary...')
    model.build_vocab(sentences)
    logger.info('Training model...')
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    logger.info('Training complete. Saving model...')
    model.save(output_model_filepath)
    logger.info('Done.')
Example #16
Source File: lsi_model.py From aca with MIT License | 5 votes |
def create_lsi_model(num_topics,dictionary,corpus):
    """
    Build a TF-IDF model, an LSI model on top of it, and a similarity index
    over the LSI-projected corpus.

    Returns:
        A ``(tfidf_model, lsi_model, corpus_simi_matrix)`` tuple.
    """
    print ("create lsi model ...")
    tfidf = models.TfidfModel(corpus)
    weighted_corpus = tfidf[corpus]
    # LSI is fitted on the TF-IDF weighted corpus, not on the raw corpus.
    lsi = models.LsiModel(weighted_corpus, id2word=dictionary, num_topics=num_topics)
    projected_corpus = lsi[weighted_corpus]
    # The similarity index is built over the LSI projection.
    similarity_index = similarities.MatrixSimilarity(projected_corpus)
    return (tfidf, lsi, similarity_index)
Example #17
Source File: gensim_nlpir.py From nlp_learning with MIT License | 5 votes |
def mode_training():
    """
    Train a word2vec model on the pre-segmented corpus and save it to
    ``models/xue.model``.
    """
    # Pre-segmented (tokenised) text. A directory reader such as
    # MySentences('/some/directory') could be used instead.
    sentences = word2vec.Text8Corpus('data/xuezhong_seg_1.txt')

    # NOTE(review): the original comment described `size` as the number of
    # network layers; per the gensim docs it is the dimensionality of the
    # word vectors.
    # `workers` is the number of training threads; it only takes effect when
    # Cython is installed.
    training_options = dict(min_count=20, size=4000, window=10, workers=4)
    model = word2vec.Word2Vec(sentences, **training_options)

    # Once trained, the model can be queried, e.g.
    #   model.wv.similarity(word_a, word_b)   # similarity of two words
    #   model.wv.most_similar(word, topn=20)  # 20 most related words

    # Save the model so it can be reused.
    model.save(u"models/xue.model")
    print("training finished")
Example #18
Source File: lsi_author.py From aca with MIT License | 5 votes |
def create_lsi_model(num_topics,dictionary,corpus):
    """
    Fit TF-IDF and LSI models on ``corpus`` and index the LSI-projected
    corpus for similarity queries.

    Returns:
        A ``(tfidf_model, lsi_model, corpus_simi_matrix)`` tuple.
    """
    print ("create lsi model ...")
    tfidf = models.TfidfModel(corpus)
    weighted_corpus = tfidf[corpus]
    lsi = models.LsiModel(weighted_corpus, id2word=dictionary, num_topics=num_topics)
    # Project the weighted corpus into LSI space and index the projection.
    similarity_index = similarities.MatrixSimilarity(lsi[weighted_corpus])
    return (tfidf, lsi, similarity_index)
Example #19
Source File: lsi_neighbor.py From aca with MIT License | 5 votes |
def create_lsi_model(num_topics,dictionary,corpus):
    """
    Create the TF-IDF model, the LSI model trained on the TF-IDF weighted
    corpus, and a MatrixSimilarity index over the LSI representation.

    Returns:
        A ``(tfidf_model, lsi_model, corpus_simi_matrix)`` tuple.
    """
    print ("create lsi model ...")
    tfidf = models.TfidfModel(corpus)
    tfidf_corpus = tfidf[corpus]
    lsi = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=num_topics)
    lsi_corpus = lsi[tfidf_corpus]
    index = similarities.MatrixSimilarity(lsi_corpus)
    return (tfidf, lsi, index)
Example #20
Source File: similarity.py From bugbug with Mozilla Public License 2.0 | 5 votes |
def __init__(
    self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
):
    """
    Build an LSI similarity index over the text of all bugs returned by
    ``bugzilla.get_bugs()``.

    The three keyword arguments are forwarded unchanged to the parent
    initializer. After construction the instance exposes ``corpus``
    (``[bug_id, tokens]`` pairs), ``dictionary``, ``lsi`` and ``index``.
    """
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    # Dimensionality of the LSI space; shared by the model and the index.
    lsi_dimensions = 300

    self.corpus = []
    for bug in bugzilla.get_bugs():
        tokens = self.text_preprocess(self.get_text(bug))
        self.corpus.append([bug["id"], tokens])

    # Assign a unique integer id to every word.
    self.dictionary = Dictionary(entry[1] for entry in self.corpus)

    # Bag-of-words representation of every bug.
    bow_corpus = [self.dictionary.doc2bow(entry[1]) for entry in self.corpus]

    # Fit the TF-IDF transformation on the corpus and apply it to the same
    # corpus.
    tfidf = models.TfidfModel(bow_corpus)
    tfidf_corpus = tfidf[bow_corpus]

    # Project the TF-IDF corpus into the latent space via LSI.
    self.lsi = models.LsiModel(
        tfidf_corpus, id2word=self.dictionary, num_topics=lsi_dimensions
    )
    lsi_corpus = self.lsi[tfidf_corpus]

    # Index the projected corpus for similarity queries.
    self.index = similarities.Similarity(
        output_prefix="simdata.shdat",
        corpus=lsi_corpus,
        num_features=lsi_dimensions,
    )
Example #21
Source File: textAnalysis.py From deep_learning with MIT License | 5 votes |
def get_model():
    """
    Load and return the full saved model from
    ``./models/text_lstm_full.h5``.
    """
    # An earlier variant rebuilt the network from ./models/text_lstm.yaml,
    # loaded the weights from ./models/text_lstm.h5 and re-compiled it; the
    # single-file load below replaces that.
    return load_model("./models/text_lstm_full.h5")
Example #22
Source File: nlp_word2vec.py From resilient-community-apps with MIT License | 5 votes |
def __init__(self, model_name=None, log=None):
    """
    Set up an empty, untrained word2vec wrapper.

    :param model_name: name used to save the model into a file
    :param log: logger to use; falls back to this module's logger
    """
    # Falls back to the module logger when no (truthy) logger is supplied.
    self.log = log or logging.getLogger(__name__)

    # Model name. This is used to save the model into a file.
    self.model_name = model_name

    # The word2vec model (a gensim.models.Word2Vec); not built yet.
    self.word2vec = None

    # Dataset used to train the word2vec model: a list of lists of words,
    # e.g. [[security, incident], [window, mac, linux], ...].
    # Data shall be preprocessed by the subclass.
    self.dataset = []

    self.feature_size = 0
Example #23
Source File: nlp_word2vec.py From resilient-community-apps with MIT License | 5 votes |
def build_model(self):
    """
    Load and preprocess the data, detect bigram phrases, then train a
    Word2Vec model and store it in ``self.word2vec``.

    All hyper-parameters come from the ``NLPSettings`` singleton;
    ``self.feature_size`` is updated from it as well.

    :return: None
    """
    # Call the template methods to load and preprocess data.
    self.load_data()
    self.preprocess_data()

    # Get the settings for NLP.
    nlp_settings = NLPSettings.get_instance()
    self.feature_size = nlp_settings.w2v_feature_size()

    bigram = gensim.models.phrases.Phrases(self.dataset,
                                           min_count=nlp_settings.bigram_min_count(),
                                           threshold=nlp_settings.bigram_threshold())
    bigram = gensim.models.phrases.Phraser(bigram)
    tokenized_corpus = bigram[self.dataset]

    # Leave one core free, but never drop to zero workers: on a single-core
    # host `cpu_count() - 1` is 0, which leaves no training thread.
    worker_count = max(1, multiprocessing.cpu_count() - 1)

    word2vec = Word2Vec(size=self.feature_size,
                        window=nlp_settings.w2v_window(),
                        min_count=nlp_settings.w2v_min_count(),
                        sample=nlp_settings.w2v_sample(),
                        alpha=nlp_settings.w2v_alpha(),
                        min_alpha=nlp_settings.w2v_min_alpha(),
                        negative=nlp_settings.w2v_negative(),
                        workers=worker_count)

    # Build vocabulary
    word2vec.build_vocab(tokenized_corpus,
                         progress_per=nlp_settings.w2v_progress_per())

    # Train
    word2vec.train(tokenized_corpus,
                   total_examples=word2vec.corpus_count,
                   epochs=nlp_settings.w2v_epochs(),
                   report_delay=nlp_settings.w2v_report_delay())

    self.word2vec = word2vec
Example #24
Source File: textAnalysis.py From deep_learning with MIT License | 5 votes |
def train_model(input_dim,x_train, y_train, x_test, y_test):
    """
    Build, train and evaluate the LSTM binary classifier, then save its
    architecture (YAML) and weights under ``./models``.

    The embedding size, sequence length and batch size come from the
    module-level EMBEDDING_DIM, MAX_SEQUENCE_LENGTH and batch_size.
    """
    print(input_dim)
    print('设计模型 Model...')
    layers = (
        Embedding(input_dim,EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        LSTM(256, activation="relu"),
        Dropout(0.3),
        Dense(512,activation='relu'),
        Dropout(0.5),
        Dense(256,activation='relu'),
        Dropout(0.5),
        Dense(1,activation="sigmoid"),
    )
    model = Sequential()
    for layer in layers:
        model.add(layer)

    print('编译模型...')
    # Optimise with Adam.
    optimizer = Adam(lr=0.0003)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    tensorboard_callback = callbacks.TensorBoard(
        log_dir='./logs', histogram_freq=0, write_graph=True, write_images=True)

    print("训练...")
    model.fit(x_train, y_train, batch_size=batch_size, epochs=3,verbose=1,
              validation_data=(x_test, y_test),callbacks=[tensorboard_callback])

    # Evaluate on the held-out data.
    score, accuracy = model.evaluate(x_test, y_test, batch_size=batch_size)
    print('\nTest score:', score)
    print('Test accuracy:', accuracy)

    # Persist the architecture and the weights separately.
    architecture_yaml = model.to_yaml()
    with open('./models/text_lstm.yaml', 'w') as outfile:
        outfile.write(architecture_yaml)
    model.save_weights('./models/text_lstm.h5')
Example #25
Source File: textAnalysis.py From deep_learning with MIT License | 5 votes |
def word2vec_train(text):
    """
    Train a Word2Vec model on ``text``, save it to
    ``./models/Word2vec_model.model`` and return the result of
    ``create_dictionaries`` for that model and text as a 3-tuple.
    """
    w2v_options = dict(
        size=EMBEDDING_DIM,
        min_count=10,
        window=window_size,
        workers=cpu_count,
        iter=1,
    )
    model = word2vec.Word2Vec(**w2v_options)
    model.build_vocab(text)
    model.train(text, total_examples=model.corpus_count, epochs=model.iter)
    model.save('./models/Word2vec_model.model')

    vocab_index, vectors, converted_text = create_dictionaries(model=model, combined=text)
    return vocab_index, vectors, converted_text
# Define the network structure
Example #26
Source File: recommender_context_local.py From word2vec-recommender with MIT License | 5 votes |
def train(model_file):
    """
    Train a Word2Vec model on the contexts of the module-level ``data_obj``
    and save it to ``model_file``.
    """
    contexts = ContextCorpus(data_obj)

    # Active configuration (a1). Other configurations that were tried, all
    # with min_count=5, workers=4, size=300, window=5:
    #   a2: negative=3,  sg=0, sample=1e-5, hs=0
    #   a3: negative=5,  sg=0, sample=1e-3, hs=1
    #   a4: negative=10, sg=1, sample=1e-3, hs=0
    #   a5: negative=10, sg=1, sample=1e-5, hs=0
    #   a6: negative=3,  sg=0, sample=1e-4, hs=1
    # Reference command line of the C tool:
    #   ./word2vec -train train100B.txt -read-vocab voc -output vectors.bin -cbow 1 -size 300 -window 5 -negative 3 -hs 0 -sample 1e-5 -threads 12 -binary 1 -min-count 10
    a1_options = dict(
        min_count=5,
        workers=multiprocessing.cpu_count(),
        negative=3,
        sg=1,
        size=300,
        sample=1e-3,
        hs=1,
        window=5,
    )
    model = gensim.models.Word2Vec(contexts, **a1_options)

    model.init_sims(replace=True)
    model.save(model_file)
Example #27
Source File: recommender_context.py From word2vec-recommender with MIT License | 5 votes |
def train(model_file):
    """
    Train a Word2Vec model on the contexts of the module-level ``data_obj``
    and save it to ``model_file``.
    """
    contexts = ContextCorpus(data_obj)

    # Active configuration (a6). Other configurations that were tried, all
    # with min_count=5, workers=4, size=300, window=5:
    #   a1: negative=10, sg=1, sample=1e-3, hs=1
    #   a2: negative=3,  sg=0, sample=1e-5, hs=0
    #   a3: negative=5,  sg=0, sample=1e-3, hs=1
    #   a4: negative=10, sg=1, sample=1e-3, hs=0
    #   a5: negative=10, sg=1, sample=1e-5, hs=0
    # Reference command line of the C tool:
    #   ./word2vec -train train100B.txt -read-vocab voc -output vectors.bin -cbow 1 -size 300 -window 5 -negative 3 -hs 0 -sample 1e-5 -threads 12 -binary 1 -min-count 10
    a6_options = dict(
        min_count=5,
        workers=4,
        negative=3,
        sg=0,
        size=300,
        sample=1e-4,
        hs=1,
        window=5,
    )
    model = gensim.models.Word2Vec(contexts, **a6_options)

    model.init_sims(replace=True)
    model.save(model_file)
Example #28
Source File: sample_size_NN.py From robotreviewer with GNU General Public License v3.0 | 5 votes |
def load_trained_w2v_model(path):
    """
    Load word vectors stored in the binary word2vec format at ``path`` and
    return them as gensim KeyedVectors.
    """
    # KeyedVectors.load_word2vec_format replaces the older
    # Word2Vec.load_word2vec_format(path, binary=True) call.
    return gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
Example #29
Source File: document2vec.py From Document2Vec with MIT License | 5 votes |
def _expand_from(self, corpus, prefix=None, labels=None):
    """
    Pass through the dataset once to add the new labels to the model.
    These labels stand in one for each document/sentence and not
    for new vocabulary.

    One row is appended to ``self.syn0`` and one entry to ``self.vocab`` and
    ``self.index2word`` per line of ``corpus``. ``labels`` is accepted but
    not used here.

    Returns the length of ``self.index2word`` before the new labels were
    appended.
    """
    if prefix is None:
        prefix = 'SENT'
    num_lines = sum(1 for _ in corpus)

    # Expand syn0: new rows are random values in [-0.5, 0.5) scaled by
    # 1 / layer1_size; the existing rows are copied over unchanged.
    old_rows, width = self.syn0.shape[0], self.syn0.shape[1]
    expanded = (np.random.random((old_rows + num_lines, width)).astype(self.syn0.dtype) - 0.5)
    expanded /= self.layer1_size
    expanded[:old_rows] = self.syn0
    self.syn0 = expanded

    index2word_start = len(self.index2word)
    for line_no in range(num_lines):
        # Expand vocab
        entry = gensim.models.doc2vec.Vocab()
        entry.index = len(self.index2word)
        entry.sample_probability = 1.0
        # We insert each sentence at the root of the
        # Huffman tree. It's a hack.
        entry.code = [1, ] * int(math.log(line_no + 1, 2) + 1)
        label = Document2Vec._make_label(prefix, str(line_no))
        self.vocab[label] = entry
        # Expand index2word
        self.index2word.append(label)
        assert len(self.index2word) == entry.index + 1
    return index2word_start
Example #30
Source File: wordembed.py From PyShortTextCategorization with MIT License | 5 votes |
def load_word2vec_model(path, binary=True):
    """ Load a pre-trained Word2Vec model.

    Thin wrapper around `KeyedVectors.load_word2vec_format`.

    :param path: path of the file of the pre-trained Word2Vec model
    :param binary: whether the file is in binary format (Default: True)
    :return: a pre-trained Word2Vec model
    :type path: str
    :type binary: bool
    :rtype: gensim.models.keyedvectors.KeyedVectors
    """
    keyed_vectors = KeyedVectors.load_word2vec_format(path, binary=binary)
    return keyed_vectors