Python gensim.models Examples

The following are 30 code examples of the gensim.models module. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the gensim module, or try the search function.
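Before the individual examples, here is a minimal, self-contained sketch of the typical gensim.models workflow that most of the snippets below build on. It assumes the pre-4.0 gensim API (keyword names such as size and iter, and the model.wv.vocab attribute) that these examples use; the toy corpus and the file name are placeholders.

import gensim

# Toy corpus: a list of tokenized sentences (placeholder data).
sentences = [["human", "interface", "computer"],
             ["graph", "trees", "minors"]]

# Train a small Word2Vec model (gensim < 4.0 keyword names: size, iter).
model = gensim.models.Word2Vec(sentences, size=50, window=5, min_count=1, iter=10)

# Save and reload the model, as many of the examples below do.
model.save("word2vec.model")
model = gensim.models.Word2Vec.load("word2vec.model")

# Query the trained vectors.
print(model.wv.most_similar("graph", topn=3))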
Example #1
Source File: data_helpers.py    From Multi-Label-Text-Classification with Apache License 2.0
def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (Used for the Embedding Visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")

    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]

    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n') 
Example #2
Source File: build_models.py    From smappPy with GNU General Public License v2.0
def online_lda(corpus, dictionary, k=25, alpha="symmetric", chunk_size=10000, update_every=1, passes=1):
	"""
	Build the standard online LDA topic model (see gensim:
	http://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation)
	
	Updates the model every 'update_every' chunks, does 'passes' full passes over the corpus
	(updating every 'update_every' chunks within each pass), and breaks the corpus into
	'chunk_size'-document chunks.

	E.g. chunk_size=100, update_every=1, passes=1: does one full pass over the corpus, updating the
	model after every chunk and breaking the whole corpus into corpus_size/chunk_size chunks.

	500 documents => 5 chunks, updates model on every chunk.

	Alpha values can be "symmetric", "asymmetric", and "auto". See: 
	http://radimrehurek.com/gensim/models/ldamodel.html
	"""
	return models.ldamodel.LdaModel(corpus=corpus,
									id2word=dictionary,
									num_topics=k,
									alpha=alpha,
									chunksize=chunk_size,
									update_every=update_every,
									passes=passes) 
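For orientation, a hypothetical call to online_lda could look like the sketch below; the documents are placeholder data, and the dictionary/corpus construction follows the usual gensim workflow (gensim.corpora.Dictionary plus doc2bow).

from gensim import corpora

# Placeholder tokenized documents.
documents = [["topic", "model", "lda"],
             ["gensim", "corpus", "dictionary"],
             ["online", "lda", "chunks"]]

dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Build an online LDA model with 5 topics, updating after every one-document chunk.
lda = online_lda(corpus, dictionary, k=5, chunk_size=1, update_every=1, passes=2)
print(lda.print_topics(num_topics=3))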
Example #3
Source File: build_models.py    From smappPy with GNU General Public License v2.0
def batch_lda(corpus, dictionary, k=25, alpha="symmetric", passes=20):
	"""
	Build basic batch LDA topic model (see gensim:
	http://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation)

	Does 'passes' number of passes over the whole corpus, no chunking, and updates the model
	at the end of every full pass.

	Alpha values can be "symmetric", "asymmetric", and "auto". See: 
	http://radimrehurek.com/gensim/models/ldamodel.html
	"""
	return models.ldamodel.LdaModel(corpus=corpus,
								   id2word=dictionary,
								   num_topics=k,
								   alpha=alpha,
								   update_every=0,
								   passes=passes) 
Example #4
Source File: textAnalysis.py    From deep_learning with MIT License
def predictData():
    """
    Predict on real data using the trained model

    """
    input_texts = ["很好很满意","不好不满意","质量有问题","商家态度很差","售后很渣,渣渣"]

    # word_model = word2vec.Word2Vec.load('./models/Word2vec_model.model')
    # w2indx, w2vec, texts = create_dictionaries(word_model, texts)
    # print(texts)

    texts = predict_wordtoVect(input_texts)

    model = get_model()
    # # Predict
    pred_result = model.predict_classes(texts)
    print(pred_result)
    labels = [int(round(x[0])) for x in pred_result]
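    # Label mapping: 1 = positive (正面), 0 = negative (负面)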
    label2word = {1: '正面', 0: '负面'}
    for i in range(len(pred_result)):
        print('{0} -------- {1}'.format(label2word[labels[i]], input_texts[i])) 
Example #5
Source File: get_indices.py    From NETL-Automatic-Topic-Labelling- with Apache License 2.0
def get_word(word):
    inst = re.search(r"_\(([A-Za-z0-9_]+)\)", word)

    if inst is None:
        length = len(word.split("_"))
        if length < 5:
            return True, word
    else:
        if inst.group(1) != "disambiguation":
            word2 = re.sub(r'_\(.+\)', '', word)
            if len(word2.split(" ")) < 5:
                return True, word

    return False,word

# Load the trained doc2vec and word2vec models. 
Example #6
Source File: rock_gensim.py    From MusicTaster with MIT License
def train_artistsong2vec_model(fout_path, input_datas=None, data_path=None,
                               min_count=5, sorted_vocab=1, window=10,
                               size=250,
                               iter_n=50):
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    full_data = []
    for i in input_datas:
        tmp = []
        for j in i:
            tmp.append(j[0])
            tmp.append(j[1])
        full_data.append(tmp)
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(full_data, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved' 
Example #7
Source File: predict_phrase.py    From EARL with GNU General Public License v3.0
def predict_phrase(phrase):
   #load the model
   #preprocess the phrase
   
   #phrase_clean = clean_str(phrase)
   phrase_clean = phrase 
   #load the dictionary
   char_dict = np.load('EARL/models/char_dict.npy').item()
   #phrase_clean = [char for char in phrase_clean]
   #print phrase_clean
   
   phrase_clean = [char_dict[char] for char in phrase_clean]

   #print phrase_clean
   
   #print np.concatenate((np.zeros(max_len-len(phrase_clean)), phrase_clean) )
   prediction = model.predict(np.concatenate((np.zeros((270-len(phrase_clean))), phrase_clean)).reshape(1,270))

   print prediction[0]
   
   pred = np.argmax(prediction[0])

   return 'R' if pred == 0 else 'E' 
Example #8
Source File: data_helpers.py    From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0
def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (Used for the Embedding Visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")

    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]

    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n') 
Example #9
Source File: utils.py    From mat2vec with MIT License
def compute_epoch_accuracies(root, prefix, analogy_file):
    filenames = glob.glob(os.path.join(root, prefix+"_epoch*.model"))
    nr_epochs = len(filenames)
    accuracies = dict()
    losses = [0] * nr_epochs
    for filename in filenames:
        epoch = int(re.search(r"\d+\.model", filename).group()[:-6])
        m = Word2Vec.load(filename)
        losses[epoch] = m.get_latest_training_loss()
        sections = m.wv.accuracy(analogy_file)
        for sec in sections:
            if sec["section"] not in accuracies:
                accuracies[sec["section"]] = [0] * nr_epochs
            correct, incorrect = len(sec["correct"]), len(sec["incorrect"])
            if incorrect > 0:
                accuracy = correct / (correct + incorrect)
            else:
                accuracy = 0
            accuracies[sec["section"]][epoch] = (correct, incorrect, accuracy)
        save_obj(accuracies, os.path.join("models", prefix + "_accuracies"))
        save_obj(np.concatenate([np.array([losses[0]]), np.diff(losses)]), os.path.join("models", prefix + "_loss")) 
Example #10
Source File: data_helpers.py    From Multi-Label-Text-Classification with Apache License 2.0
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model[key]
    return vocab_size, embedding_size, embedding_matrix 
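The returned matrix is typically used to seed an embedding layer. Below is a hedged sketch of that step, hypothetical and not part of the original file, using the Keras Embedding layer that other examples on this page also rely on; the model file name is a placeholder.

from keras.layers import Embedding

vocab_size, embedding_size, embedding_matrix = load_word2vec_matrix("word2vec.model")

# Initialize a frozen embedding layer with the pre-trained word vectors.
embedding_layer = Embedding(vocab_size, embedding_size,
                            weights=[embedding_matrix],
                            trainable=False)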
Example #11
Source File: data_helpers.py    From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model[key]
    return vocab_size, embedding_size, embedding_matrix 
Example #12
Source File: data_helpers.py    From Text-Pairs-Relation-Classification with Apache License 2.0
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model[key]
    return vocab_size, embedding_size, embedding_matrix 
Example #13
Source File: data_helpers.py    From Text-Pairs-Relation-Classification with Apache License 2.0
def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (Used for the Embedding Visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")

    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]

    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n') 
Example #14
Source File: wordembed.py    From PyShortTextCategorization with MIT License
def shorttext_to_avgvec(shorttext, wvmodel):
    """ Convert the short text into an averaged embedded vector representation.

    Given a short sentence, it converts all the tokens into embedded vectors according to
    the given word-embedding model, sums
    them up, and normalizes the resulting vector. It returns the resulting vector
    that represents this short sentence.

    :param shorttext: a short sentence
    :param wvmodel: word-embedding model
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :rtype: numpy.ndarray
    """
    vec = np.sum([wvmodel[token] for token in tokenize(shorttext) if token in wvmodel], axis=0)

    # normalize
    norm = np.linalg.norm(vec)
    if norm != 0:
        vec /= norm

    return vec 
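A hypothetical usage sketch follows; the binary vector file name is only an example of a commonly distributed pre-trained model, and tokenize is assumed to be the package's own tokenizer imported in the same module.

from gensim.models import KeyedVectors

wvmodel = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
vec = shorttext_to_avgvec("machine learning is fun", wvmodel)
print(vec.shape)   # e.g. (300,) for 300-dimensional embeddings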
Example #15
Source File: main.py    From nonce2vec with MIT License
def _train(args):
    logger.info('Training word2vec model with gensim')
    sentences = Samples(source='wiki', shuffle=False, input_data=args.datadir)
    if not args.train_mode:
        raise Exception('Unspecified train mode')
    output_model_filepath = futils.get_model_path(args.datadir, args.outputdir,
                                                  args.train_mode,
                                                  args.alpha, args.neg,
                                                  args.window, args.sample,
                                                  args.epochs,
                                                  args.min_count, args.size)
    logger.info('Saving output w2v model to {}'.format(output_model_filepath))
    model = gensim.models.Word2Vec(
        min_count=args.min_count, alpha=args.alpha, negative=args.neg,
        window=args.window, sample=args.sample, iter=args.epochs,
        size=args.size, workers=args.num_threads)
    if args.train_mode == 'cbow':
        model.sg = 0
    if args.train_mode == 'skipgram':
        model.sg = 1
    logger.info('Building vocabulary...')
    model.build_vocab(sentences)
    logger.info('Training model...')
    model.train(sentences, total_examples=model.corpus_count,
                epochs=model.epochs)
    logger.info('Training complete. Saving model...')
    model.save(output_model_filepath)
    logger.info('Done.') 
Example #16
Source File: lsi_model.py    From aca with MIT License
def create_lsi_model(num_topics,dictionary,corpus):

    print ("create lsi model ...")
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf,id2word=dictionary,num_topics = num_topics)
    #lsi_model = models.LsiModel(corpus,id2word=dictionary,num_topics = num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    #corpus_lsi = lsi_model[corpus]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    #corpus_simi_matrix = similarities.MatrixSimilarity(corpus_tfidf)
    return (tfidf_model,lsi_model,corpus_simi_matrix) 
Example #17
Source File: gensim_nlpir.py    From nlp_learning with MIT License
def mode_training():
    """
    Train the word2vec model
    """
    # Read the files under a directory
    # sentences = MySentences('/some/directory')
    # Pre-segmented (tokenized) corpus
    sentences = word2vec.Text8Corpus('data/xuezhong_seg_1.txt')
    # Train: the size parameter sets the dimensionality of the word vectors
    # The workers parameter sets the number of threads for parallel training (only effective when Cython is installed)
    model = word2vec.Word2Vec(
        sentences, min_count=20, size=4000, window=10, workers=4)


    # model.sort_vocab()

    # Compute the similarity / relatedness of two words
    # simil_1 = model.wv.similarity(u"王仙芝", u"老怪物")
    # simil_2 = model.wv.similarity(u"徐凤年", u"殿下")
    # print("【王仙芝】和【老怪物】相似度:", simil_1)
    # print("【徐凤年】和【世子】相似度:", simil_2)

    # # Compute the list of words most related to a given word
    # lar = model.wv.most_similar(u"徐凤年", topn=20)  # 20个最相关的
    # print("【徐凤年】相关性:", lar)

    # Save the model for reuse
    model.save(u"models/xue.model")
    print("training finished") 
Example #18
Source File: lsi_author.py    From aca with MIT License
def create_lsi_model(num_topics,dictionary,corpus):
    print ("create lsi model ...")

    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf,id2word=dictionary,num_topics = num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    return (tfidf_model,lsi_model,corpus_simi_matrix) 
Example #19
Source File: lsi_neighbor.py    From aca with MIT License
def create_lsi_model(num_topics,dictionary,corpus):
    print ("create lsi model ...")

    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf,id2word=dictionary,num_topics = num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    return (tfidf_model,lsi_model,corpus_simi_matrix) 
Example #20
Source File: similarity.py    From bugbug with Mozilla Public License 2.0
def __init__(
        self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
    ):
        super().__init__(
            cleanup_urls=cleanup_urls,
            nltk_tokenizer=nltk_tokenizer,
            confidence_threshold=confidence_threshold,
        )
        self.corpus = []

        for bug in bugzilla.get_bugs():

            textual_features = self.text_preprocess(self.get_text(bug))
            self.corpus.append([bug["id"], textual_features])

        # Assigning unique integer ids to all words
        self.dictionary = Dictionary(text for bug_id, text in self.corpus)

        # Conversion to BoW
        corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

        # Initialize and apply the TF-IDF transformation model on the same corpus; the resulting corpus has the same dimensions
        tfidf = models.TfidfModel(corpus_final)
        corpus_tfidf = tfidf[corpus_final]

        # Transform TF-IDF corpus to latent 300-D space via Latent Semantic Indexing
        self.lsi = models.LsiModel(
            corpus_tfidf, id2word=self.dictionary, num_topics=300
        )
        corpus_lsi = self.lsi[corpus_tfidf]

        # Indexing the corpus
        self.index = similarities.Similarity(
            output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300
        ) 
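A hedged sketch of how a query against this index might look. The method below is not part of the original class: it assumes text_preprocess returns a token list (as it does when the corpus is built above) and, because the class does not keep a reference to the TF-IDF model, it applies only the LSI transform to the query.

    def get_similar(self, text):
        # Hypothetical helper: map new text into the LSI space and rank it against the index.
        query_bow = self.dictionary.doc2bow(self.text_preprocess(text))
        query_lsi = self.lsi[query_bow]
        sims = self.index[query_lsi]
        # Return the positions of the ten closest bugs in self.corpus.
        return sorted(enumerate(sims), key=lambda item: -item[1])[:10]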
Example #21
Source File: textAnalysis.py    From deep_learning with MIT License
def get_model():
    # # Load the network structure
    # with open('./models/text_lstm.yaml', 'r') as yaml_file:
    #     loaded_model_yaml = yaml_file.read()
    # model = model_from_yaml(loaded_model_yaml)
    # # Load the model weights
    # model.load_weights("./models/text_lstm.h5")
    # print("model Loaded")
    # model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
                  
    # utils.plot_model(model,to_file='./models/text_lstm_model.png')

    model = load_model("./models/text_lstm_full.h5")

    return model 
Example #22
Source File: nlp_word2vec.py    From resilient-community-apps with MIT License
def __init__(self, model_name=None, log=None):
        # word2vec model. It is a gensim.models.Word2Vec
        self.word2vec = None
        #
        # dataset used to train the word2vec model
        # It is a list of list of words. Ex.[[security, incident],[window, mac, linux], ....]
        # Data shall be preprocessed by subclass
        #
        self.dataset = []
        # model name. This is used to save the model into a file
        self.model_name = model_name
        self.log = log if log else logging.getLogger(__name__)
        self.feature_size = 0 
Example #23
Source File: nlp_word2vec.py    From resilient-community-apps with MIT License
def build_model(self):
        """

        :return:
        """
        # call template method to load and preprocess data
        self.load_data()
        self.preprocess_data()

        # get the settings for NLP
        nlp_settings = NLPSettings.get_instance()
        self.feature_size = nlp_settings.w2v_feature_size()

        bigram = gensim.models.phrases.Phrases(self.dataset,
                                               min_count=nlp_settings.bigram_min_count(),
                                               threshold=nlp_settings.bigram_threshold())

        bigram = gensim.models.phrases.Phraser(bigram)

        tokenized_corpus = bigram[self.dataset]

        word2vec = Word2Vec(size=self.feature_size,
                            window=nlp_settings.w2v_window(),
                            min_count=nlp_settings.w2v_min_count(),
                            sample=nlp_settings.w2v_sample(),
                            alpha=nlp_settings.w2v_alpha(),
                            min_alpha=nlp_settings.w2v_min_alpha(),
                            negative=nlp_settings.w2v_negative(),
                            workers=multiprocessing.cpu_count() - 1)
        # Build Vocabulary
        word2vec.build_vocab(tokenized_corpus,
                             progress_per=nlp_settings.w2v_progress_per())
        # Train
        word2vec.train(tokenized_corpus,
                       total_examples=word2vec.corpus_count,
                       epochs=nlp_settings.w2v_epochs(),
                       report_delay=nlp_settings.w2v_report_delay())

        self.word2vec = word2vec 
Example #24
Source File: textAnalysis.py    From deep_learning with MIT License
def train_model(input_dim,x_train, y_train, x_test, y_test):
    print(input_dim)    
    print('Designing the model...')

    model = Sequential()

    model.add(Embedding(input_dim,EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(LSTM(256, activation="relu"))
    model.add(Dropout(0.3))
    model.add(Dense(512,activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256,activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(1,activation="sigmoid"))

    print('Compiling the model...')   # use the Adam optimizer
    sgd = Adam(lr=0.0003)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
    
    tbCallBack= callbacks.TensorBoard(log_dir='./logs',histogram_freq=0, write_graph=True, write_images=True)
    # best_model = ModelCheckpoint("./models/text_lstm.h5", monitor='val_loss', verbose=0, save_best_only=True)
    print("训练...")
    model.fit(x_train, y_train, batch_size=batch_size, epochs=3,verbose=1, validation_data=(x_test, y_test),callbacks=[tbCallBack])
    # 
    print("评估...")
    score, accuracy = model.evaluate(x_test, y_test, batch_size=batch_size)
    print('\nTest score:', score)
    print('Test accuracy:', accuracy)

    yaml_string = model.to_yaml()
    with open('./models/text_lstm.yaml', 'w') as outfile:
        outfile.write(yaml_string)
    model.save_weights('./models/text_lstm.h5') 
Example #25
Source File: textAnalysis.py    From deep_learning with MIT License
def word2vec_train(text):

    model = word2vec.Word2Vec(size=EMBEDDING_DIM, min_count=10, window=window_size, workers=cpu_count,iter=1)
    model.build_vocab(text)
    model.train(text, total_examples=model.corpus_count, epochs=model.iter)
    model.save('./models/Word2vec_model.model')
    index_dict, word_vectors, text = create_dictionaries(model=model, combined=text)
    return index_dict, word_vectors, text



# Define the network structure
Example #26
Source File: recommender_context_local.py    From word2vec-recommender with MIT License
def train(model_file):
    contexts = ContextCorpus(data_obj)
    model = gensim.models.Word2Vec(contexts, min_count=5, workers= multiprocessing.cpu_count(), negative=3, sg=1, size = 300, sample=1e-3, hs=1, window = 5) #a1 
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=3, sg=0, size = 300, sample=1e-5, hs=0, window = 5) #a2 
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=5, sg=0, size = 300, sample=1e-3, hs=1, window = 5) #a3
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size = 300, sample=1e-3, hs=0, window = 5) #a4
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size = 300, sample=1e-5, hs=0, window = 5) #a5
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=3, sg=0, size = 300, sample=1e-4, hs=1, window = 5) #a6
    # ./word2vec -train train100B.txt -read-vocab voc -output vectors.bin -cbow 1 -size 300 -window 5 -negative 3 -hs 0 -sample 1e-5 -threads 12 -binary 1 -min-count 10
    model.init_sims(replace=True)
    model.save(model_file) 
Example #27
Source File: recommender_context.py    From word2vec-recommender with MIT License
def train(model_file):
    contexts = ContextCorpus(data_obj)
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size = 300, sample=1e-3, hs=1, window = 5) #a1 
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=3, sg=0, size = 300, sample=1e-5, hs=0, window = 5) #a2 
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=5, sg=0, size = 300, sample=1e-3, hs=1, window = 5) #a3
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size = 300, sample=1e-3, hs=0, window = 5) #a4
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size = 300, sample=1e-5, hs=0, window = 5) #a5
    model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=3, sg=0, size = 300, sample=1e-4, hs=1, window = 5) #a6
    # ./word2vec -train train100B.txt -read-vocab voc -output vectors.bin -cbow 1 -size 300 -window 5 -negative 3 -hs 0 -sample 1e-5 -threads 12 -binary 1 -min-count 10
    model.init_sims(replace=True)
    model.save(model_file) 
Example #28
Source File: sample_size_NN.py    From robotreviewer with GNU General Public License v3.0
def load_trained_w2v_model(path):
    #m = Word2Vec.load_word2vec_format(path, binary=True)
    m = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
    return m 
Example #29
Source File: document2vec.py    From Document2Vec with MIT License
def _expand_from(self, corpus, prefix=None, labels=None):
        """
        Pass through the dataset once to add the new labels to the model.
        These labels stand in one for each document/sentence and not
        for new vocabulary.
        """
        if prefix is None:
            prefix = 'SENT'
        num_lines = sum(1 for _ in corpus)
        # Expand syn0
        shape = (self.syn0.shape[0] + num_lines, self.syn0.shape[1])
        syn0 = (np.random.random(shape).astype(self.syn0.dtype) - 0.5)
        syn0 /= self.layer1_size
        syn0[:self.syn0.shape[0]] = self.syn0
        self.syn0 = syn0
        index2word_start = len(self.index2word)
        for j, line_no in enumerate(range(num_lines)):
            # Expand vocab
            newvocab = gensim.models.doc2vec.Vocab()
            newvocab.index = len(self.index2word)
            newvocab.sample_probability = 1.0
            # We insert each sentence at the root of the
            # Huffman tree. It's a hack.
            newvocab.code = [1, ] * int(math.log(line_no + 1, 2) + 1)
            label = Document2Vec._make_label(prefix, str(j))
            self.vocab[label] = newvocab
            # Expand index2word
            self.index2word.append(label)
            assert len(self.index2word) == newvocab.index + 1
        return index2word_start 
Example #30
Source File: wordembed.py    From PyShortTextCategorization with MIT License
def load_word2vec_model(path, binary=True):
    """ Load a pre-trained Word2Vec model.

    :param path: path of the file of the pre-trained Word2Vec model
    :param binary: whether the file is in binary format (Default: True)
    :return: a pre-trained Word2Vec model
    :type path: str
    :type binary: bool
    :rtype: gensim.models.keyedvectors.KeyedVectors
    """
    return KeyedVectors.load_word2vec_format(path, binary=binary)
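A hypothetical usage sketch; the file name is only an example of a commonly distributed pre-trained model in binary word2vec format.

model = load_word2vec_model("GoogleNews-vectors-negative300.bin", binary=True)
# Nearest neighbours and a classic analogy query on the loaded KeyedVectors.
print(model.most_similar("computer", topn=5))
print(model.most_similar(positive=["king", "woman"], negative=["man"], topn=1))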