Python gensim.models.Word2Vec() Examples

The following are 30 code examples of gensim.models.Word2Vec(), drawn from open-source projects; the originating project and source file are noted above each example. You may also want to check out all available functions and classes of the gensim.models module.
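Most of the examples below use the pre-4.0 gensim keyword arguments (size, iter); in gensim 4.0 and later these were renamed to vector_size and epochs. As a point of reference, here is a minimal, self-contained sketch of the basic training pattern, using a toy corpus invented purely for illustration (the parameter values are arbitrary):

from gensim.models import Word2Vec

# Toy corpus: a list of tokenized sentences (invented purely for illustration).
sentences = [
    ["the", "quick", "brown", "fox"],
    ["the", "lazy", "dog"],
    ["the", "quick", "dog"],
]

# Pre-4.0 keyword names, matching the examples on this page
# (gensim >= 4.0 renames size to vector_size and iter to epochs).
model = Word2Vec(sentences, size=100, window=5, min_count=1, sg=1, iter=10)

# Trained vectors live on model.wv.
vector = model.wv["fox"]                       # 100-dimensional numpy array
similar = model.wv.most_similar("fox", topn=2)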
Example #1
Source File: build_w2v.py    From text-classifier with Apache License 2.0
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    save_pkl(word_dict, out_path, overwrite=True) 
Example #2
Source File: node2vec.py    From GraphEmbedding with MIT License
def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

        kwargs["sentences"] = self.sentences
        kwargs["min_count"] = kwargs.get("min_count", 0)
        kwargs["size"] = embed_size
        kwargs["sg"] = 1
        kwargs["hs"] = 0  # node2vec not use Hierarchical Softmax
        kwargs["workers"] = workers
        kwargs["window"] = window_size
        kwargs["iter"] = iter

        print("Learning embedding vectors...")
        model = Word2Vec(**kwargs)
        print("Learning embedding vectors done!")

        self.w2v_model = model

        return model 
Example #3
Source File: pre_train.py    From embeddings with Apache License 2.0
def train_word2vec(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_word2vec(**args) -> Takes the input file,
    the output file and the model hyperparameters as
    arguments and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Training algorithm (0 - CBOW, 1 - Skip-gram)
    loss : Loss function (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)

    model = Word2Vec(sentence, sg=skipgram, hs=loss,
                     size=size, alpha=0.05, window=5,
                     min_count=5, workers=3, iter=epochs)

    model.save(output_file) 
Example #4
Source File: create_word2vec.py    From dutchembeddings with GNU General Public License v2.0
def create(basedir, num_workers=12, size=320, threshold=5):
        """
        Creates a word2vec model using the Gensim word2vec implementation.

        :param basedir: the dir from which to get the documents.
        :param num_workers: the number of workers to use for training word2vec
        :param size: the size of the resulting vectors.
        :param threshold: the frequency threshold.
        :return: the model.
        """

        logging.basicConfig(level=logging.INFO)
        sentences = SentenceIter(root=basedir)

        model = Word2Vec(sentences=sentences,
                         sg=True,
                         size=size,
                         workers=num_workers,
                         min_count=threshold,
                         window=11,
                         negative=15)
        model.save_word2vec_format("{0}-{1}.wordvecs", "{0}-{1}.vocab")

        return model 
Example #5
Source File: diffusion_2_vec.py    From diff2vec with GNU General Public License v3.0
def learn_pooled_embeddings(walks, counts, args):
    """
    Method to learn an embedding given the sequences and arguments.
    :param walks: Linear vertex sequences.
    :param counts: Number of nodes.
    :param args: Arguments.
    """
    model = Word2Vec(walks,
                     size=args.dimensions,
                     window=args.window_size,
                     min_count=1,
                     sg=1,
                     workers=args.workers,
                     iter=args.iter,
                     alpha=args.alpha)

    save_embedding(args, model, counts) 
Example #6
Source File: metapath2vec.py    From cogdl with MIT License
def train(self, G, node_type):
        self.G = G
        self.node_type = [str(a) for a in node_type]
        walks = self._simulate_walks(self.walk_length, self.walk_num, self.schema)
        walks = [[str(node) for node in walk] for walk in walks]
        model = Word2Vec(
            walks,
            size=self.dimension,
            window=self.window_size,
            min_count=0,
            sg=1,
            workers=self.worker,
            iter=self.iteration,
        )
        id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
        embeddings = np.asarray([model.wv[str(id2node[i])] for i in range(len(id2node))])
        return embeddings 
Example #7
Source File: lex_sem_ft.py    From DL-text with MIT License
def sum_trigram(sent, model):
    sent = sent.split()
    first = True
    second = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None, None][sent[i]]
                first = False
            elif second:
                tot += model[None, sent[i-1]][sent[i]]
                second = False
            else:
                tot += model[sent[i-2], sent[i-1]][sent[i]]
        except:
            continue
    return tot

#Word2Vec Training(Returns Vector): 
Example #8
Source File: node2vec.py    From cogdl with MIT License
def train(self, G):
        self.G = G
        is_directed = nx.is_directed(self.G)
        for i, j in G.edges():
            G[i][j]["weight"] = G[i][j].get("weight", 1.0)
            if not is_directed:
                G[j][i]["weight"] = G[j][i].get("weight", 1.0)
        self._preprocess_transition_probs()
        walks = self._simulate_walks(self.walk_num, self.walk_length)
        walks = [[str(node) for node in walk] for walk in walks]
        model = Word2Vec(
            walks,
            size=self.dimension,
            window=self.window_size,
            min_count=0,
            sg=1,
            workers=self.worker,
            iter=self.iteration,
        )
        id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
        self.embeddings = np.asarray(
            [model.wv[str(id2node[i])] for i in range(len(id2node))]
        )
        return self.embeddings 
Example #9
Source File: node2vec.py    From entity2vec with Apache License 2.0
def learn_embeddings(self, output, output_format='binary'):
        """
        Learn embeddings by optimizing the Skipgram objective using SGD.
        """

        self._simulate_walks()  # simulate random walks

        model = Word2Vec(self._walks, size=self.dimensions, window=self.window_size, min_count=0,
                         workers=self.workers, iter=self.iter, negative=25, sg=1)

        print("defined model using w2v")

        is_binary = output_format != 'text'
        model.wv.save_word2vec_format(output, binary=is_binary)

        actual_format = 'text' if output_format == 'text' else 'binary'
        print("saved model in word2vec %s format" % actual_format)

        return 
Example #10
Source File: deepwalk.py    From cogdl with MIT License
def train(self, G):
        self.G = G
        walks = self._simulate_walks(self.walk_length, self.walk_num)
        walks = [[str(node) for node in walk] for walk in walks]
        model = Word2Vec(
            walks,
            size=self.dimension,
            window=self.window_size,
            min_count=0,
            sg=1,
            workers=self.worker,
            iter=self.iteration,
        )
        id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
        embeddings = np.asarray([model.wv[str(id2node[i])] for i in range(len(id2node))])
        return embeddings 
Example #11
Source File: keyword_word2vec.py    From nlg-yongzhuo with MIT License
def train_word2vec_by_word():
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    inp = "cut_zhwiki_wiki_parse.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_word.vec"

    print(multiprocessing.cpu_count())
    model = Word2Vec(LineSentence(inp), size=300, window=10,
                     # skip-gram with hierarchical softmax is used here
                     min_count=1, sg=1, hs=1, iter=10, workers=multiprocessing.cpu_count())

    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False) 
Example #12
Source File: walkers.py    From Splitter with GNU General Public License v3.0
def learn_base_embedding(self):
        """
        Learning an embedding of nodes in the base graph.
        :return self.embedding: Embedding of nodes in the latent space.
        """
        self.paths = [[str(node) for node in walk] for walk in self.paths]

        model = Word2Vec(self.paths,
                         size=self.args.dimensions,
                         window=self.args.window_size,
                         min_count=1,
                         sg=1,
                         workers=self.args.workers,
                         iter=1)

        self.embedding = np.array([list(model[str(n)]) for n in self.graph.nodes()])
        return self.embedding 
Example #13
Source File: node2vec.py    From entity2rec with Apache License 2.0
def learn_embeddings(self, output):
        """
        Learn embeddings by optimizing the Skipgram objective using SGD.
        """

        walks = self._simulate_walks()  # simulate random walks

        model = Word2Vec(walks, size=self.dimensions, window=self.window_size, min_count=0,
                         workers=self.workers, iter=self.iter, negative=25, sg=1)

        print("defined model using w2v")

        model.wv.save_word2vec_format(output, binary=True)

        # free memory
        del walks
        self.alias_nodes = None
        self.alias_edges = None
        self.G = None

        print("saved model in word2vec binary format")

        return 
Example #14
Source File: deepwalk.py    From GraphEmbedding with MIT License
def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

        kwargs["sentences"] = self.sentences
        kwargs["min_count"] = kwargs.get("min_count", 0)
        kwargs["size"] = embed_size
        kwargs["sg"] = 1  # skip gram
        kwargs["hs"] = 1  # deepwalk use Hierarchical Softmax
        kwargs["workers"] = workers
        kwargs["window"] = window_size
        kwargs["iter"] = iter

        print("Learning embedding vectors...")
        model = Word2Vec(**kwargs)
        print("Learning embedding vectors done!")

        self.w2v_model = model
        return model 
Example #15
Source File: graph2vec.py    From PyTorchText with MIT License
def train_save(self, list_csv):
        sentences = MySentences(list_csv)
        num_features = 256
        min_word_count = 1
        num_workers = 20
        context = 5
        epoch = 20
        sample = 1e-5
        model = Word2Vec(
            sentences,
            size=num_features,
            min_count=min_word_count,
            workers=num_workers,
            sample=sample,
            window=context,
            iter=epoch,
        )
        #model.save(model_fn)
        return model 
Example #16
Source File: lex_sem_ft.py    From DeepLearn with MIT License
def sum_trigram(sent, model):
    sent = sent.split()
    first = True
    second = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None, None][sent[i]]
                first = False
            elif second:
                tot += model[None, sent[i-1]][sent[i]]
                second = False
            else:
                tot += model[sent[i-2], sent[i-1]][sent[i]]
        except:
            continue
    return tot

#Word2Vec Training(Returns Vector): 
Example #17
Source File: deepwalk.py    From CogDL-TensorFlow with MIT License
def train(self, G):
        self.G = G
        walks = self._simulate_walks(self.walk_length, self.walk_num)
        walks = [[str(node) for node in walk] for walk in walks]
        model = Word2Vec(
            walks,
            size=self.dimension,
            window=self.window_size,
            min_count=0,
            sg=1,
            workers=self.worker,
            iter=self.iteration,
        )
        id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
        embeddings = np.asarray([model[str(id2node[i])] for i in range(len(id2node))])
        return embeddings 
Example #18
Source File: baseline.py    From HARP with MIT License
def skipgram_baseline(graph, **kwargs):
    scale = kwargs.get('scale', -1)
    representation_size = kwargs.get('representation_size', 128)

    if scale == 1:
        edges, weights = graph.get_edges()
    else:
        path_length = kwargs.get('path_length', 40)
        num_paths = kwargs.get('num_paths', 80)
        output = kwargs.get('output', 'default')
        edges = graph_coarsening.build_deepwalk_corpus(graph, num_paths, path_length, output)

    if kwargs['hs'] == 0:
        print ('Training the Negative Sampling Model...')
        model = Word2Vec(edges, size=representation_size, window=kwargs['window_size'], min_count=0, sg=1, hs=0, iter=kwargs['iter_count'], negative=5, workers=20)
    else:
        print ('Training the Hierarchical Softmax Model...')
        model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sg=1, hs=1, iter=kwargs['iter_count'], workers=20)

    print ('Finish training the Skip-gram model.')
    return model 
Example #19
Source File: node2vec.py    From CogDL-TensorFlow with MIT License
def train(self, G):
        self.G = G
        is_directed = nx.is_directed(self.G)
        for i, j in G.edges():
            G[i][j]["weight"] = G[i][j].get("weight", 1.0)
            if not is_directed:
                G[j][i]["weight"] = G[j][i].get("weight", 1.0)
        self._preprocess_transition_probs()
        walks = self._simulate_walks(self.walk_num, self.walk_length)
        walks = [[str(node) for node in walk] for walk in walks]
        model = Word2Vec(
            walks,
            size=self.dimension,
            window=self.window_size,
            min_count=0,
            sg=1,
            workers=self.worker,
            iter=self.iteration,
        )
        id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
        self.embeddings = np.asarray(
            [model[str(id2node[i])] for i in range(len(id2node))]
        )
        return self.embeddings 
Example #20
Source File: test_average.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0
def test_cy_equal_np_w2v_random(self):
        w2v = Word2Vec(min_count=1, size=DIM)
        # Random initialization
        w2v.build_vocab(SENTENCES)

        m1 = Average(w2v)
        m1.prep.prepare_vectors(
            sv=m1.sv, total_sentences=len(self.sentences), update=False
        )
        m1._pre_train_calls()
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences, m1.sv.vectors, mem1)

        m2 = Average(w2v)
        m2.prep.prepare_vectors(
            sv=m2.sv, total_sentences=len(self.sentences), update=False
        )
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy

        o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2)

        self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6)) 
Example #21
Source File: postprocessing.py    From vec4ir with MIT License
def uptrain(corpus,
            model_path=None,
            binary=True,
            lockf=0.0,
            min_count=1,
            size=300,
            **word2vec_params):
    wv = Word2Vec(min_count=min_count, size=size, **word2vec_params)
    print("Building vocabulary...")
    wv.build_vocab(corpus)
    print("Found %d distinct words." % len(wv.index2word))
    if model_path is not None:
        print("Intersecting with", model_path, "...")
        wv.intersect_word2vec_format(model_path, binary=binary, lockf=lockf)
        print("Intersected vectors locked with", lockf)

    total_examples = len(corpus)
    print("Training on %d documents..." % total_examples)
    wv.train(corpus, total_examples=total_examples)

    return wv 
Example #22
Source File: test_sif.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0
def test_broken_vocab(self):
        w2v = Word2Vec(min_count=1, size=DIM)
        w2v.build_vocab([l.split() for l in open(CORPUS, "r")])
        for k in w2v.wv.vocab:
            w2v.wv.vocab[k].count = np.nan

        model = SIF(w2v)
        with self.assertRaises(RuntimeError):
            model.train(self.sentences) 
Example #23
Source File: pretrain_embedding.py    From tf_CFO with MIT License
def train(data_path, save_dir):
    sentences = []
    data_files = [os.path.join(os.path.dirname(data_path), file) for file in os.listdir(data_path)]
    for data_file in data_files:
        with open(data_file, 'r')as reader:
            for line in reader:
                question = line.strip().split('\t')[-1].lower()
                sentences.append(nltk.word_tokenize(question))

    model = Word2Vec(sentences, size=300, min_count=1, window=5, sg=1, iter=10)
    weights = model.wv.syn0
    d = dict([(k, v.index) for k, v in model.wv.vocab.items()])

    embeddings_index = {}
    for item in d:
        embeddings_index[item] = weights[d[item], :]
    pickle_save(embeddings_index, os.path.join(save_dir, 'fb_word2vec_300d.pkl'))

    word2idx = {}
    for idx, word in enumerate(embeddings_index.keys()):
        word2idx[word] = idx+1  # index 0 refers to unknown token
    pickle_save(word2idx, os.path.join(save_dir, 'fb_word2idx.pkl'))
   
    char2idx = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10,
                'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19,
                't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '0': 27, '1':28,
                '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36}
    pickle_save(char2idx, os.path.join(save_dir, 'fb_char2idx.pkl')) 
Example #24
Source File: __main__.py    From GraphEmbeddingRecommendationSystem with MIT License
def process(args):
    # Create a graph from the training set
    nodedict = graph.records_to_graph()
    # print(args)

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    # YOUR CODE HERE
    # print(args.number_walks)
    # walk = graph.build_deepwalk_corpus(G, 2, 4, alpha=0,rand=random.Random(0))
    walk = graph.build_deepwalk_corpus(G, args.number_walks, args.walk_length, alpha=0, rand=random.Random(0))
    print(len(walk))
    model = Word2Vec(walk, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
    print(model)
    # Namespace(csv_to_graph=True, loo=True, max_memory_data_size=1000000000, number_walks=10, representation_size=64, seed=0, walk_length=40, window_size=5, workers=1)
    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        next(fin)  # skip the first line
        groundtruth = [line.strip().split("\t")[:3] for line in fin]    # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    # print(groundtruth)
    pr = [predict_rating(model, nodedict, "u"+g[0], "m"+g[1]) for g in groundtruth]
    # print(pr)
    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1,6))
    print cm 
Example #25
Source File: link_prediction.py    From node2vec_linkprediction with MIT License
def learn_embeddings(self, walks, dimensions, window_size=10, niter=5):
        '''
        Learn embeddings by optimizing the Skipgram objective using SGD.
        '''
        # materialize each walk as a list of strings (map() returns an iterator on Python 3)
        walks = [[str(node) for node in walk] for walk in walks]
        model = Word2Vec(walks,
                         size=dimensions,
                         window=window_size,
                         min_count=0,
                         sg=1,
                         workers=self.workers,
                         iter=niter)
        self.wvecs = model.wv 
Example #26
Source File: test_query_expansion.py    From vec4ir with MIT License
def test_embedded_query_expansion():
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    m = 2
    expansion = EmbeddedQueryExpansion(model.wv, m=m)
    expansion.fit(DOCUMENTS)
    query = "surf"
    expanded_query = expansion.transform(query)
    # surf => surf surf Surfing
    print(query, expanded_query, sep='=>')
    assert len(expanded_query.split()) == len(query.split()) + m 
Example #27
Source File: test_query_expansion.py    From vec4ir with MIT License
def test_centroid_expansion():
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    m = 2
    expansion = CentroidExpansion(model.wv, m=m)
    expansion.fit(DOCUMENTS)
    query = "surf"
    expanded_query = expansion.transform(query)
    # surf => surf surf Surfing
    print(query, expanded_query, sep='=>')
    assert len(expanded_query.split()) == len(query.split()) + m 
Example #28
Source File: test_vec4ir.py    From vec4ir with MIT License
def test_combined():
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    wcd = WordCentroidDistance(model.wv)
    tfidf = Tfidf()

    wcd.fit(documents)
    # they can operate on different fields
    tfidf.fit(['fox', 'scientists'])
    match_op = Matching().fit(documents)

    combined = wcd + tfidf ** 2

    retrieval = Retrieval(combined, matching=match_op, labels=[7,42])
    result = retrieval.query('fox')
    assert result[0] == 7 
    result = retrieval.query('scientists')
    assert result[0] == 42


# # PYEMD is required
# def test_wordmovers():
#     model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
#     match_op = Matching()
#     wmd = WordMoversDistance(model.wv)
#     retrieval = Retrieval(wmd, matching=match_op)
#     retrieval.fit(documents)
#     result = retrieval.query('dog')
#     assert result[0] == 0 
Example #29
Source File: test_vec4ir.py    From vec4ir with MIT License
def test_word2vec():
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    match_op = Matching()
    with pytest.raises(ValueError):
        wcd = WordCentroidDistance(model)

    wcd = WordCentroidDistance(model.wv)
    retrieval = Retrieval(wcd, matching=match_op)
    retrieval.fit(documents)
    result = retrieval.query('dog')
    assert result[0] == 0 
Example #30
Source File: train.py    From DeepNews with Apache License 2.0
def train_word_2_vec(self,model_save_file_name='../../temp_results/word2vec_hindi.txt'):
        model = Word2Vec(LineSentence(self.raw_file_name), size=300, workers=multiprocessing.cpu_count())
        model.wv.save_word2vec_format(model_save_file_name, binary=False)