Python gensim.models.word2vec.LineSentence() Examples

The following are 19 code examples of gensim.models.word2vec.LineSentence(), collected from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models.word2vec, or try the search function.
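Before the project examples, a minimal sketch of what LineSentence does may help: it lazily streams a text file (or file-like object) that contains one pre-tokenized, whitespace-separated sentence per line, yielding each line as a list of tokens. The corpus path below is hypothetical.

from gensim.models.word2vec import LineSentence

# 'corpus.txt' is a hypothetical file with one tokenized sentence per line
sentences = LineSentence('corpus.txt')
for tokens in sentences:
    print(tokens)  # e.g. ['the', 'quick', 'brown', 'fox']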
Example #1
Source File: build_w2v.py    From text-classifier with Apache License 2.0
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    save_pkl(word_dict, out_path, overwrite=True) 
Example #2
Source File: helpers.py    From webvectors with GNU General Public License v3.0
def bigrammer(source_file, outfile, mincount=100, threshold=0.99, scoring='npmi',
              commonfile='common_tagged.txt'):
    """
    :param source_file:
    :param outfile:
    :param mincount:
    :param threshold:
    :param scoring:
    :param commonfile:
    :return:
    """
    common = set([word.strip() for word in open(commonfile, 'r').readlines()])
    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data, min_count=mincount, threshold=threshold,
                                 scoring=scoring, max_vocab_size=400000000, delimiter=b':::',
                                 progress_per=100000, common_terms=common)
    bigrams = Phraser(bigram_transformer)
    tempfile = open(outfile, 'a')
    print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
    for i in bigrams[data]:
        tempfile.write(' '.join(i) + '\n')
    tempfile.close()
    return len(bigrams.phrasegrams) 
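A hedged usage sketch for the helper above (file paths are hypothetical). Because the Phrases model is built with delimiter=b':::', detected collocations appear in the output joined by ':::' rather than the default underscore.

# hypothetical input corpus and output path
bigrammer('tagged_corpus.txt', 'tagged_corpus_bigrams.txt',
          mincount=50, threshold=0.5, scoring='npmi',
          commonfile='common_tagged.txt')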
Example #3
Source File: pre_train.py    From embeddings with Apache License 2.0
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_fasttext(args**) -> Takes the input file, the
    output file and the model
    hyperparameters as arguments
    and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
    loss : Loss Function (0 - Negative Sampling, 1 - Heirarichal Loss)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)

    model = FastText(sentence, sg=skipgram, hs=loss, size=size,
                     alpha=0.05, window=5, min_count=5, min_n=2,
                     max_n=5, workers=3, iter=epochs)

    model.save(output_file) 
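A note on the design choice: because FastText composes word vectors from character n-grams (min_n=2, max_n=5 above), the trained model can also produce embeddings for words never seen in training. For example (the query word is hypothetical):

vec = model.wv['unseenword']  # built from the word's character n-grams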
Example #4
Source File: pre_train.py    From embeddings with Apache License 2.0
def train_word2vec(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_word2vec(args**) -> Takes the input file,
    the output file and the model hyperparameters as
    arguments and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
    loss : Loss Function (0 - Negative Sampling, 1 - Heirarichal Loss)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)

    model = Word2Vec(sentence, sg=skipgram, hs=loss,
                     size=size, alpha=0.05, window=5,
                     min_count=5, workers=3, iter=epochs)

    model.save(output_file) 
Example #5
Source File: keyword_word2vec.py    From nlg-yongzhuo with MIT License
def train_word2vec_by_word():
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    inp = "cut_zhwiki_wiki_parse.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_word.vec"

    print(multiprocessing.cpu_count())
    model = Word2Vec(LineSentence(inp), size=300, window=10,
                     # skip-gram with hierarchical softmax here (sg=1, hs=1)
                     min_count=1, sg=1, hs=1, iter=10, workers=multiprocessing.cpu_count())

    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False) 
Example #6
Source File: train.py    From DeepNews with Apache License 2.0
def train_word_2_vec(self,model_save_file_name='../../temp_results/word2vec_hindi.txt'):
        model = Word2Vec(LineSentence(self.raw_file_name), size=300,workers=multiprocessing.cpu_count())
        model.wv.save_word2vec_format(model_save_file_name, binary=False) 
Example #7
Source File: word2vec_vector.py    From nlp_xiaojiang with MIT License
def train_word2vec_by_char():
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_char.vec"
    model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False) 
Example #8
Source File: train_vectors.py    From Blackstone with Apache License 2.0
def compute_vectors(input_path: Path, output_path: Path):
    """
    Builds word embeddings using gensim Word2Vec. This function takes
    a file contained single sentences per line and writes the computed
    vectors in text format to the specified output path. 
    """
    print(f"Processing {input_path}")
    sentences = LineSentence(input_path)
    bigram_transformer = Phrases(sentences)
    model = Word2Vec(
        bigram_transformer[sentences], size=150, window=5, min_count=5, workers=4
    )
    print(f"Saving vectors to {output_path}")
    model.wv.save_word2vec_format(output_path, binary=False) 
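A note on the pattern above: indexing the Phrases model as bigram_transformer[sentences] streams the corpus through the phrase detector, so frequent pairs are merged into single tokens (joined with an underscore by default) before Word2Vec sees them. Roughly:

for tokens in bigram_transformer[sentences]:
    pass  # tokens arrive with collocations merged, e.g. ['the', 'supreme_court', 'held']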
Example #9
Source File: word2vec_helpers.py    From DetectMaliciousURL with Apache License 2.0
def generate_word2vec_files(input_file, output_model_file, output_vector_file, size = 128, window = 5, min_count = 5):
    start_time = time.time()

    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model = Word2Vec(LineSentence(input_file), size = size, window = window, min_count = min_count, workers = multiprocessing.cpu_count())
    model.save(output_model_file)
    model.wv.save_word2vec_format(output_vector_file, binary=False)

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time)) 
Example #10
Source File: class_w2v.py    From 2016_CCFsougou2 with MIT License
def train_w2v(self, filename):
        """
        训练wv模型
        :param filename:path
        :return:none
        """
        sentences = word2vec.LineSentence(filename)  # 加载语料,要求语料为“一行一文本”的格式
        print '正在训练w2v 针对语料:',str(filename)
        print 'size is: ',self.size
        model = word2vec.Word2Vec(sentences, size=self.size, window=100,workers=48)  # 训练模型; 注意参数window 对结果有影响 一般5-100
        savepath = '20w_size_win100_' + str(self.size)+'.model' # 保存model的路径
        print '训练完毕,已保存: ', savepath,
        model.save(savepath) 
Example #11
Source File: test_word2vec.py    From topical_word_embeddings with MIT License
def testLineSentenceWorksWithFilename(self):
        """Does LineSentence work with a filename argument?"""
        with utils.smart_open(datapath('lee_background.cor')) as orig:
            sentences = word2vec.LineSentence(datapath('lee_background.cor'))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split()) 
Example #12
Source File: test_word2vec.py    From topical_word_embeddings with MIT License
def testLineSentenceWorksWithCompressedFile(self):
        """Does LineSentence work with a compressed file object argument?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split()) 
Example #13
Source File: test_word2vec.py    From topical_word_embeddings with MIT License
def testLineSentenceWorksWithNormalFile(self):
        """Does LineSentence work with a file object argument, rather than filename?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
                sentences = word2vec.LineSentence(fin)
                for words in sentences:
                    self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators 
Example #14
Source File: class_w2v.py    From 2016CCF_BDCI_Sougou with MIT License
def train_w2v(self, filename):
        """
        训练wv模型
        :param filename:path
        :return:none
        """
        sentences = word2vec.LineSentence(filename)  # 加载语料,要求语料为“一行一文本”的格式
        print '正在训练w2v 针对语料:',str(filename)
        print 'size is: ',self.size
        model = word2vec.Word2Vec(sentences, size=self.size, window=100,workers=48)  # 训练模型; 注意参数window 对结果有影响 一般5-100
        savepath = '20w_size_win100_' + str(self.size)+'.model' # 保存model的路径
        print '训练完毕,已保存: ', savepath,
        model.save(savepath) 
Example #15
Source File: preprocess.py    From blstm-cws with MIT License
def gen_embeddings(in_file, out_file, size=100):
    corpus = LineSentence(in_file)
    model = Word2Vec(
        sentences=corpus, size=size, alpha=0.025, window=5, min_count=5,
        max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
        sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
        trim_rule=None, sorted_vocab=1
    )
    model.save_word2vec_format(out_file, binary=False) 
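A side note on the call above: apart from size, every keyword argument spelled out here matches the gensim defaults of that era, so under that assumption the call is equivalent to:

model = Word2Vec(sentences=corpus, size=size)  # same settings, defaults left implicit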
Example #16
Source File: train_word2vec_model.py    From linguistic-style-transfer with Apache License 2.0
def train_word2vec_model(text_file_path, model_file_path):
    # define training data
    # train model
    logger.info("Loading input file and training mode ...")
    model = Word2Vec(sentences=LineSentence(text_file_path), min_count=1, size=global_config.embedding_size)
    # summarize the loaded model
    logger.info("Model Details: {}".format(model))
    # save model
    model.wv.save_word2vec_format(model_file_path, binary=True)
    logger.info("Model saved") 
Example #17
Source File: train.py    From word2vec-tutorial with MIT License
def main():

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = word2vec.LineSentence("wiki_seg.txt")
    model = word2vec.Word2Vec(sentences, size=250)

    # save the model for later use
    model.save(u"word2vec.model")

    # how to load the model back
    # model = word2vec.Word2Vec.load("your_model_name") 
Example #18
Source File: train.py    From word2vec-tutorial with MIT License
def main():

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = word2vec.LineSentence("wiki_seg.txt")
    model = word2vec.Word2Vec(sentences, size=250)

    # save the model for later use
    model.save("word2vec.model")

    # how to load the model back
    # model = word2vec.Word2Vec.load("your_model_name") 
Example #19
Source File: class_w2v.py    From 2016CCF-sougou with Apache License 2.0
def train_w2v(self, filename):
        """
        训练wv模型
        :param filename:path
        :return:none
        """
        sentences = word2vec.LineSentence(filename)  # 加载语料,要求语料为“一行一文本”的格式
        print '正在训练w2v 针对语料:',str(filename)
        print 'size is: ',self.size
        model = word2vec.Word2Vec(sentences, size=self.size, window=100,workers=48)  # 训练模型; 注意参数window 对结果有影响 一般5-100
        savepath = '20w_size_win100_' + str(self.size)+'.model' # 保存model的路径
        print '训练完毕,已保存: ', savepath,
        model.save(savepath) 