Python gensim.corpora.MmCorpus() Examples

The following are 16 code examples of gensim.corpora.MmCorpus(). Each example is taken from an open-source project; the project and source file are noted above each example. You may also want to check out all available functions/classes of the module gensim.corpora, or try the search function.
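Before the examples, here is a minimal round-trip sketch of what MmCorpus is for: serializing a bag-of-words corpus to Matrix Market format on disk, then streaming it back. The toy documents and temporary file name are illustrative only.

from gensim import corpora
from gensim.test.utils import get_tmpfile

# Build a tiny bag-of-words corpus from toy documents.
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system']]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

# Write the corpus to disk in Matrix Market format ...
fname = get_tmpfile('toy_corpus.mm')
corpora.MmCorpus.serialize(fname, bow_corpus)

# ... and stream it back without holding everything in memory.
for doc in corpora.MmCorpus(fname):
    print(doc)  # each doc is a list of (token_id, count) pairs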
Example #1
Source File: keysearch.py    From simsearch with MIT License
def load(cls, save_dir='./'):
        """
        Load the corpus from a save directory.
        """
        tables = pickle.load(open(save_dir + 'tag-tables.pickle', 'rb'))
        tagsToDocs = tables[0]
        docsToTags = tables[1]        
        titles = pickle.load(open(save_dir + 'titles.pickle', 'rb'))
        tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
        corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
        dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')
        files = pickle.load(open(save_dir + 'files.pickle', 'rb'))
        doc_line_nums = pickle.load(open(save_dir + 'doc_line_nums.pickle', 'rb'))
        
        ksearch = KeySearch(dictionary, tfidf_model, 
                            corpus_tfidf, titles, tagsToDocs,
                            docsToTags, files, doc_line_nums) 
        
        return ksearch 
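Because load is a classmethod, restoring a previously saved KeySearch is a one-liner. A hypothetical usage, assuming keysearch.py is importable and the directory contains the files written by the matching save method (Example #6):

from keysearch import KeySearch

ksearch = KeySearch.load(save_dir='./corpus_save/')  # illustrative directory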
Example #2
Source File: textpro.py    From comparable-text-miner with Apache License 2.0
def build_lsi_model(corpus_name, corpus_path, topics=300):
	logging.info( 'building lsi model for %s corpus', corpus_name )
	dictFile = corpus_path + corpus_name + '.dict'
	corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'
	
	logging.info( 'loading dictionary ...' )
	dictionary = corpora.Dictionary.load(dictFile)
	logging.info( 'loading tfidf corpus ...' )
	corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
	logging.info( 'building lsi model' )
	lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
	logging.info( 'saving lsi' )
	lsiFile = corpus_path + corpus_name + '.lsi'
	lsi.save(lsiFile)
	logging.info( 'lsi model is ready' )
################################################################################## 
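A hedged sketch of how the artifacts written by build_lsi_model might be consumed afterwards; the load paths mirror the save paths above, while the similarity step is an assumption, not part of the original module:

from gensim import corpora, models, similarities

# Load the artifacts written by build_lsi_model.
dictionary = corpora.Dictionary.load(corpus_path + corpus_name + '.dict')
lsi = models.LsiModel.load(corpus_path + corpus_name + '.lsi')
corpus_tfidf = corpora.MmCorpus(corpus_path + corpus_name + '.tfidf.mm')

# Rank every document against the first one in LSI space.
index = similarities.MatrixSimilarity(lsi[corpus_tfidf], num_features=lsi.num_topics)
sims = index[lsi[next(iter(corpus_tfidf))]]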
Example #3
Source File: keysearch.py    From wiki-sim-search with MIT License
def load(cls, save_dir='./'):
        """
        Load the corpus from a save directory.
        """
        tables = pickle.load(open(save_dir + 'tag-tables.pickle', 'rb'))
        tagsToDocs = tables[0]
        docsToTags = tables[1]        
        titles = pickle.load(open(save_dir + 'titles.pickle', 'rb'))
        tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
        corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
        dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')
        files = pickle.load(open(save_dir + 'files.pickle', 'rb'))
        doc_line_nums = pickle.load(open(save_dir + 'doc_line_nums.pickle', 'rb'))
        
        ksearch = KeySearch(dictionary, tfidf_model, 
                            corpus_tfidf, titles, tagsToDocs,
                            docsToTags, files, doc_line_nums) 
        
        return ksearch 
Example #4
Source File: docsim.py    From nlp_learning with MIT License
def train(self, prefix: str, corporas: list):
        """ 训练模型
        保存字典,语料,模型到磁盘

        Arguments:
            prefix {str} -- 模型名称前缀
            corpora_documents {list} -- 分词后的文本
        """
        # Build the dictionary and the bag-of-words corpus
        dictionary = corpora.Dictionary(corporas)
        dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the dictionary

        corpus = [dictionary.doc2bow(text) for text in corporas]
        corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)  # save the serialized corpus
        tfidf_model = models.TfidfModel(corpus)
        tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))  # save the TF-IDF model
Example #5
Source File: docsim.py    From nlp_learning with MIT License
def update_model(self, prefix: str, sysno: int, doc: str):
        """
        更新字典
        :param prefix:
        :param sysno: 系统编号
        :param doc:   文本内容
        :return:
        """

        corporas = self.segment(doc)
        # Update the dictionary
        dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the existing dictionary
        dictionary.add_documents([corporas])
        dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the updated dictionary

        corporas_docs = np.load("./data/{}_words.npy".format(prefix))
        corporas_docs = list(corporas_docs)
        corporas_docs.append(corporas)
        np.save("./data/{}_words.npy".format(prefix), corporas_docs)
        # Update the corpus
        corpus = [dictionary.doc2bow(text) for text in corporas_docs]
        corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)

        # Update the TfidfModel
        tfidf_model = models.TfidfModel(corpus)
        tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))

        # Update the index dictionary (corpus position -> system ID)
        with open('./data/idx_dic.dic', 'r') as f:
            dt = f.read()
            idx_dic = eval(dt)

        if sysno not in idx_dic.values():
            idx_dic[len(idx_dic)] = sysno

        with open('./data/idx_dic.dic', 'w') as f:
            f.write(str(idx_dic)) 
Example #6
Source File: keysearch.py    From simsearch with MIT License
def save(self, save_dir='./'):
        """
        Write out the built corpus to a save directory.
        """
        # Store the tag tables.
        pickle.dump((self.tagsToDocs, self.docsToTags), open(save_dir + 'tag-tables.pickle', 'wb'))
        
        # Store the document titles.
        pickle.dump(self.titles, open(save_dir + 'titles.pickle', 'wb'))
        
        # Write out the tfidf model.
        self.tfidf_model.save(save_dir + 'documents.tfidf_model')
        
        # Write out the tfidf corpus.
        corpora.MmCorpus.serialize(save_dir + 'documents_tfidf.mm', self.corpus_tfidf)  

        # Write out the dictionary.
        self.dictionary.save(save_dir + 'documents.dict')
        
        # Save the filenames.
        pickle.dump(self.files, open(save_dir + 'files.pickle', 'wb'))
        
        # Save the file ID and line numbers for each document.
        pickle.dump(self.doc_line_nums, open(save_dir + 'doc_line_nums.pickle', 'wb'))
        
        # Objects that are not saved:
        #  - stop_list - You don't need to filter stop words for new input
        #                text; they simply aren't found in the dictionary.
        #  - frequency - This preliminary word count object is only used for
        #                removing infrequent words. Final word counts are in
        #                the `dictionary` object. 
Example #7
Source File: builder.py    From Greynir with GNU General Public License v3.0
def load_tfidf_corpus(self):
        """ Load a TFIDF corpus from file """
        return corpora.MmCorpus(self._TFIDF_CORPUS_FILE) 
Example #8
Source File: builder.py    From Greynir with GNU General Public License v3.0
def create_tfidf_corpus(self):
        """ Create a TFIDF corpus from a plain vector corpus """
        if self._tfidf is None:
            self.load_tfidf_model()
        corpus = self.load_plain_corpus()
        corpus_tfidf = self._tfidf[corpus]
        corpora.MmCorpus.serialize(self._TFIDF_CORPUS_FILE, corpus_tfidf) 
Example #9
Source File: builder.py    From Greynir with GNU General Public License v3.0
def load_plain_corpus(self):
        """ Load the plain corpus from file """
        return corpora.MmCorpus(self._PLAIN_CORPUS_FILE) 
Example #10
Source File: builder.py    From Greynir with GNU General Public License v3.0
def create_plain_corpus(self):
        """ Create a plain vector corpus, where each vector represents a
            document. Each element of the vector contains the count of
            the corresponding word (as indexed by the dictionary) in
            the document. """
        if self._dictionary is None:
            self.load_dictionary()
        dci = CorpusIterator(dictionary=self._dictionary)
        corpora.MmCorpus.serialize(self._PLAIN_CORPUS_FILE, dci) 
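CorpusIterator itself is not shown in this snippet. A minimal sketch of what such a streaming iterable might look like, so that MmCorpus.serialize never needs the whole corpus in memory; the file name and one-document-per-line format are assumptions:

class CorpusIterator:
    """Stream bag-of-words vectors one document at a time."""
    def __init__(self, dictionary, fname='documents.txt'):
        self.dictionary = dictionary
        self.fname = fname

    def __iter__(self):
        with open(self.fname, encoding='utf-8') as f:
            for line in f:
                # One whitespace-tokenized document per line.
                yield self.dictionary.doc2bow(line.lower().split())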
Example #11
Source File: test_miislita.py    From topical_word_embeddings with MIT License
def test_textcorpus(self):
        """Make sure TextCorpus can be serialized to disk. """
        # construct corpus from file
        miislita = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # make sure serializing works
        ftmp = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(ftmp, miislita)
        self.assertTrue(os.path.exists(ftmp))

        # make sure deserializing gives the same result
        miislita2 = corpora.MmCorpus(ftmp)
        self.assertEqual(list(miislita), list(miislita2)) 
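The test relies on an asymmetry worth knowing: MmCorpus.save_corpus writes only the .mm file, whereas MmCorpus.serialize additionally writes a .index file of byte offsets. A follow-up sketch in the same setting, showing that the index is what enables random access:

ftmp2 = get_tmpfile('indexed_textcorpus.mm')
corpora.MmCorpus.serialize(ftmp2, miislita)   # writes the .mm file plus a .mm.index file
indexed = corpora.MmCorpus(ftmp2)
print(indexed[3])  # O(1) random access to the fourth document, thanks to the index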
Example #12
Source File: topics_analysis.py    From contextualLSTM with Apache License 2.0
def load_corpus_and_dict(corpus_path, id2word_path):
    print("[BLOCK] Loading  corpus and dictionary files from %s and %s" % (data_path, id2word_path))
    sys.stdout.flush()
    dictionary = Dictionary.load_from_text(id2word_path)

    print("[BLOCK] Loading corpus iterator")
    sys.stdout.flush()
    #mm = gensim.corpora.MmCorpus(corpus_path)
    corpus = MmCorpus(bz2.BZ2File(corpus_path)) # use this if you compressed the TFIDF output (recommended)

    return corpus, dictionary 
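MmCorpus accepts a file-like object as well as a path, which is what makes the bz2.BZ2File trick above work; note that a corpus opened from a compressed stream can only be iterated, not randomly indexed, because no .index file is loaded. A short illustrative usage:

import bz2
from gensim.corpora import MmCorpus

corpus = MmCorpus(bz2.BZ2File('corpus_tfidf.mm.bz2'))  # illustrative file name
for docno, doc in enumerate(corpus):
    if docno >= 3:
        break
    print(doc)  # stream the first few documents straight from the compressed file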
Example #13
Source File: docsim.py    From nlp_learning with MIT License
def calc_similarity(self, prefix: str, text: str):
        """计算相似度
        返回索引和余弦值

        Arguments:
            prefix {str} -- 模型前缀
            text {str} -- 文本数据
            value {float} -- 设定的阈值,返回大于这个值的数据
        """
        dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the dictionary
        corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load the corpus
        tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load the TF-IDF model
        corpus_tfidf = tfidf_model[corpus]

        lsi = models.LsiModel(corpus_tfidf)
        corpus_lsi = lsi[corpus_tfidf]
        similarity_lsi = similarities.Similarity('./models/similarity-lsi-index',
                                                 corpus_lsi,
                                                 num_features=400,
                                                 num_best=3)
        cut_raw = self.segment(text)  # 1. tokenize
        corpus = dictionary.doc2bow(cut_raw)  # 2. convert to a bag-of-words vector
        corpus_tfidf = tfidf_model[corpus]  # 3. apply the TF-IDF weighting
        corpus_lsi = lsi[corpus_tfidf]  # 4. project into LSI space
        sims = similarity_lsi[corpus_lsi]

        with open('./data/idx_dic.dic', 'r') as f:
            dt = f.read()
            idx_dic = eval(dt)

        result = []
        if sims is not None:
            result = [idx_dic[idx] for idx, val in sims if val > self.keep_val]

        return result 
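A hypothetical end-to-end use of the three docsim.py methods above, assuming they live on a class (called DocSim here) that also provides segment() and a keep_val threshold; none of these names are confirmed by the snippets:

ds = DocSim(keep_val=0.6)                # hypothetical constructor and threshold
ds.train('news', tokenized_documents)    # build dictionary, corpus and TF-IDF model
ds.update_model('news', sysno=42, doc='A newly crawled document ...')
matches = ds.calc_similarity('news', 'Query text to match against the corpus')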
Example #14
Source File: keysearch.py    From wiki-sim-search with MIT License
def save(self, save_dir='./'):
        """
        Write out the built corpus to a save directory.
        """
        # Store the tag tables.
        pickle.dump((self.tagsToDocs, self.docsToTags), open(save_dir + 'tag-tables.pickle', 'wb'))
        
        # Store the document titles.
        pickle.dump(self.titles, open(save_dir + 'titles.pickle', 'wb'))
        
        # Write out the tfidf model.
        self.tfidf_model.save(save_dir + 'documents.tfidf_model')
        
        # Write out the tfidf corpus.
        corpora.MmCorpus.serialize(save_dir + 'documents_tfidf.mm', self.corpus_tfidf)  

        # Write out the dictionary.
        self.dictionary.save(save_dir + 'documents.dict')
        
        # Save the filenames.
        pickle.dump(self.files, open(save_dir + 'files.pickle', 'wb'))
        
        # Save the file ID and line numbers for each document.
        pickle.dump(self.doc_line_nums, open(save_dir + 'doc_line_nums.pickle', 'wb'))
        
        # Objects that are not saved:
        #  - stop_list - You don't need to filter stop words for new input
        #                text; they simply aren't found in the dictionary.
        #  - frequency - This preliminary word count object is only used for
        #                removing infrequent words. Final word counts are in
        #                the `dictionary` object. 
Example #15
Source File: textpro.py    From comparable-text-miner with Apache License 2.0
def prepare_gensim_corpus(corpus_name, corpus, output_path, min_freq=5):
	if not output_path.endswith('/'): output_path = output_path + '/'
	check_dir(output_path) # create the output directory if it does not exist
	
	logging.info( 'building gensim corpus and dictionary for %s corpus', corpus_name )
	logging.info( 'loading corpus' )
	texts = [list(process_text(document, removePunct=True, removeSW=True, removeNum=True)) for document in corpus]
	logging.info( 'tokenizing' )
	all_tokens = [item for sublist in texts for item in sublist]
	logging.info( 'mark tokens which have frequency less than %d', min_freq )
	tokens_once = set(k for k, v in collections.Counter(all_tokens).items() if v < min_freq)
	logging.info( '|D|=%d' , len(texts) )
	logging.info( 'filter low frequency tokens' )
	texts = [[word for word in text if word not in tokens_once] for text in texts]
	logging.info( '|D|=%d' , len(texts) )
	logging.info( 'building dictionary' )
	dictionary = corpora.Dictionary(texts)
	logging.info( 'saving dictionary' )
	dictFile = output_path + corpus_name + '.dict'
	dictionary.save(dictFile) 
	logging.info( 'building corpus in mm format' )
	corpus = [dictionary.doc2bow(text) for text in texts]
	logging.info( 'saving corpus' )
	gensim_corpus_file = output_path + corpus_name + '.mm'
	corpora.MmCorpus.serialize(gensim_corpus_file, corpus)
	logging.info( 'computing tfidf' )
	tfidf = models.TfidfModel(corpus) # tfidf model 
	corpus_tfidf = tfidf[corpus] # tfidf corpus 
	logging.info( 'saving tfidf corpus' )
	corpus_tfidf_file = output_path + corpus_name + '.tfidf.mm'
	corpora.MmCorpus.serialize(corpus_tfidf_file, corpus_tfidf)
	logging.info( 'gensim corpus is ready' )
################################################################################## 
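The two textpro.py helpers are designed to chain: prepare_gensim_corpus writes the .dict and .tfidf.mm files that build_lsi_model (Example #2) then consumes. An illustrative pairing, with the document loader and paths assumed:

docs = load_my_documents()  # hypothetical loader returning raw document strings
prepare_gensim_corpus('wiki', docs, '/data/corpora/', min_freq=5)
build_lsi_model('wiki', '/data/corpora/', topics=300)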
Example #16
Source File: text_mining.py    From Listed-company-news-crawl-and-text-analysis with MIT License
def classifyRealtimeStockNews(self,doc_list):
		'''Classify real-time news (articles/documents) for a specific stock.

		# Arguments:
			doc_list: List of real-time news (articles/documents) crawled from specific websites.
		'''
		print(' * extract relevant stock codes from latest crawled news ... ')
		relevant_stock_list = self.extractStockCodeFromRealtimeNews(doc_list)
		if len(relevant_stock_list) != 0:
			tfDim = 200
			for i, code_list in enumerate(relevant_stock_list):
				for code in code_list:

					print(' * load SVM parameters (gamma & C) ... ')
					Params_svm = {'kernel': ['rbf'], 'gamma': [10, 20, 50, 100, 150, 200], \
						'C': [10, 15, 20, 30, 50, 100]}

					print(' * use historical news to build SVM model of ' + code + ' ... ')
					self.classifyHistoryStockNews("Stock_News",code,modelType='lda',tfDim=tfDim,renewDict=False,\
							renewModel=False,Classifier='SVM',Params=Params_svm) #code="600740"

					print(' * load historical dictionary of ' + code + ' ...')
					dictionary = corpora.Dictionary.load(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_dict.dict')
					
					print(' * tokenize latest crawled news ... ')
					token = self.tp.jieba_tokenize(doc_list)

					print(' * create bow-vector of latest news of ' + code + ' ... ')
					bowvec_doc = [dictionary.doc2bow(text) for text in token]
					
					print(' * load bow-vector of historical news of ' + code + ' ... ')
					bowvec_all = list(corpora.MmCorpus(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_bowvec.mm'))
					
					print(' * extend latest bow-vector to historical bow-vector of ' + code + ' ... ')
					bowvec_all.extend(bowvec_doc)
					
					print(' * create new lda model of ' + code + ' ... ')
					_, NewmodelVec = self.tp.CallTransformationModel(dictionary,bowvec_all,modelType='lda',\
									tfDim=200,renewModel=False,modelPath=os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\')
					
					print(' * convert latest lda vector to CSR matrix of ' + code + ' ... ')
					NewCSRMatrix = self.ConvertToCSRMatrix(NewmodelVec)
					
					print(' * load SVM model of ' + code + ' ... ')
					clf = joblib.load(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_svm.pkl') 
					
					print(' * predicting ... ')
					if clf.predict(NewCSRMatrix[i-2,:])[0] == 1:
						print('   "' + doc_list[i].split(' ')[0] + '" is positive news for ' + code + ' ...')
					elif clf.predict(NewCSRMatrix[i-2,:])[0] == -1:
						print('   "' + doc_list[i].split(' ')[0] + '" is negative news for ' + code + ' ...')
					else:
						print('   "' + doc_list[i].split(' ')[0] + '" is neutral news for ' + code + ' ...')
		else:
			print(' * no relevant stocks found ... ')