Python gensim.corpora.Dictionary() Examples

The following are 30 code examples of gensim.corpora.Dictionary(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.corpora, or try the search function.
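Every example below follows the same core pattern: tokenize the documents, build a corpora.Dictionary from the token lists, and convert each document to a sparse bag-of-words vector with doc2bow() before feeding it to a model (TF-IDF, LDA, LSI, similarity indexes, and so on). As a quick orientation, here is a minimal, self-contained sketch of that pattern; the toy documents and variable names are illustrative and not taken from any of the projects below.

from gensim import corpora, models

# Toy documents; the real projects tokenize with jieba, NLTK, a Korean tokenizer, etc.
documents = [
    "human machine interface for lab computer applications",
    "a survey of user opinion of computer system response time",
    "the generation of random binary unordered trees",
]
tokenized = [doc.lower().split() for doc in documents]

# Map every unique token to an integer id.
dictionary = corpora.Dictionary(tokenized)

# Convert each document to a sparse bag-of-words vector of (token_id, count) pairs.
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

# Any gensim model can then consume the bag-of-words corpus, e.g. TF-IDF.
tfidf = models.TfidfModel(corpus)
print(tfidf[corpus[0]])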
Example #1
Source File: text.py    From nlp_learning with MIT License
def main():
    corpora_documents = []
    for item_text in raw_documents:
        item_str = list(jieba.cut(item_text))
        corpora_documents.append(item_str)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]

    similarity = similarities.Similarity('-Similarity-index', corpus, num_features=400)

    test_data_1 = '你好,我想问一下我想离婚他不想离,孩子他说不要,是六个月就自动生效离婚'
    test_cut_raw_1 = jieba.cut(test_data_1)
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
    similarity.num_best = 5
    # Return the most similar sample documents as (index_of_document, similarity) tuples
    print(similarity[test_corpus_1]) 
Example #2
Source File: textpro.py    From comparable-text-miner with Apache License 2.0
def build_lsi_model(corpus_name, corpus_path, topics=300):
	logging.info( 'building lsi model for %s corpus', corpus_name )
	dictFile = corpus_path + corpus_name + '.dict'
	corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'
	
	logging.info( 'loading dictionary ...' )
	dictionary = corpora.Dictionary.load(dictFile)
	logging.info( 'loading tfidf corpus ...' )
	corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
	logging.info( 'building lsi model' )
	lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
	logging.info( 'saving lsi' )
	lsiFile = corpus_path + corpus_name + '.lsi'
	lsi.save(lsiFile)
	logging.info( 'lsi model is ready' )
################################################################################## 
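The .dict and .lsi files saved above can later be loaded back to project new text into the same latent space. A minimal sketch of that follow-up step; the file paths and the query text are illustrative, and strictly speaking a new query should also pass through the same TF-IDF transform that was used at training time.

from gensim import corpora, models

# Illustrative paths; build_lsi_model() saves '<corpus_path><corpus_name>.dict' and '.lsi'
dictionary = corpora.Dictionary.load('data/mycorpus.dict')
lsi = models.LsiModel.load('data/mycorpus.lsi')

query_bow = dictionary.doc2bow("some new document text".lower().split())
print(lsi[query_bow])  # list of (topic_id, weight) pairs in the latent space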
Example #3
Source File: similarity.py    From bugbug with Mozilla Public License 2.0
def __init__(
        self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
    ):
        super().__init__(
            cleanup_urls=cleanup_urls,
            nltk_tokenizer=nltk_tokenizer,
            confidence_threshold=confidence_threshold,
        )
        self.corpus = []
        self.bug_ids = []
        for bug in bugzilla.get_bugs():
            self.corpus.append(self.text_preprocess(self.get_text(bug)))
            self.bug_ids.append(bug["id"])

        indexes = list(range(len(self.corpus)))
        random.shuffle(indexes)
        self.corpus = [self.corpus[idx] for idx in indexes]
        self.bug_ids = [self.bug_ids[idx] for idx in indexes]

        self.dictionary = Dictionary(self.corpus)

        self.model = LdaModel([self.dictionary.doc2bow(text) for text in self.corpus]) 
Example #4
Source File: similarity.py    From bugbug with Mozilla Public License 2.0
def __init__(
        self,
        cut_off=0.2,
        cleanup_urls=True,
        nltk_tokenizer=False,
        confidence_threshold=0.8,
    ):
        super().__init__(
            cleanup_urls=cleanup_urls,
            nltk_tokenizer=nltk_tokenizer,
            confidence_threshold=confidence_threshold,
        )

        terms_idx = WordEmbeddingSimilarityIndex(self.w2vmodel.wv)
        self.dictionary = Dictionary(self.corpus)

        bow = [self.dictionary.doc2bow(doc) for doc in self.corpus]

        similarity_matrix = SparseTermSimilarityMatrix(terms_idx, self.dictionary)
        self.softcosinesimilarity = SoftCosineSimilarity(
            bow, similarity_matrix, num_best=10
        ) 
Example #5
Source File: lex_sem_ft.py    From DeepLearn with MIT License
def LDA_train(doc):
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if not i in en_stop]
            red.append(stopped_tokens)
        except:
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=1)
    return lda

#Returns Average Of Probability Of Word Present In LDA Model For Input Document (Returns Float):
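The trailing comment announces a helper that is not part of this excerpt. Purely as a hypothetical sketch of what such a function could look like using the standard LdaModel API (the function name and the exact scoring are assumptions, not the project's code):

def lda_word_prob_avg(lda, doc_tokens):
    """Hypothetical helper: average, over the document's in-vocabulary words,
    of each word's strongest topic probability under the trained LDA model."""
    probs = []
    for token in doc_tokens:
        token_id = lda.id2word.token2id.get(token)
        if token_id is None:
            continue
        topics = lda.get_term_topics(token_id, minimum_probability=0.0)
        if topics:
            probs.append(max(prob for _, prob in topics))
    return sum(probs) / len(probs) if probs else 0.0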
Example #6
Source File: lex_sem_ft.py    From DeepLearn with MIT License
def sum_bigram(sent, model):
    sent = sent.split()
    first = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None][sent[i]]
                first = False
            else:
                tot += model[sent[i-1]][sent[i]]
        except:
            continue
    return tot

#Training Trigram Model[Returns Dictionary of Dictionaries]: 
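The trigram trainer announced by this comment is likewise not shown. Judging from sum_bigram above, the bigram model is a nested dict of conditional probabilities with None marking the sentence start; a hypothetical trigram counterpart under that assumption (names and structure are illustrative, not the project's code):

from collections import defaultdict

def train_trigram(sentences):
    """Hypothetical: maps (w1, w2) -> {w3: P(w3 | w1, w2)} using plain nested dicts."""
    counts = defaultdict(lambda: defaultdict(int))
    for sent in sentences:
        words = [None, None] + sent.split()
        for w1, w2, w3 in zip(words, words[1:], words[2:]):
            counts[(w1, w2)][w3] += 1
    model = {}
    for context, followers in counts.items():
        total = sum(followers.values())
        model[context] = {w: c / total for w, c in followers.items()}
    return model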
Example #7
Source File: utils.py    From CLAtoolkit with GNU General Public License v3.0
def get_LDAVis_JSON(platform, num_topics, course_code, start_date=None, end_date=None):
    #print "get_LDAVis_JSON"
    docs,ids = get_allcontent_byplatform(platform, course_code, start_date=start_date, end_date=end_date)
    documents = remove_stopwords(docs)

    # Make dictionary
    dictionary = corpora.Dictionary(documents)

    #Create and save corpus
    corpus = [dictionary.doc2bow(text) for text in documents]

    #Run LDA
    model = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

    tmp = pyLDAvis.gensim.prepare(model, corpus, dictionary).to_json()
    #print tmp
    #tmp = model.show_topics(num_topics=20, num_words=5, log=False, formatted=False)

    return tmp 
Example #8
Source File: textrank_gensim.py    From nlg-yongzhuo with MIT License
def _build_corpus(sentences):
    """Construct corpus from provided sentences.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.

    Returns
    -------
    list of list of (int, int)
        Corpus built from sentences.

    """
    split_tokens = [jieba_cut(sentence) for sentence in sentences]
    dictionary = Dictionary(split_tokens)
    return [dictionary.doc2bow(token) for token in split_tokens] 
Example #9
Source File: fastfm_recommender.py    From yelp with GNU Lesser General Public License v2.1
def preprocess_records(train_records, test_records):
    """
    Creates a bag of words and a corpus for each record and creates a dictionary
    based on all the text contained in the records
    """

    records = train_records + test_records

    all_words = []

    for record in records:
        bow = record['context_text'].split()
        record[Constants.BOW_FIELD] = bow
        all_words.append(bow)

    dictionary = corpora.Dictionary(all_words)

    for record in records:
        record[Constants.CORPUS_FIELD] = \
            dictionary.doc2bow(record[Constants.BOW_FIELD])

    return dictionary 
Example #10
Source File: context_knn.py    From yelp with GNU Lesser General Public License v2.1
def get_topic_distribution(self, review):
        """

        :type review: str
        """
        review_bow = lda_context_utils.create_bag_of_words([review])
        dictionary = corpora.Dictionary(review_bow)
        corpus = dictionary.doc2bow(review_bow[0])
        lda_corpus = self.lda_model.get_document_topics(corpus)

        topic_distribution =\
            lda_document_to_topic_distribution(lda_corpus, self.num_topics)

        return topic_distribution

    # TODO: Adapt this to a data structure in which a user can rate the same
    # item multiple times in different contexts 
Example #11
Source File: reviews_preprocessor.py    From yelp with GNU Lesser General Public License v2.1
def build_dictionary(self):
        print('%s: build dictionary' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if self.use_cache and os.path.exists(Constants.DICTIONARY_FILE):
            print('Dictionary already exists')
            self.dictionary = corpora.Dictionary.load(Constants.DICTIONARY_FILE)
            return

        all_words = []

        for record in self.records:
            all_words.append(record[Constants.BOW_FIELD])

        self.dictionary = corpora.Dictionary(all_words)

        self.dictionary.filter_extremes(
            Constants.MIN_DICTIONARY_WORD_COUNT,
            Constants.MAX_DICTIONARY_WORD_COUNT)

        self.dictionary.save(Constants.DICTIONARY_FILE) 
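One thing to watch with filter_extremes: its first two parameters are no_below (an absolute document count) and no_above (a fraction of the corpus), so the two positional constants above are interpreted in those units. Spelling the call out with keywords makes that explicit; the threshold values in this sketch are illustrative only.

# Keep tokens that occur in at least 5 documents and in no more than 50% of all documents.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)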
Example #12
Source File: lex_sem_ft.py    From DL-text with MIT License
def LDA_train(doc):
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if not i in en_stop]
            red.append(stopped_tokens)
        except:
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=1)
    return lda

#Returns Average Of Probability Of Word Present In LDA Model For Input Document (Returns Float):
Example #13
Source File: VectorSpaceModel.py    From Snowball with GNU General Public License v3.0
def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print("Gathering sentences and removing stopwords")
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        print(len(documents), "documents read")
        print(len(self.dictionary), " unique tokens") 
Example #14
Source File: docsim.py    From nlp_learning with MIT License
def train(self, prefix: str, corporas: list):
        """ 训练模型
        保存字典,语料,模型到磁盘

        Arguments:
            prefix {str} -- 模型名称前缀
            corpora_documents {list} -- 分词后的文本
        """
        # 生成字典和向量语料
        dictionary = corpora.Dictionary(corporas)
        dictionary.save('./models/{}_dict.dic'.format(prefix))  # 保存生成的词典

        corpus = [dictionary.doc2bow(text) for text in corporas]
        corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)  # 保存生成的语料
        tfidf_model = models.TfidfModel(corpus)
        tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))  # 保存Tfidf模型 
Example #15
Source File: sent_utils.py    From embedding with MIT License
def latent_dirichlet_allocation(corpus_fname, output_fname, tokenizer_name="mecab"):
    make_save_path(output_fname)
    documents, tokenized_corpus = [], []
    tokenizer = get_tokenizer(tokenizer_name)
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for document in f:
            tokens = list(set(tokenizer.morphs(document.strip())))
            documents.append(document)
            tokenized_corpus.append(tokens)
    dictionary = corpora.Dictionary(tokenized_corpus)
    corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]
    LDA = ldamulticore.LdaMulticore(corpus, id2word=dictionary,
                                    num_topics=30,
                                    minimum_probability=0.0,
                                    workers=4)
    # Only keep a document when a single topic has probability greater than 0.5
    # Since the probabilities sum to 1, that topic is then the most probable topic for the document
    all_topics = LDA.get_document_topics(corpus, minimum_probability=0.5, per_word_topics=False)
    with open(output_fname + ".results", 'w') as f:
        for doc_idx, topic in enumerate(all_topics):
            if len(topic) == 1:
                topic_id, prob = topic[0]
                f.writelines(documents[doc_idx].strip() + "\u241E" + ' '.join(tokenized_corpus[doc_idx]) + "\u241E" + str(topic_id) + "\u241E" + str(prob) + "\n")
    LDA.save(output_fname + ".model") 
Example #16
Source File: util.py    From seq2seq with MIT License
def load_dictionary(filename: str) -> corpora.Dictionary:
    """辞書をロードする。

    Args:
        filename (str): ファイル名。
    Returns:
        corpora.Dictionary: 辞書。
    """
    dic = corpora.Dictionary.load(filename)
    # if with_symbol and \
    #         not (dic.token2id["<S>"] == 0 and dic.token2id["</S>"] == 1):
    #     raise Exception("<S> and </S> ids should be 0 and 1")

    print("load dictionary: {} items".format(len(dic.values())))
    # print([item for item in dic.items()][:10])
    return dic 
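A typical way to use the loaded dictionary afterwards; the path and tokens here are illustrative only.

dic = load_dictionary("data/vocab.dict")
print(dic.token2id.get("hello"))        # integer id, or None if out of vocabulary
print(dic.doc2bow(["hello", "world"]))  # sparse bag-of-words vector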
Example #17
Source File: text_processing.py    From Listed-company-news-crawl-and-text-analysis with MIT License
def genDictionary(self,documents,**kwarg):
        '''Generate dictionary and bow-vector of all tokenized news (articles).

        # Arguments:
            documents: List of news(articles).
            saveDict: Save dictionary or not(bool type).
            saveBowvec: Save bow-vector or not(bool type).
            returnValue: Return value or not(bool type).
        '''
        self._raw_documents = documents
        token = self.jieba_tokenize(documents) #jieba tokenize
        #corpora_documents = self.RemoveWordAppearOnce(token)  # remove the words appearing only once in the dictionary
        self._dictionary = corpora.Dictionary(token)  # generate dictionary using tokenized documents  
        if kwarg['saveDict']:
            self._dictionary.save(kwarg['saveDictPath']) # store the dictionary, for future reference
        self._BowVecOfEachDoc = [self._dictionary.doc2bow(text) for text in token]  # convert tokenized documents to vectors
        if kwarg['saveBowvec']:
            corpora.MmCorpus.serialize(kwarg['saveBowvecPath'], self._BowVecOfEachDoc)  # store to disk, for later use
        if kwarg['returnValue']:
            return token, self._dictionary, self._BowVecOfEachDoc 
Example #18
Source File: util.py    From seq2seq with MIT License
def tokens2ids(
    tokens: List[str],
    dictionary: corpora.Dictionary,
    verbose: bool=False
) -> List[int]:
    if verbose:
        not_found_lst = [
            word for word in tokens if word not in dictionary.token2id
        ]
        if not_found_lst:
            print("not found in dict: {}".format(
                not_found_lst
            ))
        for word in tokens:
            if word in dictionary.token2id and dictionary.token2id[word] < 0:
                raise ValueError("word id < 0: {}".format(word))

    # Map unknown words to the UNK symbol
    return [
        dictionary.token2id[word] if word in dictionary.token2id
        else dictionary.token2id[config.UNK_SYMBOL]
        for word in tokens
    ] 
Example #19
Source File: cut_td_idf.py    From nlp_xiaojiang with MIT License
def init_tfidf_chinese_or_pinyin(sources_path):
    """
      Build the tf-idf model
    :param sources_path: 
    :return: 
    """
    questions = txtRead(sources_path)
    corpora_documents = []
    for item_text in questions:
        item_seg = list(jieba.cut(str(item_text).strip()))
        corpora_documents.append(item_seg)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
    file = open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb')
    pickle.dump([dictionary, tfidf_model], file) 
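The pickled (dictionary, TF-IDF model) pair can be loaded back to vectorize new queries. A minimal sketch of that inverse step; the file name and the query string are illustrative only.

import pickle

import jieba

with open("questions_dictionary_model.pkl", "rb") as f:
    dictionary, tfidf_model = pickle.load(f)

query_bow = dictionary.doc2bow(list(jieba.cut("some new question text")))
print(tfidf_model[query_bow])  # list of (token_id, tf-idf weight) pairs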
Example #20
Source File: dbscan_analysis.py    From ns4_chatbot with Apache License 2.0
def get_dictionary(df):
		logger.debug("%d rows of text data in total", len(df))

		all_rows = []
		for one in df['html_cut']:
			# Some html_cut values loaded from the csv are NaN for unknown reasons and get parsed as float, so handle that as an error here
			if not isinstance(one, str):
				logger.error("html_cut of the current row is not a str: %r", one)
				continue
			cut_content_list = one.split(" ")
			all_rows.append(cut_content_list)
		# Build the dictionary
		dictionary = corpora.Dictionary(all_rows)
		# To keep the vocabulary from drifting later, save a fixed dictionary first; subsequent classification/validation will reuse this fixed vocabulary
		# dictionary.save("out/dictionary.dic")
		logger.debug("the bag of words contains %d words in total", len(dictionary.keys()))

		return dictionary 
Example #21
Source File: lex_sem_ft.py    From DL-text with MIT License
def sum_bigram(sent, model):
    sent = sent.split()
    first = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None][sent[i]]
                first = False
            else:
                tot += model[sent[i-1]][sent[i]]
        except:
            continue
    return tot

#Training Trigram Model[Returns Dictionary of Dictionaries]: 
Example #22
Source File: rock_gensim.py    From MusicTaster with MIT License
def prepare_song_artist_dict(tag=''):
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    print(playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 5, '$lte': 1000}, 'playCount': {'$gte': 5}},
        {'tracks': 1, 'name': 1}).limit(100000).count())
    find_result = playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 5, '$lte': 1000}, 'playCount': {'$gte': 5}},
        {'tracks': 1, 'name': 1}).limit(100000)
    # Combine the song names in each playlist into a song-name sequence
    total_song_artist_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # Save the song sequence of this playlist
        song_artist_seq = []
        for song in item['tracks']:
            sname = song['name']
            artist = song['artists'][0]['name'].lower()
            song_artist_seq.append((sname.lower(), artist))
        total_song_artist_set.append(song_artist_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    # song_dictionary = corpora.Dictionary(total_song_artist_set)
    # print(u'number of playlists', song_dictionary.num_docs)
    # print(u'number of songs', song_dictionary.num_pos)
    data_process_logger.info('start saving datas')
    # song_dictionary.save('../datas/song_artist_dictionary_%s.dict' % tag)
    pickle.dump(total_song_artist_set, open('../datas/songs_artists_seq_%s.dat' % tag, 'wb'))
    # return song_dictionary 
Example #23
Source File: document_sequence.py    From fake-news-detection-pipeline with Apache License 2.0
def _set_dictionary(self):
        """stores the dictionary of current corpus"""
        self._dictionary = Dictionary(self._tokenized) 
Example #24
Source File: corpora.py    From Topic_Disc with MIT License
def _build_vocab(self, max_vocab_cnt):
        all_words = []
        for dialog in self.train_corpus:
            for turn in dialog:
                all_words.append(turn.utt)

        self.vocab_bow = Dictionary(all_words)
        raw_vocab_size = len(self.vocab_bow)
        raw_wc = np.sum(list(self.vocab_bow.dfs.values()))

        # build useless stopwords vocab (e.g., very few words, single ascii words, some punctuation ,."')
        self.vocab_bow.filter_extremes(no_below=20)
        self.vocab_bow.filter_extremes(keep_n=max_vocab_cnt)
        bad_ids = [HT, MEN, URL] + TWITTER_STOPWORDS
        self.vocab_bow.filter_tokens(list(map(self.vocab_bow.token2id.get, bad_ids)))
        len_1_words = list(filter(lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w not in ["?", "!", "\"", "i"] and True or False, self.vocab_bow.values()))
        self.vocab_bow.filter_tokens(list(map(self.vocab_bow.token2id.get, len_1_words)))
        self.vocab_bow.compactify()
        # here we keep stopwords and some meaningful punctuations
        non_stopwords = filter(lambda w: re.match(r"^(?=.*[a-zA-Z\d])[\w\d_-]*$", w) and w not in STOPWORDS and True or False, self.vocab_bow.values())
        self.vocab_bow_stopwords = copy.deepcopy(self.vocab_bow)
        self.vocab_bow_stopwords.filter_tokens(map(self.vocab_bow_stopwords.token2id.get, non_stopwords))
        self.vocab_bow_stopwords.compactify()
        self.vocab_bow_non_stopwords = copy.deepcopy(self.vocab_bow)
        self.vocab_bow_non_stopwords.filter_tokens(map(self.vocab_bow_non_stopwords.token2id.get, self.vocab_bow_stopwords.values()))
        self.vocab_bow_non_stopwords.compactify()
        remain_wc = np.sum(list(self.vocab_bow.dfs.values()))
        min_count = np.min(list(self.vocab_bow.dfs.values()))
        # create vocabulary list sorted by count
        print("Load corpus with train size %d, valid size %d, "
              "test size %d raw vocab size %d vocab size %d at cut_off %d OOV rate %f"
              % (len(self.train_corpus), len(self.valid_corpus),
                 len(self.test_corpus),
                 raw_vocab_size, len(self.vocab_bow), min_count,
                 1 - float(remain_wc) / raw_wc)) 
Example #25
Source File: models.py    From coling2018_fake-news-challenge with Apache License 2.0
def tfidf_sim(self, train_data, body_dict, threshold):
        '''
        :param 
        train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
        body_dict : a dictionary of values containing {bodyID:'bodyText'}
        threshold : used to distinguish between similar and not similar
        '''
        bodyText_list = list(body_dict.values())
        bodyIds_index = {k:index for index, k in enumerate(body_dict.keys())}
        
        bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]
        
        vocab = corpora.Dictionary(bodyText_w)
        corporaBody_bow = [vocab.doc2bow(text) for text in bodyText_w]
        tfidf_model = models.TfidfModel(corporaBody_bow)
        
        unrelated, related, y_true, y_pred = [], [], [], []
        for headline, bodyID, stance in train_data:        
            headline_bow = vocab.doc2bow(sent2stokens_wostop(headline))
            
            headlines_tfidf = tfidf_model[headline_bow]
            corporaBody_tfidf = tfidf_model[corporaBody_bow[bodyIds_index[bodyID]]]
            
            sim = cossim(headlines_tfidf, corporaBody_tfidf)
            unrelated, related, y_true, y_pred = create_lists(sim, stance, threshold, [unrelated, related, y_true, y_pred])
        
        print_results([unrelated, related, y_true, y_pred], self.model_type) 
Example #26
Source File: lda_model_calculator.py    From moviegeek with MIT License
def build_lda_model(self, data, docs, n_topics=5):

        texts = []
        tokenizer = RegexpTokenizer(r'\w+')
        for d in tqdm(data):
            raw = d.lower()

            tokens = tokenizer.tokenize(raw)

            stopped_tokens = self.remove_stopwords(tokens)

            stemmed_tokens = stopped_tokens
            #stemmer = PorterStemmer()
            #stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]

            texts.append(stemmed_tokens)

        dictionary = corpora.Dictionary(texts)

        corpus = [dictionary.doc2bow(text) for text in texts]

        lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                                 num_topics=n_topics)

        index = similarities.MatrixSimilarity(corpus)

        self.save_lda_model(lda_model, corpus, dictionary, index)
        self.save_similarities(index, docs)

        return dictionary, texts, lda_model 
Example #27
Source File: corpora.py    From Document2Vec with MIT License
def __init__(self, series, vocab=None, stem=False, bigram=None,
                 labels=True):
        """ Create a corpus that returns one row at a time out
            of a Pandas Series"""
        self.series = series
        self.metadata = False
        if vocab is not None:
            vocab = set(vocab)
        self.vocab = vocab
        self.labels = labels
        self.kwargs = dict(stem=stem, bigram=bigram)
        logging.info("Building SeriesCorpus")
        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts()) 
Example #28
Source File: similarity.py    From bugbug with Mozilla Public License 2.0
def __init__(
        self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
    ):
        super().__init__(
            cleanup_urls=cleanup_urls,
            nltk_tokenizer=nltk_tokenizer,
            confidence_threshold=confidence_threshold,
        )
        self.corpus = []

        for bug in bugzilla.get_bugs():

            textual_features = self.text_preprocess(self.get_text(bug))
            self.corpus.append([bug["id"], textual_features])

        # Assigning unique integer ids to all words
        self.dictionary = Dictionary(text for bug_id, text in self.corpus)

        # Conversion to BoW
        corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

        # Initializing and applying the tfidf transformation model on the same corpus; the resulting corpus has the same dimensions
        tfidf = models.TfidfModel(corpus_final)
        corpus_tfidf = tfidf[corpus_final]

        # Transform TF-IDF corpus to latent 300-D space via Latent Semantic Indexing
        self.lsi = models.LsiModel(
            corpus_tfidf, id2word=self.dictionary, num_topics=300
        )
        corpus_lsi = self.lsi[corpus_tfidf]

        # Indexing the corpus
        self.index = similarities.Similarity(
            output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300
        ) 
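Querying this index chains the same transformations in order: bag-of-words, TF-IDF, LSI, then the Similarity lookup. A hedged sketch of such a lookup method; the method name and new_bug are illustrative, and it assumes the TF-IDF model was also kept on the instance (e.g. self.tfidf = tfidf in __init__), which the excerpt itself does not do.

    def get_similar_bugs(self, new_bug):  # hypothetical method, not part of the excerpt
        query_tokens = self.text_preprocess(self.get_text(new_bug))
        query_lsi = self.lsi[self.tfidf[self.dictionary.doc2bow(query_tokens)]]

        self.index.num_best = 10
        return self.index[query_lsi]  # [(position in the corpus, cosine similarity), ...]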
Example #29
Source File: feature_engineering.py    From CIKM-AnalytiCup-2018 with Apache License 2.0
def build_statistic(self):
        self.sentences = self.train_df['splited_spn_1'].tolist() + self.train_df['splited_spn_2'].tolist() + self.test_df['splited_spn_1'].tolist() + self.test_df['splited_spn_2'].tolist() + self.unlabeled_df['splited_spn_1'].tolist()
        self.sentences = np.unique(np.array(self.sentences)).tolist()

        words = []
        for comment in self.sentences:
            for w in comment:
                words.append(w)

        counts = Counter(words)
        self.weights = {word: self._get_weight(count) for word, count in counts.items()}

        self.dictionary = corpora.Dictionary(self.sentences)
        self.dictionary.compactify()
        print ("No of words in the dictionary = %s" % len(self.dictionary.token2id)) 
Example #30
Source File: tf_idf_helpers.py    From coling2018_fake-news-challenge with Apache License 2.0
def generate_tf_idf_corpora(self):
        #data_path = myConstants.data_path
        #reader = CorpusReader(data_path)
        #body_dict = reader.load_body(myConstants.train_bodies)
        body_dict = myConstants.d.articles
        bodyText_list = list(body_dict.values())
        bodyIds_index = {k:index for index, k in enumerate(body_dict.keys())}

        bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

        self.vocab = corpora.Dictionary(bodyText_w)
        corporaBody_bow = [self.vocab.doc2bow(text) for text in bodyText_w]
        self.tfidf_model = models.TfidfModel(corporaBody_bow)