# -*- coding: UTF-8 -*- 
"""
Created on Fri Feb 23 12:37:46 2018

@author: Damon Li
"""

import os

import jieba
from collections import defaultdict  # required by RemoveWordAppearOnce
from gensim import corpora, models, similarities


class TextProcessing(object):
    '''Text pre-processing functions class.

    # Arguments
        chnSTWPath: Chinese stop words txt file path.
        finance_dict: txt file path of the latest finance-related words.
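
    # Example (a minimal sketch; the two txt file names are assumptions):
        tp = TextProcessing('Chinese_Stop_Words.txt', 'finance_dict.txt')
        tokens = tp.jieba_tokenize(['今日煤炭板块大涨'])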
    '''

    def __init__(self,chnSTWPath,finance_dict):
        self.chnSTWPath = chnSTWPath
        self.finance_dict = finance_dict

    def renewFinanceDict(self,new_Word_list):
        '''Append the latest finance-related words to the user dictionary
            to improve tokenization quality.

        # Arguments:
            new_Word_list: List of new financial words, e.g. ["区块链", "离岸金融"].
        '''
        with open(self.finance_dict,'a',encoding='utf-8') as file:
            for word in new_Word_list:
                file.write(word + '\n')

    def getchnSTW(self):
        '''Load the Chinese stop words txt file.
        '''
        with open(self.chnSTWPath, 'r', encoding='utf-8') as f:
            stopwords = [line.strip() for line in f]
        return stopwords

    def jieba_tokenize(self,documents): 
        '''Tokenize each document into a list of independent words with jieba.

        # Arguments:
            documents: List of news(articles).
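
        # Example (output is approximate; actual segmentation depends on the loaded dictionaries):
            tp.jieba_tokenize(['利好住房租赁市场'])
            # -> [['利好', '住房', '租赁', '市场']]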
        '''
        chnSTW = set(self.getchnSTW())  # a set makes the stop-word lookup O(1)
        corpora_documents = []
        jieba.load_userdict(self.finance_dict)  # register the financial vocabulary with jieba
        for item_text in documents:
            sentence_seged = jieba.cut(item_text)
            outstr = [word for word in sentence_seged
                      if word not in chnSTW and word not in ('\t', ' ')]
            corpora_documents.append(outstr)
        return corpora_documents

    def RemoveWordAppearOnce(self,corpora_documents):
        '''Remove words that appear only once across all the tokenized news(articles).

        # Arguments:
             corpora_documents: List of tokenized news(articles).
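
        # Example (frequencies are counted across all documents, so 'c' below is dropped):
            [['a', 'b', 'a'], ['b', 'c']] -> [['a', 'b', 'a'], ['b']]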
        '''
        frequency = defaultdict(int)
        for text in corpora_documents:  # count term frequencies across the whole corpus
            for token in text:
                frequency[token] += 1
        corpora_documents = [[token for token in text if frequency[token] > 1]
                             for text in corpora_documents]
        return corpora_documents

    def genDictionary(self,documents,**kwarg):
        '''Generate the dictionary and the bag-of-words vector of every tokenized news(article).

        # Arguments:
            documents: List of news(articles).
            saveDict: Whether to save the dictionary (bool).
            saveDictPath: File path used when saveDict is True.
            saveBowvec: Whether to save the bow-vectors (bool).
            saveBowvecPath: File path used when saveBowvec is True.
            returnValue: Whether to return values (bool).
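
        # Example (a minimal sketch; the .dict/.mm paths are placeholders):
            token, dictionary, bowvec = tp.genDictionary(doc,
                saveDict=True, saveDictPath='tmp.dict',
                saveBowvec=True, saveBowvecPath='tmp.mm',
                returnValue=True)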
        '''
        self._raw_documents = documents
        token = self.jieba_tokenize(documents)  # jieba tokenization
        # corpora_documents = self.RemoveWordAppearOnce(token)  # optionally drop the words appearing only once
        self._dictionary = corpora.Dictionary(token)  # build the dictionary from the tokenized documents
        if kwarg['saveDict']:
            self._dictionary.save(kwarg['saveDictPath'])  # store the dictionary for future reference
        self._BowVecOfEachDoc = [self._dictionary.doc2bow(text) for text in token]  # tokenized documents -> bow vectors
        if kwarg['saveBowvec']:
            corpora.MmCorpus.serialize(kwarg['saveBowvecPath'], self._BowVecOfEachDoc)  # store to disk for later use
        if kwarg['returnValue']:
            return token, self._dictionary, self._BowVecOfEachDoc

    def CallTransformationModel(self,Dict,Bowvec,**kwarg):
        '''Invoke a specific transformation model from the Gensim module.

        # Arguments:
            Dict: Dictionary built from all tokenized news(articles/documents).
            Bowvec: Bow-vectors created from all tokenized news(articles/documents).
            modelType: Transformation model type: 'lsi', 'lda' or 'None' ('None' means the TF-IDF model).
            tfDim: The number of topics to extract from the news(articles/documents).
            renewModel: Whether to re-train the transformation model (bool).
            modelPath: The path prefix under which trained transformation models are saved.
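
        # Example (a minimal sketch; 'model_dir/' is a placeholder prefix joined with the model file names):
            tfidfVec, lsiVec = tp.CallTransformationModel(dictionary, bowvec,
                modelType='lsi', tfDim=100, renewModel=True, modelPath='model_dir/')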
        '''
        if kwarg['renewModel']:
            tfidf = models.TfidfModel(Bowvec)  # initialize the TF-IDF model
            tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
            tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
            if kwarg['modelType'] == 'lsi':
                model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])  # initialize an LSI transformation
                modelVec = model[tfidfVec]  # double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi
                model.save(kwarg['modelPath'] + "lsi_model.lsi")  # save under the same name the load branch expects
            elif kwarg['modelType'] == 'lda':
                model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                modelVec = model[tfidfVec]  # the LDA vector of each document; sparse, values are topic-membership weights
                model.save(kwarg['modelPath'] + "lda_model.lda")
            elif kwarg['modelType'] == 'None':
                model = tfidf
                modelVec = tfidfVec
            else:
                raise ValueError("modelType must be 'lsi', 'lda' or 'None'")
        else:
            if not os.path.exists(kwarg['modelPath'] + "tfidf_model.tfidf"):
                tfidf = models.TfidfModel(Bowvec)  # initialize the TF-IDF model
                tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
                tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
            else:
                tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the loaded model to transform the whole corpus
            if kwarg['modelType'] == 'lsi':
                if not os.path.exists(kwarg['modelPath'] + "lsi_model.lsi"):
                    model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])  # initialize an LSI transformation
                    modelVec = model[tfidfVec]  # double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi
                    model.save(kwarg['modelPath'] + "lsi_model.lsi")
                else:
                    model = models.LsiModel.load(kwarg['modelPath'] + "lsi_model.lsi")
                    modelVec = model[tfidfVec]
            elif kwarg['modelType'] == 'lda':
                if not os.path.exists(kwarg['modelPath'] + "lda_model.lda"):
                    model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                    modelVec = model[tfidfVec]  # the LDA vector of each document; sparse, values are topic-membership weights
                    model.save(kwarg['modelPath'] + "lda_model.lda")
                else:
                    model = models.LdaModel.load(kwarg['modelPath'] + "lda_model.lda")
                    modelVec = model[tfidfVec]
            elif kwarg['modelType'] == 'None':
                model = tfidf
                modelVec = tfidfVec
            else:
                raise ValueError("modelType must be 'lsi', 'lda' or 'None'")
        return tfidfVec, modelVec

    def CalSim(self,test_document,Type,best_num):
        '''Calculate the similarity between the test document and all news(articles/documents).

        # Arguments:
            test_document: A single raw document (string).
            Type: Index type, either 'Similarity-tfidf-index' or 'Similarity-LSI-index'.
            best_num: See the 'num_best' parameter of Gensim's Similarity class.
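
        # Example (a minimal sketch; genDictionary must have been called first):
            ids, texts, sims = tp.CalSim('煤炭供应偏紧,煤价有望维持高位', 'Similarity-tfidf-index', 3)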
        '''
        if Type == 'Similarity-tfidf-index':
            tfidf = models.TfidfModel(self._BowVecOfEachDoc)  
            tfidfVec = tfidf[self._BowVecOfEachDoc]
            self._num_features = len(self._dictionary.token2id.keys())
            self._similarity = similarities.Similarity(Type, tfidfVec, \
                num_features=self._num_features,num_best=best_num)  
            test_cut_raw = list(jieba.cut(test_document))  
            test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw) 
            self._test_BowVecOfEachDoc = tfidf[test_BowVecOfEachDoc]
        elif Type == 'Similarity-LSI-index':
            lsi_model = models.LsiModel(self._BowVecOfEachDoc, id2word=self._dictionary)  # default num_topics
            corpus_lsi = lsi_model[self._BowVecOfEachDoc]
            self._num_features = len(self._dictionary.token2id.keys())
            self._similarity = similarities.Similarity(Type, corpus_lsi, \
                num_features=self._num_features,num_best=best_num)  
            test_cut_raw = list(jieba.cut(test_document))  
            test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw) 
            self._test_BowVecOfEachDoc = lsi_model[test_BowVecOfEachDoc]
        IdLst = []
        SimRltLst = []
        SimTxLst = []
        for Id, Sim in self._similarity[self._test_BowVecOfEachDoc]:
            IdLst.append(Id)
            SimRltLst.append(Sim)
            SimTxLst.append(self._raw_documents[Id])
        return IdLst,SimTxLst,SimRltLst

    def PrintWordCloud(self, documents, backgroundImgPath, fontPath):
        '''Print out the word cloud of all news(articles/documents).

        # Arguments:
            documents: Overall raw documents.
            backgroundImgPath: Background image path.
            fontPath: Path of the font (e.g. a Windows .ttf file) used to render the word cloud.
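
        # Example (a minimal sketch; both paths are placeholders):
            tp.PrintWordCloud(doc, 'background.jpg', 'C:\\Windows\\Fonts\\simhei.ttf')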
        '''
        import matplotlib.pyplot as plt
        from wordcloud import WordCloud
        corpora_documents = self.jieba_tokenize(documents)  # tokenize
        for k in range(len(corpora_documents)):
            corpora_documents[k] = ' '.join(corpora_documents[k])
        corpora_documents = ' '.join(corpora_documents)
        # scipy.misc.imread was removed in SciPy 1.2; plt.imread reads the mask image instead
        color_mask = plt.imread(backgroundImgPath)  # e.g. "C:\\Users\\lenovo\\Desktop\\Text_Mining\\3.jpg"
        cloud = WordCloud(font_path=fontPath, mask=color_mask, background_color='white',
                          max_words=2000, max_font_size=40)  # e.g. "C:\\Windows\\Fonts\\simhei.ttf"
        word_cloud = cloud.generate(corpora_documents)
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()

if __name__ == '__main__':
    tp = TextProcessing(os.path.join(os.getcwd(), 'Chinese_Stop_Words.txt'),
                        os.path.join(os.getcwd(), 'finance_dict.txt'))
    doc = ['中央、地方支持政策频出,煤炭行业站上了风口 券商研报浩如烟海,投资线索眼花缭乱,第一财经推出\
            《一财研选》产品,挖掘研报精华,每期梳理5条投资线索,便于您短时间内获取有价值的信息。专业团队\
            每周日至每周四晚8点准时“上新”,\
            助您投资顺利!1.中央、地方支持政策频出,这个行业站上了风口!(信达证券)近年来,利好住房租赁\
            市场发展的政策频频发布,顶层设计趋于完善。信达证券指出,2015年以来,住建部、国务院等机构相继出\
            台政策支持住房租赁市场发展,地方积极跟进,试点城市全部出台相关方案支持当地住房租赁市场发展。除\
            此之外,“租购同权”保障承租人享受公共服务的权益,稳定租赁关系,利好长租公寓发展。除政策利好长租\
            公寓外,需求的逐步释放对长租公寓市场形成支撑。信达证券研究发现,人口向核心一、二线城市流动趋势不\
            减,高房价刺激购房需求转向租房需求、首次置业年龄抬升、高校毕业生租房需求增加等因素将刺激长租公寓\
            需求进一步释放。总体而言,住房租赁市场容量逾万亿且具备区域性特征。2017年8月,国土资源部、住房和城\
            乡建设部联合印发《利用集体建设用地建设租赁住房试点方案》,选择13个试点城市推进利用集体建设用地建\
            设租赁住房,各地“只租不售”地块频出,彰显政府发展住房租赁市场决心。类REITs产品盘活租赁资产,解决\
            长租融资痛点,上述举措能够有效增加租赁住房供给。伴随政策利好,多主体纷纷进军住房租赁市场。信达证\
            券指出,截至目前,房企、房地产中介、专业租赁机构、连锁酒店、金融机构和互联网公司均已涉足住宅租赁市\
            场。其中,房企多采用自持物业的重资产运营方式,中介机构及其他公司多以轻资产运营方式为主,从房源获\
            取的角度看,集中与分散并行。信达证券指出,当前我国租赁住房的发展还处于初步阶段,多主体参与、多模式\
            并存。参与各方均凭借自身比较优势切入住房租赁领域。未来,房企、互联网公司、金融机构存在巨大的合作空间。\
            在市场细分的前提下,增值服务的提供将成为住房租赁市场发展的关键。信达证券推荐关注招商蛇口(21.100, \
            -1.43, -6.35%)(001979.SZ)、万科A(31.270, -1.48, -4.52%)(000002.SZ)、世联行(8.700, -0.87,\
             -9.09%)(002285.SZ)、昆百大A(7.510, -0.05, -0.66%)(000560.SZ)、天健集团(9.330, -0.56, -5.66%)\
            (000090.SZ)。2.煤炭库存创八年新低,缺煤升级,高煤价仍将持续(中银国际)截至1月30日,秦皇岛5500大\
            卡山西优混动力煤报755元,跳涨2%,再超预期,并创近6年新高,此轮上涨持续了10周时间,累计涨幅达13%。煤炭\
            行业是本栏重点追踪的行业板块,近期的大涨验证了此前选摘的多家研究机构的观点,今天我们再来看一下中银国际\
            对板块未来表现的分析观点。中银国际指出,六大电厂日耗量周均81万吨,环比增加9%,库存天数由13天下降至10.9天\
            ,为近8年新低,库存下降至899万吨,为近7年新低。缺煤情况非常突出。经济的强韧性叠加寒冷冰雪天气推升需求超预\
            期是主因,供应侧在年关生产积极性不高、运输不畅是辅因,且短期较难明显缓解,2月初地方矿也面临陆续放假,在\
            这种情况下煤价有继续攀高的可能。中银国际认为此轮煤价上涨包含着较多非季节性因素:六大电厂日耗从2017年12月\
            开始同比增幅都在10%以上,这还是在有工业限产的情况下,这是非常高的数字,在2017年7~8月旺季的同比增幅也只\
            有15%左右。经济较好下的需求超预期历来是煤炭股最好的催化剂。尽管2月份由于春节因素可能价格会回落,但在2018\
            年缺煤明显的情况下,幅度不会太大,高煤价还会继续维持。3月初两会召开,安全形势再度紧张,煤炭的供应仍然会偏\
            紧,在叠加3月15日后限产解除,限产解除前后下游补库存,高煤价可能会贯穿整个一季度。中银国际指出,2017年1月秦\
            皇岛煤价均价只有602元,2018年1月的均价为726元,同比增长21%,动力煤公司一季度的业绩大概率会上调。尽管后续煤\
            价调控的压力在加大,但近期效果可能不明显,中期有待观察。煤炭板块2018年市盈率15倍,估值不贵,且存在继续上调\
            盈利预测和估值下行的可能,股价仍有空间。继续推荐动力煤龙头陕西煤业(8.340, -0.77, -8.45%)(601225.SH)、\
            兖州煤业(15.150, -1.24, -7.57%)(600803.SH)、中国神华(24.290, -1.16, -4.56%)(601088.SH),以及优质\
            的国企改革兼并重组题材股潞安环能(11.590, -1.11, -8.74%)(601699.SH)、山西焦化(12.420, -1.38, -10.00%\
            )(600740.SH)、山煤国际(4.520, -0.50, -9.96%)(600546.SH)、阳泉煤业(7.780, -0.86, -9.95%)(600348.SH)\
            。',\
            '郭文仓到重点工程项目督导检查 2月2日,公司党委书记、董事长、总经理郭文仓,公司董事,股份公司副总经理、总工程师、\
            郭毅民,股份公司副总经理张国富、柴高贵及相关单位负责人到焦化厂煤场全封闭和1#—4#干熄焦等重点工程项目建设工地\
            督导检查施工进度和安全工作情况。郭文仓一行实地查看并详细了解了现场施工情况,询问了施工队伍人员状况,他说,\
            煤场全封闭项目和1#—4#干熄焦项目是公司的重点环保项目,一定要力争将重点工程项目建成精品工程、一流环保标杆项目\
            。近日天气寒冷,又临近春节,煤场全封闭项目进入收尾的关键阶段,施工负责人要紧绷安全弦,加强现场安全管理,从细节抓\
            起,消除隐患,确保收尾工作安全稳定顺利。1#—4#干熄焦项目在大面积开工的重要时期,一定要统筹安排项目进度和质量\
            管理,落实好冬季防护措施,管控好每一道施工环节,目前尤其要注重人员的思想状况,做到不安全不施工,保证施工安全和人\
            员人身安全,确保项目“安全无事故、质量全达标、进度按计划、投资不超概、投产即达效、竣工不留尾、审计无问题、廉政建\
            设好”,为公司打造成全国独立焦化旗舰企业奠定坚实的基础。']
    DictPath = os.path.join(os.getcwd(), 'stock_dict_file')
    stockCode = '600740'
    print(DictPath)
    print(os.path.join(DictPath, stockCode, stockCode + '_dict.dict'))
    print(os.path.join(DictPath, stockCode, stockCode + '_bowvec.mm'))
    if not os.path.exists(os.path.join(DictPath, stockCode)):
        os.makedirs(os.path.join(DictPath, stockCode))
    tp.genDictionary(doc, saveDict=True,
                     saveDictPath=os.path.join(DictPath, stockCode, stockCode + '_dict.dict'),
                     saveBowvec=True,
                     saveBowvecPath=os.path.join(DictPath, stockCode, stockCode + '_bowvec.mm'),
                     returnValue=False)