# -*- coding: utf-8 -*-
"""
Created on Fri Aug 10 09:51:06 2018

P.S. Make sure the classifier has been initialized before computing senti-scores.
     Filenames that carry model parameters separate them from the base name
     with a space, e.g. '<name> <Min_count>_<Size>.txt'.

@author: HP
"""

import os
from NLP import NLP
import pandas as pd
import multiprocessing

def is_cut_file(filename):
    # Cut files are named '<title>_cut.txt' (see score_of_article below).
    # splitext is safer than split('.') when the directory part contains dots.
    return os.path.splitext(filename)[0].endswith('cut')
    
def remove_stopwords_in_valid_word(valid_word_filename):
    # Rewrite the valid-word file in place, dropping every stopword.
    with open(NLP.stopwords_txt,'r',encoding='utf-8') as fp:
        stopwords=set(line.strip() for line in fp)
    with open(valid_word_filename,'r',encoding='utf-8') as fp:
        valid_word=fp.readlines()
    # Compare without the trailing newline; otherwise the membership test
    # silently fails on the last (newline-free) line of either file.
    valid_word_new=[word for word in valid_word if word.strip() not in stopwords]
    with open(valid_word_filename,'w',encoding='utf-8') as fp:
        fp.writelines(valid_word_new)
    
class Senti:
    common_word_filename=r'D:\Mainland\Campus Life\ZXtrainee\data\common_words.txt'
    article_dir=r'D:\Mainland\Campus Life\ZXtrainee\article'
    
    nlp_vector_filename=r'D:\Mainland\Campus Life\ZXtrainee\data\nlp_vector\common_words_vector 0_0.txt'
    valid_word_filename=r'D:\Mainland\Campus Life\ZXtrainee\data\valid_word_valid_words 0_0.txt' # words that exist in the word2vec model
    word_score_filename=r'D:\Mainland\Campus Life\ZXtrainee\data\word_score\common_words_score 0_0.txt'
    
    # Parameters are encoded after the space in the filename: '<name> <min_count>_<size>.txt'.
    w2v_mincount=nlp_vector_filename.split(' ')[-1].split('_')[0]
    w2v_size=nlp_vector_filename.split('_')[-1].split('.')[0]
    word_classifier=dict()
    
    def __init__(self, Min_count=0, Size=0):
        self.nlp=NLP()
        try:
            self.renew_model(Min_count, Size)
        except Exception:
            # The score file for these parameters may not exist yet; the
            # classifier stays empty until renew_model() succeeds.
            pass
        
        
    def renew_model(self, Min_count, Size):
        self.set_model_parameters(Min_count, Size)
        self.renew_word_score(Min_count, Size)


    def set_model_parameters(self, Min_count, Size):
        # Filenames carry the parameters after a space, so replacing the part
        # after the last space switches all three files to the chosen model.
        suffix=str(Min_count) + '_' + str(Size) + '.txt'
        self.nlp_vector_filename=' '.join(self.nlp_vector_filename.split(' ')[:-1]) + ' ' + suffix
        self.valid_word_filename=' '.join(self.valid_word_filename.split(' ')[:-1]) + ' ' + suffix
        self.word_score_filename=' '.join(self.word_score_filename.split(' ')[:-1]) + ' ' + suffix
    
    
    def renew_word_score(self, Min_count, Size):
        common_words=pd.read_csv(self.word_score_filename)
        self.word_classifier = dict(zip(common_words.iloc[:,0], common_words.iloc[:,1]))
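        # Assumed layout of word_score_filename (as written by
        # score_of_common_words below): the index column holds the word and
        # the single data column holds its score, e.g. (hypothetical words)
        #
        #     ,0
        #     word_a,0.83
        #     word_b,-0.41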
        
        
    def get_topn_topm(self, s1, s2, n=10, m=3):
        """
        From the top-n labels by word2vec similarity (s1, index '<label>w2v'),
        pick the top-m by WordNet similarity (s2, index '<label>wn').
        Falls back to the first m w2v labels when every wn score is zero.
        Returns the bare label names (without the 'w2v'/'wn' suffix).
        """
        s1_sorted=s1.sort_values(ascending=False)
        s1topn_index=s1_sorted.index[:n]
        d=dict()
        for i in s1topn_index:
            d[i[:-3]]=s2[i[:-3]+'wn'] # i[:-3] strips the 'w2v' suffix
        s=pd.Series(d)
        s_sorted=s.sort_values(ascending=False)
        l=len(s_sorted[s_sorted!=0])
        if l==0:
            return [i[:-3] for i in s1topn_index[:m]]
        elif l<m:
            return s_sorted.index[:l]
        else:
            return s_sorted.index[:m]
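
    # Illustrative toy run of get_topn_topm (hypothetical labels, n=2, m=1):
    #
    #     s1 = pd.Series({'upw2v': 0.9, 'downw2v': 0.7, 'flatw2v': 0.1})
    #     s2 = pd.Series({'upwn': 0.0, 'downwn': 0.5, 'flatwn': 0.2})
    #
    # The top-2 labels by w2v similarity are 'up' and 'down'; among those,
    # only 'down' has a non-zero wn score, so Index(['down']) is returned.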
            
    
    def score_of_common_words(self, Min_count, Size, saveflag=1, savefilename=''):
        """
        Calculate the sentiment scores of the common words and, if saveflag
        is set, save them to savefilename (make sure savefilename is set).
        """
        self.set_model_parameters(Min_count, Size)
        table=pd.read_csv(self.nlp_vector_filename)
        #table=table.abs() # take the absolute value of the cosine similarity directly
        result=['']*table.shape[0]
        score=[0]*table.shape[0]
        label_num=(table.shape[1]-1)//2 # half of the columns are w2v, half are wn
        for i in range(table.shape[0]):
            w2v=table.iloc[i,1:label_num+1]
            wn=table.iloc[i,label_num+1:2*label_num+1]
            result[i]=self.get_topn_topm(w2v, wn, n=9, m=3) # an Index of label strings
            for reword in result[i]:
                score[i]+=table.loc[i, reword+'w2v']*self.nlp.Label_dict[reword]
            score[i]/=len(result[i])

        if saveflag:
            try:
                fp=open(self.valid_word_filename,'r',encoding='utf-8')
                txtlist=fp.readlines()
            except UnicodeDecodeError:
                fp=open(self.valid_word_filename,'r',encoding='gbk')
                txtlist=fp.readlines()
            valid_words=[t.rstrip('\n') for t in txtlist]
            fp.close()
            rawdata=pd.DataFrame(score, valid_words)
            rawdata.to_csv(savefilename, encoding='gbk')
    
    
    def score_of_article(self, article_filename, mode='text'):
        title=os.path.split(article_filename)[1].split('.')[0]
        try_cut_file=os.path.join(os.path.split(article_filename)[0],title)+'_cut.txt'
        if mode=='text':
            if os.path.exists(try_cut_file): # if a cut file already exists, process it directly
                words = self.nlp.txt2wordbag(try_cut_file, cutflag=False, remove_stopwords=False)
            else:
                words = self.nlp.txt2wordbag(article_filename, cutflag=True, remove_stopwords=True)
        elif mode=='title':
            words = self.nlp.title2wordbag(title, remove_stopwords=True)
        else:
            raise ValueError("mode must be 'text' or 'title'")
        senti_score = 0
        count=0 # words missing from the classifier
        for i in words:
            x = self.word_classifier.get(i)
            if x is None:
                count += 1
            else:
                senti_score += x
        # Average over the known words only, guarding against an empty bag.
        known = len(words) - count
        return (senti_score/known if known else 0), title
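
    # Example (hypothetical filename and parameters) -- score one article
    # body with a previously initialized classifier:
    #
    #     senti = Senti(0, 100)
    #     score, title = senti.score_of_article(
    #         os.path.join(Senti.article_dir, '2018-08-01', 'example.txt'))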
    
    
    def p_score_of_article(self, args):
        """
        Worker for multiprocessing.Pool.map, which passes a single argument:
        unpack an (article_filename, mode) tuple and return (score, title).
        """
        article_filename, mode = args
        return self.score_of_article(article_filename, mode)
    
    
    def score_of_date(self, date='2018-08-01'):
        """
        Returns
            double          mean article score for the day
            list of tuple   info: (title, score) per article
        """
        senti_score=0
        articles = os.listdir(os.path.join(self.article_dir, date))
        info = []
        count = 0
        for article in articles:
            if is_cut_file(os.path.join(self.article_dir, date, article)):
                continue
            score, title = self.score_of_article(os.path.join(self.article_dir, date, article))
            senti_score += score
            info.append((title, score))
            count += 1
        # Average over the scored articles only; the raw listing also
        # contains the '_cut' files, which were skipped above.
        return (senti_score/count if count else 0), info
    
    
    def p_score_of_date(self, date='2018-08-01'):
        """
        Parallel version of score_of_date. It seems that it cannot work:
        Pool.map has to pickle the bound method (and this Senti instance),
        which may fail depending on the NLP object.
        Returns
            list of tuple   (score, title) per article
        """
        articles = os.listdir(os.path.join(self.article_dir, date))
        pool_arg = []
        for article in articles:
            if is_cut_file(os.path.join(self.article_dir, date, article)):
                continue
            # Pool.map passes one argument per call, so bundle the
            # filename and the mode into a tuple.
            pool_arg.append((os.path.join(self.article_dir, date, article), 'text'))
        p=multiprocessing.Pool(multiprocessing.cpu_count())
        result = p.map(self.p_score_of_article, pool_arg)
        p.close()
        p.join()
        return result
        """for article in articles:
            if is_cut_file(os.path.join(self.article_dir, date, article)):
                continue
            t=threading.Thread(target=self.p_score_of_article, args=(os.path.join(self.article_dir, date, article), info))
            th.append(t)
            t.start()
            count += 1
        for t in th:
            t.join()
        for i in range(count):
            senti_score_date += info[i][0]
        return senti_score_date/count, info"""

    
    def calculate_scores_of_all(self, saveflag=0, savefilename=''):
        dates = os.listdir(self.article_dir)
        all_date_score=[]
        for date in dates:
            try:
                score,info=self.score_of_date(date)
                all_date_score.append((date,score))
            except Exception:
                # Skip entries that are not scoreable date directories.
                continue
        if saveflag:
            rawdata=pd.DataFrame(all_date_score)
            rawdata.to_csv(savefilename)
        return all_date_score,dates
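

if __name__ == '__main__':
    # Minimal usage sketch (assumes the data files configured above exist
    # and that NLP provides the methods used in this module).
    senti = Senti(Min_count=0, Size=0)
    day_score, info = senti.score_of_date('2018-08-01')
    print('senti-score for 2018-08-01:', day_score)
    for title, score in info:
        print('   ', title, score)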