python source code of main

"""
Online LDA with a soft alignment to integrate previous states.
"""

import os
import gc
import resource
import logging
import itertools
import time
import json
import cPickle
import re
import numpy as np
from scipy import sparse
from scipy.stats import entropy
from datetime import datetime, timedelta
import nltk
from nltk.corpus import stopwords
from gensim import corpora
from collections import defaultdict
from gensim.models import Word2Vec, LdaMulticore, TfidfModel
from extractSentenceWords import *
from onlineLDA import *
from config import Config
from extract_phrase import extract_phrases

bigram = None; trigram = None; wv_model = None
my_stoplst = ["app", "good", "excellent", "awesome", "please", "they", "very", "too", "like", "love", "nice", "yeah",
"amazing", "lovely", "perfect", "much", "bad", "best", "yup", "suck", "super", "thank", "great", "really",
"omg", "gud", "yes", "cool", "fine", "hello", "alright", "poor", "plz", "pls", "google", "facebook",
"three", "ones", "one", "two", "five", "four", "old", "new", "asap", "version", "times", "update", "star", "first",
"rid", "bit", "annoying", "beautiful", "dear", "master", "evernote", "per", "line", "oh", "ah", "cannot", "doesnt",
"won't", "dont", "unless", "you're", "aren't", "i'd", "can't", "wouldn't", "around", "i've", "i'll", "gonna", "ago",
"you'll", "you'd", "28th", "gen", "it'll", "vice", "would've", "wasn't", "year", "boy", "they'd", "isnt", "1st", "i'm",
"nobody", "youtube", "isn't", "don't", "2016", "2017", "since", "near", "god"]


# dataset
app_files = Config.get_datasets()
app_files_pre = {}
validate_files = Config.get_validate_files()
candidate_num = Config.get_candidate_num()
topic_num = Config.get_topic_num()
win_size = Config.get_window_size()
bigram_min = Config.get_bigram_min()
trigram_min = Config.get_trigram_min()
info_num = Config.get_info_num()
store_num = Config.get_store_num()
val_index = Config.get_validate_or_not()


def extract_review():
    """
    Extract reviews with time and version stamp
    :return:
    """
    timed_reviews = {}
    num_docs = 0
    num_words = 0
    for apk, app in app_files:
        timed_reviews[apk] = []
        with open(app) as fin:
            lines = fin.readlines()
        for l_id, line in enumerate(lines):
            line = line.strip()
            terms = line.split("******")
            if len(terms) != info_num:
                logging.error("review format error at %s in %s" % (apk, line))
                continue
            if not store_num: ## for ios
                date = terms[3]
                version = terms[4]
            else:             ## for android
                date = terms[2]
                version = terms[3]
            review_o = terms[1]
            review_p, wc = extractSentenceWords(review_o)
            review = list(build_phrase(review_p))
            review = [list(replace_digit(s)) for s in review]
            rate = float(terms[0]) if re.match(r'\d*\.?\d+', terms[0]) else 2.0  # 2.0 is the average rate
            timed_reviews[apk].append({"review": review, "date": date, "rate": rate, "version": version})
            num_docs += 1
            num_words += wc
            if l_id % 1000 == 0:
                logging.info("processed %d docs of %s" % (l_id, apk))
    logging.info("total read %d reviews, %d words."%(num_docs, num_words))
    return timed_reviews

def replace_digit(sent):
    for w in sent:
        if w.isdigit():
            yield '<digit>'
        else:
            yield w

def build_phrase(doc):
    # load phrase model
    return trigram[bigram[doc]]


def update_phrase():
    """
    Update bigram and trigram model
    :return:
    """
    for apk, app in app_files:
        with open(app) as fin:
            lines = fin.readlines()
        for line in lines:
            line = line.strip()
            terms = line.split("******")
            if len(terms) != info_num:
                logging.error("review format error at %s in %s" % (apk, line))
                continue
            review_o = terms[1]
            review_p, wc = extractSentenceWords(review_o)
            bigram.add_vocab(review_p)
            trigram.add_vocab(bigram[review_p])
        # update
        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")

def load_phrase():
    global bigram
    global trigram
    bigram = Phrases.load(os.path.join("..", "model", "bigram.model"))
    trigram = Phrases.load(os.path.join("..", "model", "trigram.model"))

def save_obj(filename, rst):
    with open(filename, 'w') as fout:
        cPickle.dump(rst, fout)
def load_obj(filename):
    with open(filename) as fin:
        return cPickle.load(fin)

def build_AOLDA_input_version(timed_reviews=None):
    """
    build version-aligned input for AOLDA
    :param timed_reviews:
    :return:
    """
    if timed_reviews is None:
        with open("../result/timed_reviews") as fin:
            timed_reviews = json.load(fin)
    stoplist = stopwords.words('english') + my_stoplst

    OLDA_input = {}
    for apk, reviews in timed_reviews.items():
        # build a dictionary to store the version and review
        version_dict = {}
        input = []
        rate = []
        tag = []
        for review in reviews:
            review_ver = review['version']
            if review_ver == "Unknown":
                continue
            if review_ver not in version_dict:
                version_dict[review_ver] = ([], [])
            version_dict[review_ver][0].append(review['review'])
            version_dict[review_ver][1].append(review['rate'])

        # re-arrange the version sequence
        for ver in sorted(version_dict.iterkeys(), key=lambda s: map(int, s.split('.'))):
            if len(version_dict[ver][0]) > 50:          # skip versions with not enough reviews
                tag.append(ver)
                input.append(version_dict[ver][0])
                rate.append(version_dict[ver][1])

        dict_input = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(input))))
        dictionary = corpora.Dictionary(dict_input)
        dictionary.filter_tokens(map(dictionary.token2id.get, stoplist))
        dictionary.compactify()
        dictionary.filter_extremes(no_below=2, keep_n=None)
        dictionary.compactify()

        # for each interval, build bow
        input_X = []
        for t_i, text_period in enumerate(input):
            # construct sparse matrix
            text_period = list(itertools.chain.from_iterable(text_period))      # sentence level to doc level
            row = []
            col = []
            value = []
            r_id = 0
            for k, text in enumerate(text_period):
                empty = True
                for i, j in dictionary.doc2bow(text):
                    row.append(r_id)
                    col.append(i)
                    value.append(j)
                    empty = False
                if not empty:
                    r_id += 1
            input_X.append(sparse.coo_matrix((value, (row, col)), shape=(r_id, len(dictionary))))
        OLDA_input[apk] = (dictionary, input_X, input, rate, tag)      # input: raw input, with time and sent
    return OLDA_input

def generate_labeling_candidates(OLDA_input):
    """
    Filter phrase labels and choose for candidates
    :param OLDA_input:
    :return:
    """
    phrases = {}
    for apk, item in OLDA_input.items():
        dic, _, _1, _2, _3= item
        phrases[apk] = defaultdict(int)
        # filter bigram and trigram
        for word in dic.values():
            if '_' in word:
                phrase = word
                words, tags = zip(*nltk.pos_tag(phrase.split(b'_')))
                match = False
                for tag in tags:
                    if re.match(r"^NN", tag):
                        match = True
                        continue
                    if re.match(r"DT", tag):
                        match = False
                        break
                    if re.match(r"RB", tag):
                        match = False
                        break
                for word in words:
                    if word in stopwords.words('english') + my_stoplst:     # remove stop word
                        match = False
                        break
                    if len(word) < 3:
                        match = False
                        break
                    if "\\'" in word:
                        match = False
                        break
                if match:
                    # keep phrase
                    phrases[apk][phrase] = 1
    return phrases

def OLDA_fit(OLDA_input, n_topics, win_size):
    phis = {}
    theta = {}
    for apk, item in OLDA_input.items():
        dictionary, input_X, _, _1, _2 = item
        olda_model = OLDA(n_topics=n_topics, n_iter=1000, refresh=500, window_size=win_size)
        olda_model.fit(input_X)
        phis[apk] = olda_model.B
        theta[apk] = olda_model.A
        fout = open("../result/topic_words_%s_%s_%s"%(apk, n_topics, win_size), 'w')
        for t_i, phi in enumerate(phis[apk]):
            fout.write("time slice %s\n"%t_i)
            for i, topic_dist in enumerate(phi):
                topic_words = [dictionary[w_id] for w_id in np.argsort(topic_dist)[:-10:-1]]
                fout.write('Topic {}: {}\n'.format(i, ' '.join(topic_words)))
            fout.write('\n')
        fout.close()
    return phis

def count_occurence(dic, rawinput, label_ids):
    count = []
    for d_i, rawinput_i in enumerate(rawinput):
        count_i = defaultdict(int)
        for input in list(itertools.chain.from_iterable(rawinput_i)):
            bow = dic.doc2bow(input)
            for id, value in bow:
                count_i[id] += value
                if id in label_ids[d_i]:
                    for idx, valuex in bow:
                        count_i[id, idx] += min(value, valuex)    # label always first
        count.append(count_i)
    return count

def total_count_(dic, rawinput):
    total_count = []
    for rawinput_i in rawinput:
        total_count_i = 0
        for input in list(itertools.chain.from_iterable(rawinput_i)):
            bow = dic.doc2bow(input)
            for id, value in bow:
                total_count_i += value
        total_count.append(total_count_i)
    return total_count

def get_candidate_label_ids(dic, labels, rawinput):
    all_label_ids = map(dic.token2id.get, labels)
    label_ids = []

    for rawinput_i in rawinput:
        count = defaultdict(int)
        for input in list(itertools.chain.from_iterable(rawinput_i)):
            bow = dic.doc2bow(input)
            for id, value in bow:
                if id in all_label_ids:
                    count[id] += value
        label_ids.append(count.keys())
    return label_ids

def get_candidate_sentences_ids(rawinput, rates):
    sent_ids = []
    sent_rates = []
    index = 0
    for t_i, rawinput_i in enumerate(rawinput):
        sent_id = []
        sent_rate = []
        for i_d, input_d in enumerate(rawinput_i):
            for i_s, input_s in enumerate(input_d):
                if len(input_s) < 5:          # length should be bigger than 5
                    continue
                sent_id.append(index + i_s)
                sent_rate.append(rates[t_i][i_d])
            index += len(input_d)
        sent_ids.append(sent_id)
        sent_rates.append(sent_rate)
    return sent_ids, sent_rates

def get_sensitivities(dic, rawinput, rates, label_ids):
    sensi = []
    for t_i, rawinput_i in enumerate(rawinput):
        sensi_t = []
        label_sensi = [[] for _ in label_ids[t_i]]
        for d_i, input in enumerate(rawinput_i):
            doc_input = list(itertools.chain.from_iterable(input))
            bow = dic.doc2bow(doc_input)
            for id, value in bow:
                if id in label_ids[t_i]:
                    label_sensi[label_ids[t_i].index(id)].append([rates[t_i][d_i], len(doc_input)]) # record the rate and length
        for rl in label_sensi:
            rl = np.array(rl)
            m_rl = np.mean(rl, 0)
            sensi_t.append(np.exp(- m_rl[0]/np.log(1+m_rl[1])))
        sensi.append(np.array(sensi_t))
    return sensi

def get_sensitivities_sent(rawinput_sent, sent_rates, sent_ids):
    sensi = []
    for t_i, sent_id in enumerate(sent_ids):
        sensi_i = []
        for id, s_id in enumerate(sent_id):
            r = sent_rates[t_i][id]
            l = len(rawinput_sent[s_id])
            sensi_i.append(np.exp(- r / float(np.log(l))))
        sensi.append(np.array(sensi_i))
    return sensi


def JSD(P, Q):
    """
    Jensen-Shannon divergence
    :param P:
    :param Q:
    :return:
    """
    _M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, _M) + entropy(Q, _M))

def sim_topic_word(phi, label_id, count):
    # sim = 0
    c_l = np.array([np.log((count[label_id, w_id] + 1) / float((count[w_id] + 1) * (count[label_id] + 1))) for w_id in range(len(phi))])
    return np.dot(phi, c_l)

def topic_labeling(OLDA_input, apk_phis, phrases, mu, lam, theta, save=True):
    """
    Topic labeling for phrase and sentence
    :param OLDA_input:
    :param apk_phis:
    :param phrases:
    :param mu:
    :param lam:
    :param theta:
    :param save:
    :return:
    """
    logging.info("labeling topics(mu: %f, lam: %f, theta: %f)......" % (mu, lam, theta))
    apk_jsds = {}
    for apk, item in OLDA_input.items():
        dictionary, _, rawinput, rates, tag = item
        phis = apk_phis[apk]
        labels = phrases[apk].keys()
        # label_ids = map(dictionary.token2id.get, labels)
        label_ids = get_candidate_label_ids(dictionary, labels, rawinput)
        count = count_occurence(dictionary, rawinput, label_ids)
        total_count = total_count_(dictionary, rawinput)
        sensi_label = get_sensitivities(dictionary, rawinput, rates, label_ids)
        rawinput_sent = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(rawinput))))
        sent_ids, sent_rates = get_candidate_sentences_ids(rawinput, rates)
        sensi_sent = get_sensitivities_sent(rawinput_sent, sent_rates, sent_ids)
        jsds = []
        label_phrases = []; label_sents = []; emerge_phrases = []; emerge_sents = []
        if save:
            result_path="../result/%s"%apk
            if not os.path.exists(result_path):
                os.makedirs(result_path)
            fout_labels = open(os.path.join(result_path, "topic_labels"), 'w')
            fout_emerging = open(os.path.join(result_path, "emerging_topic_labels"), 'w')
            fout_sents = open(os.path.join(result_path, "topic_sents"), "w")
            fout_emerging_sent = open(os.path.join(result_path, "emerging_topic_sents"), 'w')
            fout_topic_width = open(os.path.join(result_path, "topic_width"), 'w')

        for t_i, phi in enumerate(phis):
            # label topic
            logging.info("labeling topic at %s slice of %s" % (t_i, apk))
            topic_label_scores = topic_labeling_(count[t_i], total_count[t_i], label_ids[t_i], sensi_label[t_i], phi, mu, lam)
            topic_label_sent_score = topic_label_sent(dictionary, phi, rawinput_sent, sent_ids[t_i], sensi_sent[t_i], mu, lam)

            # write to file: topic phrase
            if save:
                fout_labels.write("time slice %s, tag: %s\n"%(t_i, tag[t_i]))
                for tp_i, label_scores in enumerate(topic_label_scores):
                    fout_labels.write("Topic %d:"%tp_i)
                    for w_id in np.argsort(label_scores)[:-candidate_num-1:-1]:
                        fout_labels.write("%s\t%f\t" % (dictionary[label_ids[t_i][w_id]], label_scores[w_id]))
                    fout_labels.write('\n')

                fout_sents.write("time slice %s, tag: %s\n" % (t_i, tag[t_i]))
                for tp_i, sent_scores in enumerate(topic_label_sent_score):
                    fout_sents.write("Topic %d:"%tp_i)
                    for s_id in np.argsort(sent_scores)[:-candidate_num-1:-1]:
                        fout_sents.write("%s\t%f\t"%(" ".join(rawinput_sent[sent_ids[t_i][s_id]]), sent_scores[s_id]))
                    fout_sents.write('\n')

            # store for verification
            label_phrases_ver = []; label_sents_ver = []
            for tp_i, label_scores in enumerate(topic_label_scores):
                label_phrases_ver.append(
                    [dictionary[label_ids[t_i][w_id]] for w_id in np.argsort(label_scores)[:-candidate_num-1:-1]])
            label_phrases.append(list(itertools.chain.from_iterable(label_phrases_ver)))
            for tp_i, sent_scores in enumerate(topic_label_sent_score):
                label_sents_ver.append(
                    [rawinput_sent[sent_ids[t_i][s_id]] for s_id in np.argsort(sent_scores)[:-candidate_num-1:-1]])
            label_sents.append(list(itertools.chain.from_iterable(label_sents_ver)))

            # detect emerging topic
            logging.info("detecting topic at %s slice of %s" % (t_i, apk))
            if save and t_i == 0:
                topic_width = count_width(dictionary, label_phrases_ver, count[t_i], sensi_label[t_i], label_ids[t_i])
                for theta in topic_width:
                    fout_topic_width.write("%f\t" % theta)
                fout_topic_width.write("\n")
                continue   # skip the first epoch
            emerging_label_scores, emerging_sent_scores = topic_detect(rawinput_sent, dictionary, phi, phis[t_i-1], count[t_i], count[t_i-1], total_count[t_i],
                                                 total_count[t_i-1], label_ids[t_i], sent_ids[t_i], sensi_label[t_i], sensi_sent[t_i], jsds, theta, mu, lam)
            # write to file
            if save:
                fout_emerging.write("time slice %s, tag: %s\n"%(t_i, tag[t_i]))
                for tp_i, label_scores in enumerate(emerging_label_scores):
                    fout_emerging.write("Topic %d: "%tp_i)
                    if np.sum(label_scores) == 0:
                        fout_emerging.write('None\n')
                    else:
                        for w_id in np.argsort(label_scores)[:-4:-1]:
                            fout_emerging.write("%s\t%f\t" % (dictionary[label_ids[t_i][w_id]], label_scores[w_id]))
                        fout_emerging.write('\n')
                fout_emerging_sent.write("time slice %s, tag: %s\n"%(t_i, tag[t_i]))
                for tp_i, sent_scores in enumerate(emerging_sent_scores):
                    fout_emerging_sent.write("Topic %d: "%tp_i)
                    if np.sum(sent_scores) == 0:
                        fout_emerging_sent.write('None\n')
                    else:
                        for s_id in np.argsort(sent_scores)[:-4:-1]:
                            fout_emerging_sent.write("%s\t%f\t" % (" ".join(rawinput_sent[sent_ids[t_i][s_id]]), sent_scores[s_id]))
                        fout_emerging_sent.write('\n')
            # store for verification
            emerge_phrases_ver = []; emerge_sents_ver = []
            emerge_phrases_width_ver = []
            for tp_i, label_scores in enumerate(emerging_label_scores):
                if np.sum(label_scores) == 0:
                    emerge_phrases_width_ver.append([])
                    continue
                emerge_phrases_ver.append(
                    [dictionary[label_ids[t_i][w_id]] for w_id in np.argsort(label_scores)[:-4:-1]])
                emerge_phrases_width_ver.append(
                    [dictionary[label_ids[t_i][w_id]] for w_id in np.argsort(label_scores)[:-4:-1]])
            emerge_phrases.append(emerge_phrases_ver)
            # merge emerge to label
            label_emerge_ver = [set(l)|set(e) for l, e in zip(label_phrases_ver, emerge_phrases_width_ver)]
            topic_width = count_width(dictionary, label_emerge_ver, count[t_i], sensi_label[t_i], label_ids[t_i])
            for tp_i, sent_scores in enumerate(emerging_sent_scores):
                if np.sum(sent_scores) == 0:
                    continue
                emerge_sents_ver.append(
                    [rawinput_sent[sent_ids[t_i][s_id]] for s_id in np.argsort(sent_scores)[:-4:-1]])
            emerge_sents.append(emerge_sents_ver)
            # write topic width
            if save:
                for theta in topic_width:
                    fout_topic_width.write("%f\t" % theta)
                fout_topic_width.write("\n")

        ############################################
        if val_index:
            validation(validate_files[apk], label_phrases, label_sents, emerge_phrases, emerge_sents)
        ############################################

        if save:
            fout_labels.close()
            fout_sents.close()
            fout_emerging.close()
            fout_emerging_sent.close()
            fout_topic_width.close()
        apk_jsds[apk] = jsds
    return apk_jsds

def topic_labeling_(count, total_count, label_ids, sensi, phi, mu, lam):
    topic_label_scores = rank_topic_label(count, total_count, phi, label_ids, mu)
    topic_label_scores += lam * sensi
    return topic_label_scores

# rank the label according to similarity of the topic dist and divergence of other topic dist
def rank_topic_label(count, total_count, phi, label_ids, mu=0.2):
    # matrix implementation for speed-up
    # construct topic matrix
    mu_div = mu / (len(phi) - 1)
    c_phi = phi * (1 + mu_div) - np.sum(phi, 0) * mu_div
    # construct label count matrix
    c_label_m = np.empty((len(label_ids), len(phi[0])), dtype=float)
    for ind, label_id in enumerate(label_ids):
        for w_id in range(len(phi[0])):
            c_label_m[ind, w_id] = count.get((label_id, w_id)) * total_count / float((count.get(w_id) + 1) * (count.get(label_id) + 1)) if (label_id, w_id) in count else 1.0
    c_label_m = np.log(c_label_m)
    # compute score matrix
    topic_label_scores = np.dot(c_phi, np.transpose(c_label_m))
    return topic_label_scores

def topic_detect(rawinput_sents, dic, phi, last_phi, count, last_count, total_count, last_total_count, label_ids, sent_ids, sensi_label, sensi_sent, jsds, theta, mu, lam):
    # matrix implementation for speed-up
    # construct count label matrix
    c_label_m = np.empty((len(label_ids), len(phi[0])), dtype=float)
    c_last_label_m = np.empty((len(label_ids), len(phi[0])), dtype=float)
    for ind, label_id in enumerate(label_ids):
        for w_id in range(len(phi[0])):
            c_label_m[ind, w_id] = count.get((label_id, w_id)) * total_count / float((count.get(w_id) + 1) * (count.get(w_id) + 1)) if (label_id, w_id) in count else 1.0
            c_last_label_m[ind, w_id] = last_count.get((label_id, w_id)) * last_total_count / float((last_count.get(w_id) + 1) * (last_count.get(label_id) + 1)) if (label_id, w_id) in last_count else 1.0

    c_label_m = np.log(c_label_m)
    c_last_label_m = np.log(c_last_label_m)
    # construct sentence count matrix
    sent_count = np.empty((len(sent_ids), len(phi[0])), dtype=float)
    for ind, s_id in enumerate(sent_ids):
        bow = dic.doc2bow(rawinput_sents[s_id])
        len_s = len(rawinput_sents[s_id])
        for w_id in range(len(phi[0])):
            sent_count[ind, w_id] = 0.00001
        for k, v in bow:
            sent_count[ind, k] = v / float(len_s)
    # # construct residuals
    # phi_logphi = np.log(phi) * phi
    # phi_logphi_last = np.log(last_phi) * last_phi

    # read topic distribution \phi
    emerging_label_scores_rst = np.zeros((len(phi), len(label_ids)))
    emerging_sent_scores_rst = np.zeros((len(phi), len(sent_ids)))
    js_d = []
    for t_i, phi_i in enumerate(phi):
        # labeling
        js_divergence = JSD(phi_i, last_phi[t_i])
        js_d.append(js_divergence)
        jsds.append(js_divergence)
        # logging.info("JSD for phi is %f"%js_divergence)
    # compute mean and variance of jsds
    js_mean = np.mean(jsds[:-3*len(phi)-1:-1])
    js_std = np.std(jsds[:-3*len(phi)-1:-1])
    # logging.info("JSD threshold is %f"%(js_mean+1.25*js_std))
    emerging_index = np.array(js_d) > js_mean + 1.25*js_std
    # TOPIC DETECT
    phi_e = phi[emerging_index]
    phi_last_e = last_phi[emerging_index]
    E = float(np.sum(emerging_index))
    if E == 0:
        return emerging_label_scores_rst, emerging_sent_scores_rst
    # TOPIC DETECT: construct phi - last_phi
    phi_m = (1 + mu/E) * phi_e - theta * last_phi[emerging_index] - mu/E * np.sum(phi_e, 0)
    # TOPIC DETECT: construct residuals
    residuals_m = (1 + mu/E) * np.log(phi_e) * phi_e - theta * np.log(phi_last_e) * phi_last_e - mu/E * np.sum(np.log(phi_e) * phi_e, 0)
    # TOPIC DETECT: compute labels
    emerging_label_scores = np.dot((1 + mu/E) * phi_e - mu/E * np.sum(phi_e, 0), np.transpose(c_label_m)) - theta * np.dot(last_phi[emerging_index], np.transpose(c_last_label_m)) + lam * sensi_label
    emerging_sent_scores = np.dot(phi_m, np.transpose(np.log(sent_count))) - np.sum(residuals_m, 1, keepdims=True) + lam * sensi_sent

    emerging_label_scores_rst[emerging_index] = emerging_label_scores
    emerging_sent_scores_rst[emerging_index] = emerging_sent_scores

    return emerging_label_scores_rst, emerging_sent_scores_rst

# rank sentence representation for topic
def topic_label_sent(dic, phi, rawinput_sents, sent_ids, sensi, mu, lam):
    # construct topic matrix
    mu_div = mu / (len(phi) - 1)
    c_phi = phi * (1 + mu_div) - np.sum(phi, 0) * mu_div
    # construct residual
    phi_logphi = phi * np.log(phi)
    residual_1 = mu_div * np.sum(phi_logphi)        # residual_1 is a value
    residual_2 = (1 + mu_div) * np.sum(phi_logphi, 1, keepdims=True)    # residual_2 is a n_topic*1
    # construct sentence count matrix
    sent_count = np.empty((len(sent_ids), len(phi[0])), dtype=float)
    for ind, s_id in enumerate(sent_ids):
        bow = dic.doc2bow(rawinput_sents[s_id])
        len_s = len(rawinput_sents[s_id])
        for w_id in range(len(phi[0])):
            sent_count[ind, w_id] = 0.00001
        for k, v in bow:
            sent_count[ind, k] = v / float(len_s)

    phi_sent = np.dot(c_phi, np.transpose(np.log(sent_count))) + residual_1 - residual_2 + lam * sensi
    return phi_sent

def count_width(dictionary, label_phrases_ver, counts, sensi_labels, label_ids):
    count_width_rst = []
    for phrases in label_phrases_ver:
        t_count = 0
        for phrase in phrases:
            pid = dictionary.token2id.get(phrase)
            t_count += np.log(counts.get(pid)+1) * sensi_labels[label_ids.index(pid)]
        count_width_rst.append(t_count)
    return np.array(count_width_rst)

def validation(logfile, label_phrases, label_sents, emerge_phrases, emerge_sents):
    # read changelog
    clog = []
    with open(logfile) as fin:
        for line in fin.readlines():
            line = line.strip()
            issue_kw = map(lambda s: s.strip().split(), line.split(","))
            clog.append(issue_kw)
    # check alignment
    if len(clog) != len(label_phrases):
        logging.error("length not corrected: %d, %d"%(len(clog), len(label_phrases)))
        exit(0)
    # compare topic label using keyword
    # load word2vec model
    wv_model = Word2Vec.load(os.path.join("..", "model", "wv", "word2vec_app.model"))
    label_phrase_precisions = []; label_phrase_recalls = []; label_sent_precisions = []; label_sent_recalls = []
    em_phrase_precisions = []; em_phrase_recalls = []; em_sent_precisions = []; em_sent_recalls = []
    # two list: [['keyword1', 'keyword2', ...], ['keyword1', 'keyword2', ...]]
    #           [['label1', 'label2', ...], ['label1', 'label2', ...]]
    for id, ver in enumerate(clog):
        if ver == [[]]: # skip the empty version changelog
            continue
        label_phrase_match_set = set(); label_phrase_issue_match_set = set(); label_sent_match_set = set(); label_sent_issue_match_set = set()
        em_phrase_match_set = set(); em_phrase_issue_match_set = set(); em_sent_match_set = set(); em_sent_issue_match_set = set()

        if id != len(clog) - 1 and clog[id+1] != [[]]:         # merge changelog with next version
            m_ver = ver + clog[id+1]
        else:
            m_ver = ver
        # phrase
        for issue in m_ver:
            for kw in issue:
                kw_match = False
                for w in label_phrases[id]:
                    label_match = False
                    for w_s in w.split("_"):
                        if sim_w(kw, w_s, wv_model) > 0.6:
                            # hit
                            #logging.info("hit: %s -> %s"%(w, kw))
                            label_match = True
                            kw_match = True
                            break
                    if label_match: # if label match found, add label to match set
                        label_phrase_match_set.add(w)
                if kw_match:    # if kw match found, add issue to match set
                    label_phrase_issue_match_set.add("_".join(issue))

        # sentence
        for issue in m_ver:
            for kw in issue:
                kw_match = False
                for sent in label_sents[id]:
                    for w in sent:
                        label_match = False
                        for w_s in w.split("_"):
                            if sim_w(kw, w_s, wv_model) > 0.6:
                                # hit
                                #logging.info("hit: %s -> %s"%(w, kw))
                                label_match = True
                                kw_match = True
                                break
                        if label_match:
                            label_sent_match_set.add("_".join(sent))   # if label match found, skip to next sentence
                            break
                if kw_match:
                    label_sent_issue_match_set.add("_".join(issue))

        # check emerging issue label
        # merge current version and next version
        # if id != len(clog) - 1:
        #     m_ver = ver + clog[id+1]
        # else:
        #     m_ver = ver
        if id != 0:     # skip the first epoch
            for issue in m_ver:
                for kw in issue:
                    kw_match = False
                    for tws in emerge_phrases[id-1]:
                        for w in tws:
                            label_match = False
                            for w_s in w.split("_"):
                                if sim_w(kw, w_s, wv_model) > 0.6:
                                    # hit
                                    #logging.info("hit: %s -> %s" % (w, kw))
                                    label_match = True
                                    kw_match = True
                                    break
                            if label_match:
                                em_phrase_match_set.add("_".join(tws))
                                break
                    if kw_match:
                        em_phrase_issue_match_set.add("_".join(issue))

            # sentence
            for issue in m_ver:
                for kw in issue:
                    kw_match = False
                    for tsents in emerge_sents[id-1]:
                        sent = list(itertools.chain.from_iterable(tsents))
                        label_match = False
                        for w in sent:
                            for w_s in w.split("_"):
                                if sim_w(kw, w_s, wv_model) > 0.6:
                                    # hit
                                    #logging.info("hit: %s -> %s" % (w, kw))
                                    label_match = True
                                    kw_match = True
                                    break
                            if label_match:
                                em_sent_match_set.add("_".join(sent))  # if label match found, skip to next sentence
                                break
                    if kw_match:
                        em_sent_issue_match_set.add("_".join(issue))

        # compute
        label_phrase_precision = len(label_phrase_match_set) / float(len(label_phrases[id]))
        label_phrase_recall = len(label_phrase_issue_match_set) / float(len(m_ver))
        label_sent_precision = len(label_sent_match_set) / float(len(label_sents[id]))
        label_sent_recall = len(label_sent_issue_match_set) / float(len(m_ver))
        label_phrase_precisions.append(label_phrase_precision)
        label_phrase_recalls.append(label_phrase_recall)
        label_sent_precisions.append(label_sent_precision)
        label_sent_recalls.append(label_sent_recall)

        if id != 0:
            if len(emerge_phrases[id-1]) != 0:
                em_phrase_precision = len(em_phrase_match_set) / float(len(emerge_phrases[id-1]))
                em_phrase_precisions.append(em_phrase_precision)
            em_phrase_recall = len(em_phrase_issue_match_set) / float(len(ver))
            if len(emerge_sents[id-1]) != 0:
                em_sent_precision = len(em_sent_match_set) / float(len(emerge_sents[id-1]))
                em_sent_precisions.append(em_sent_precision)
            em_sent_recall = len(em_sent_issue_match_set) / float(len(ver))
            em_phrase_recalls.append(em_phrase_recall)
            em_sent_recalls.append(em_sent_recall)
    label_phrase_fscore = 2 * np.mean(label_phrase_recalls) * np.mean(em_phrase_precisions) / (np.mean(label_phrase_recalls) + np.mean(em_phrase_precisions))
    label_sent_fscore = 2 * np.mean(label_sent_recalls) * np.mean(em_sent_precisions) / (np.mean(label_sent_recalls) + np.mean(em_sent_precisions))
    logging.info("Phrase label precision: %s\trecall: %f"%(np.mean(label_phrase_precisions), np.mean(label_phrase_recalls)))
    logging.info("Sentence label precision: %s\trecall: %f" % (np.mean(label_sent_precisions), np.mean(label_sent_recalls)))
    logging.info(
        "Emerging phrase precision: %s\trecall: %f" % (np.mean(em_phrase_precisions), np.mean(em_phrase_recalls)))
    logging.info(
        "Emerging sentence precision: %s\trecall: %f" % (np.mean(em_sent_precisions), np.mean(em_sent_recalls)))
    logging.info("Phrase F1 score: %f"%label_phrase_fscore)
    logging.info("Sentence F1 score: %f" % label_sent_fscore)
    with open("../result/statistics.txt", "a") as fout:
        fout.write("%s\t%f\t%f\t%f\t%f\t%f\t%f\n"%(logfile, np.mean(label_phrase_recalls), np.mean(label_sent_recalls), np.mean(em_phrase_precisions), np.mean(em_sent_precisions), label_phrase_fscore, label_sent_fscore))

def sim_w(w1, w2, wv_model):
    if w1 not in wv_model or w2 not in wv_model:
        return 0.0
    return wv_model.similarity(w1, w2)

def meminfo(str):
    print(str, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000)

def save_phrase(review_path, bigram_num, trigram):
    for apk, app in app_files:
        extract_phrases(app)

if __name__ == '__main__':
    extract_phrases(app_files, bigram_min, trigram_min)
    load_phrase()

    timed_reviews = extract_review()

    OLDA_input = build_AOLDA_input_version(timed_reviews)
    start_t = time.time()
    apk_phis = OLDA_fit(OLDA_input, topic_num, win_size)
    phrases = generate_labeling_candidates(OLDA_input)
    topic_labeling(OLDA_input, apk_phis, phrases, 1.0, 0.75, 0.0, save=True)
    print("Totally takes %.2f seconds" % (time.time() - start_t))