#!/usr/bin/python env
# -*- coding: utf-8 -*-
import MySQLdb
import MySQLdb.cursors
import numpy as np
__author__ = 'Huang yi'

divide_before_sql = '''INSERT INTO train_user_before
(user_id,item_id,behavior_type,user_geohash,item_catagory, daydiff)
SELECT user_id,item_id,behavior_type,user_geohash,item_catagory,DATEDIFF(time, '2014-11-18') AS daydiff
FROM train_user WHERE time<= '%s' '''

divide_after_sql = '''INSERT INTO train_user_after
(user_id,item_id,behavior_type,user_geohash,item_catagory, daydiff)
SELECT user_id,item_id,behavior_type,user_geohash,item_catagory,DATEDIFF(time, '2014-11-18') AS daydiff
FROM train_user WHERE time> '%s' '''

merge_look_sql = '''INSERT INTO user_look
(user_id, item_id, look)
SELECT user_id, item_id, group_concat(daydiff) as look
FROM train_user_before WHERE behavior_type=1 GROUP BY user_id, item_id '''

merge_store_sql = '''INSERT INTO user_store
(user_id, item_id, store)
SELECT user_id, item_id, group_concat(daydiff) as store
FROM train_user_before WHERE behavior_type=2 GROUP BY user_id, item_id '''

merge_cart_sql = '''INSERT INTO user_cart
(user_id, item_id, cart)
SELECT user_id, item_id, group_concat(daydiff) as cart
FROM train_user_before WHERE behavior_type=3 GROUP BY user_id, item_id '''

merge_buy_sql = '''INSERT INTO user_buy
(user_id, item_id, buy)
SELECT user_id, item_id, group_concat(daydiff) as buy
FROM train_user_before WHERE behavior_type=4 GROUP BY user_id, item_id '''

# 四个表合并太慢,先建立索引。这里应该用outer join,但mysql没有。后面再补充。
merge_all_behavoir = '''INSERT INTO user_features
(user_id, item_id, look, store, cart, buy)
SELECT user_look.user_id, user_look.item_id,
user_look.look, user_store.store, user_cart.cart, user_buy.buy
FROM user_look
LEFT join  user_store ON user_look.user_id = user_store.user_id AND
                        user_look.item_id = user_store.item_id
LEFT join  user_cart  ON user_look.user_id = user_cart.user_id AND
                        user_look.item_id = user_cart.item_id
LEFT join  user_buy   ON user_look.user_id = user_buy.user_id AND
                        user_look.item_id = user_buy.item_id'''

class TrainModel(object):
    def __init__(self):
        self.db = MySQLdb.connect("localhost", "root", "199194", "tianchi",
                                  cursorclass = MySQLdb.cursors.DictCursor)
        self.cursor = self.db.cursor()
        # 这个数据库连接用来查询train_user_after 31th的数据
        self.db2 = MySQLdb.connect("localhost", "root", "199194", "tianchi",
                                  cursorclass = MySQLdb.cursors.DictCursor)
        self.cursor2 = self.db2.cursor()

    def DivideByTime(self, breaktime):
        sql1 = divide_before_sql % breaktime
        sql2 = divide_after_sql % breaktime
        try:
            self.cursor.execute(sql1)
            self.db.commit()
            self.cursor.execute(sql2)
            self.db.commit()
        except MySQLdb.Error, e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])
            self.db.rollback()
        self.db.close

    def MergeData(self):
        try:
            # self.cursor.execute(merge_look_sql)
            # self.db.commit()
            # self.cursor.execute(merge_store_sql)
            # self.db.commit()
            # self.cursor.execute(merge_cart_sql)
            # self.db.commit()
            # self.cursor.execute(merge_buy_sql)
            # self.db.commit()
            self.cursor.execute(merge_all_behavoir)
            self.db.commit()
        except MySQLdb.Error,e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])
            self.db.rollback()
        self.db.close

    # 当用原始数据统计的四种行为(在表user_features中)符合某个标准,则输出到文本并插入到表pure_data中
    def SimplifyTrainUser(self):
        # 一次处理150万数据,多了内存占用过高
        # (limit 0, 1500000 -> limit 1500000, 1500000 -> limit 3000000, 2000000)
        self.cursor.execute('SELECT user_id, item_id, look, store, cart, buy FROM user_features limit') # 注意加limit再跑程序
        fop = open('feature.txt', 'a')
        while True:
            str_all=self.cursor.fetchone()
            if str_all:
                look_times = len(str_all['look'].split(',')) if str_all['look'] else 0
                store_times = len(str_all['store'].split(',')) if str_all['store'] else 0
                cart_times = len(str_all['cart'].split(',')) if str_all['cart'] else 0
                buy_times = len(str_all['buy'].split(',')) if str_all['buy'] else 0
                lru = max(str_all['look'].split(',') + str_all['store'].split(',') + str_all['cart'].split(',') + str_all['buy'].split(','))
                # 注意look等项在DB中应该为NOT NULL,不然lru会出错。
                user_id = str_all['user_id']
                item_id = str_all['item_id']

                buy_flag = False
                self.cursor2.execute('SELECT behavior_type FROM train_user_after WHERE user_id=%s AND item_id=%s', (user_id, item_id))
                behaviors = self.cursor2.fetchall()
                if behaviors:
                    for be in behaviors:
                        if be['behavior_type'] == 4:
                            buy_flag = True   # 说明在31th有购买行为,保留作为label

                # 如果浏览次数小于 && 收藏次数小于1 && 最近15天没有记录
                if not (look_times + store_times + cart_times <= 7 and buy_times==0 and int(lru) <= 21 and buy_flag == False ):
                    fop.write('%d,%d,\"%s\",\"%s\",\"%s\",\"%s\"' % (user_id,item_id,str_all['look'],str_all['store'],str_all['cart'],str_all['buy']) )
                    fop.write('\n')

            else:
                fop.close()
                return 0

    # 特征:一个内四种行为次数 + 总和 (基于简单特征的策略)
    def ExtractMonthlyBehavior(self, user_id, item_id):
        self.cursor.execute('SELECT look, store, cart, buy FROM pure_data WHERE user_id=%s AND item_id=%s', (user_id, item_id))
        str_all = self.cursor.fetchall()
        if not str_all:
            raise TypeError('Database NULL!')
        look_times = len(str_all[0]['look'].split(',')) if str_all[0]['look'] else 0
        store_times = len(str_all[0]['store'].split(',')) if str_all[0]['store'] else 0
        cart_times = len(str_all[0]['cart'].split(',')) if str_all[0]['cart'] else 0
        buy_times = len(str_all[0]['buy'].split(',')) if str_all[0]['buy'] else 0
        all_times = look_times + store_times + cart_times + buy_times

        feature = (look_times, store_times, cart_times, buy_times, all_times )
        return feature

    # 特征:最后四天四种行为次数 + 总和 (基于简单特征的策略)
    def ExtractLastdaysBehavior(self, user_id, item_id):
        self.cursor.execute('SELECT look, store, cart, buy FROM pure_data WHERE user_id=%s AND item_id=%s', (user_id, item_id))
        str_all = self.cursor.fetchall()
        if not str_all:
            raise TypeError('Database NULL!')

        look_times = store_times = cart_times = buy_times = 0
        look_date = str_all[0]['look'].split(',') if str_all[0]['look'] else 0
        store_date = str_all[0]['store'].split(',') if str_all[0]['store'] else 0
        cart_date = str_all[0]['cart'].split(',') if str_all[0]['cart'] else 0
        buy_date = str_all[0]['buy'].split(',') if str_all[0]['buy'] else 0

        if look_date:
            for l in look_date:
                if l and int(l) >= 26:
                    look_times += 1
        if store_date:
            for l in store_date:
                if l and int(l) >= 26:
                    store_times += 1
        if cart_date:
            for l in cart_date:
                if l and int(l) >= 26:
                    cart_times += 1
        if buy_date:
            for l in buy_date:
                if l and int(l) >= 26:
                    buy_times += 1

        all_times = look_times + store_times + cart_times + buy_times
        feature = (look_times, store_times, cart_times, buy_times, all_times )
        return feature

    # 特征:最后一次行为离分割点的间隔
    def ExtractLRUsed(self, user_id, item_id):
        self.cursor.execute(' SELECT max(daydiff) FROM train_user_before WHERE user_id=%s AND item_id=%s', (user_id, item_id))
        LRU = self.cursor.fetchall()
        LRUnumber = LRU[0]['max(daydiff)'] if LRU else 0
        feature = int(LRUnumber)
        return feature

    # 特征: 最近一天收藏,加购的次数(基于有购物倾向的策略)
    def ExtractLast1Trend(self, user_id, item_id):
        self.cursor.execute('SELECT store, cart FROM pure_data WHERE user_id=%s AND item_id=%s', (user_id, item_id))
        str_all = self.cursor.fetchall()
        if not str_all:
            raise TypeError('Database NULL!')

        store_times = cart_times = 0
        store_date = str_all[0]['store'].split(',') if str_all[0]['store'] else 0
        cart_date = str_all[0]['cart'].split(',') if str_all[0]['cart'] else 0

        if store_date:
            for l in store_date:
                if l and int(l) == 29:
                    store_times += 1
        if cart_date:
            for l in cart_date:
                if l and int(l) == 29:
                    cart_times += 1
        feature = (store_times, cart_times)
        return feature

    # 特征: 最近三天查看,收藏,加购的次数(基于有购物倾向的策略)
    def ExtractLast3Trend(self, user_id, item_id):
        self.cursor.execute('SELECT look, store, cart FROM pure_data WHERE user_id=%s AND item_id=%s', (user_id, item_id))
        str_all = self.cursor.fetchall()
        if not str_all:
            raise TypeError('Database NULL!')

        look_times=store_times = cart_times = 0
        look_date = str_all[0]['look'].split(',') if str_all[0]['look'] else 0
        store_date = str_all[0]['store'].split(',') if str_all[0]['store'] else 0
        cart_date = str_all[0]['cart'].split(',') if str_all[0]['cart'] else 0

        if look_date:
            for l in look_date:
                if l and int(l) >= 27:
                    look_times += 1
        if store_date:
            for l in store_date:
                if l and int(l) == 27:
                    store_times += 1
        if cart_date:
            for l in cart_date:
                if l and int(l) == 27:
                    cart_times += 1

        feature = (look_times, store_times, cart_times)
        return feature

    # 特征: 最近一天,三天,七天购买的次数(基于买过不会再买的策略)
    def ExtractLast7Buy(self, user_id, item_id):
        self.cursor.execute('SELECT buy FROM pure_data WHERE user_id=%s AND item_id=%s', (user_id, item_id))
        str_all = self.cursor.fetchall()
        if not str_all:
            raise TypeError('Database NULL!')

        buy_times_1day = buy_times_3day = buy_times_7day = 0
        buy_date = str_all[0]['buy'].split(',') if str_all[0]['buy'] else 0

        if buy_date:
            for l in buy_date:
                if l and int(l) == 29:
                    buy_times_1day += 1
                if l and int(l) >= 27:
                    buy_times_3day += 1
                if l and int(l) >= 23:
                    buy_times_7day += 1

        feature = (buy_times_1day, buy_times_3day, buy_times_7day)
        return feature


    def MergeFeatures(self):
        fop = open('data_features.txt', 'a')
        self.cursor.execute('SELECT user_id, item_id FROM pure_data')
        record = self.cursor.fetchall()
        for rec in record:
            userid = rec['user_id']
            itemid = rec['item_id']

            # 融合所有特征 19D
            feature0 = self.ExtractMonthlyBehavior(userid, itemid)    # 5D
            feature1 = self.ExtractLastdaysBehavior(userid, itemid)   # 5D
            feature2 = self.ExtractLRUsed(userid, itemid)             # 1D
            feature3 = self.ExtractLast1Trend(userid, itemid)         # 2D
            feature4 = self.ExtractLast3Trend(userid, itemid)         # 3D
            feature5 = self.ExtractLast7Buy(userid, itemid)           # 3D
            features = feature0, feature1, feature2, feature3, feature4, feature5
            line = features[0] + features[1] + (features[2],) + features[3] + features[4] + features[5]
            fop.write('%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d' % (line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7],line[8],line[9],
            line[10],line[11],line[12],line[13],line[14],line[15],line[16],line[17],line[18]))
            fop.write('\n')
        fop.close()

    def GenLabels(self):
        fop = open('data_labels.txt', 'a')
        self.cursor.execute('SELECT user_id, item_id FROM pure_data')
        record = self.cursor.fetchall()
        for rec in record:
            userid = rec['user_id']
            itemid = rec['item_id']
            purchase_flag = False
            self.cursor.execute('SELECT behavior_type FROM train_user_after WHERE user_id=%s AND item_id=%s', (userid, itemid))
            lines = self.cursor.fetchall()
            if lines:
                for line in lines:
                    if line['behavior_type'] == 4:
                        purchase_flag = True
            if purchase_flag:
                fop.write('1')
                fop.write('\n')
            else:
                fop.write('0')
                fop.write('\n')
        fop.close()

if __name__ == '__main__':
    model = TrainModel()
    # model.DivideByTime("2014-12-18 00:00:00")
    # model.MergeData()
    # model.SimplifyTrainUser()
    # model.MergeFeatures() # 比较耗时间
    model.GenLabels()