# -*- coding: utf-8 -*-

"""
Create the datasets used to train the sentiment model.
"""

from random import sample

from hanziconv import HanziConv
from jieba import cut

from utils.database import session, Review, Rate
from utils.path import TRAIN_DIR

TRAIN_POS_PATH = TRAIN_DIR + '/sentiment_train_pos.txt'
TRAIN_NEG_PATH = TRAIN_DIR + '/sentiment_train_neg.txt'
TEST_POS_PATH = TRAIN_DIR + '/sentiment_test_pos.txt'
TEST_NEG_PATH = TRAIN_DIR + '/sentiment_test_neg.txt'


def chinese_tokenizer(documents):
    """
    Convert Chinese documents into token (word) sequences.
    Traditional characters are converted to Simplified; English is lowercased.
    """

    for document in documents:
        text = HanziConv.toSimplified(document)  # traditional -> simplified
        text = text.lower()                      # lowercase any English
        yield list(cut(text))                    # jieba word segmentation
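

# Quick sanity check (hypothetical input; exact token boundaries depend on
# jieba's dictionary and version, so the segmentation shown is approximate):
#
#     >>> list(chinese_tokenizer(['這家餐廳很不錯 GOOD']))
#     [['这家', '餐厅', '很', '不错', ' ', 'good']]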


def create_train_test(train_pos_path=TRAIN_POS_PATH, train_neg_path=TRAIN_NEG_PATH,
                      test_pos_path=TEST_POS_PATH, test_neg_path=TEST_NEG_PATH):
    """
    Build train and test samples from all non-default reviews in the database,
    keeping the number of positive and negative samples equal.
    """

    # Bucket every non-empty, non-default review by rating polarity.
    pos, neg = [], []
    for content, rate in Review.filter_default(
        session.query(Review.content, Review.rate)
        .filter(Review.content != '')
    ):
        if Rate(rate).is_good:
            pos.append(content)
        else:
            neg.append(content)

    # Downsample the larger class so both classes end up the same size.
    # sample() draws without replacement; random.choices would draw with
    # replacement and could duplicate one review across train and test.
    size = min(len(pos), len(neg))
    size_train = int(size * 0.8)
    pos = sample(pos, size)
    neg = sample(neg, size)

    # sample() returns items in random order, so a plain slice gives an
    # 80/20 train/test split for each class.
    for data, path in ((pos[:size_train], train_pos_path),
                       (neg[:size_train], train_neg_path),
                       (pos[size_train:], test_pos_path),
                       (neg[size_train:], test_neg_path)):
        with open(path, 'w', encoding='utf-8') as file:
            # One review per line (reviews are assumed to contain no newlines).
            file.writelines(line + '\n' for line in data)
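

# The files written above are line-oriented: one review per line, UTF-8.
# A downstream consumer (hypothetical sketch, not part of this module) could
# reload and tokenize one of them with:
#
#     with open(TRAIN_POS_PATH, encoding='utf-8') as file:
#         train_pos = list(chinese_tokenizer(line.rstrip('\n') for line in file))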


if __name__ == '__main__':
    create_train_test()