python source code of main

from __future__ import absolute_import
import os.path
import argparse
import logging
import json
from six import iteritems
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from keras.models import load_model

from tensorflow.python.client import device_lib

from utils import load_data, Embeds, Logger, Params, embed_aggregate, similarity
from features import catboost_features
from preprocessing import clean_text, convert_text2seq, split_data, parse_seq
from models import cnn, dense, rnn, TFIDF, CatBoost, save_predictions
from train import train
from metrics import get_metrics, print_metrics


def get_kwargs(kwargs):
    parser = argparse.ArgumentParser(description='-f TRAIN_FILE -t TEST_FILE -o OUTPUT_FILE -e EMBEDS_FILE [-l LOGGER_FILE] [--swear-words SWEAR_FILE] [--wrong-words WRONG_WORDS_FILE] [--format-embeds FALSE]')
    parser.add_argument('-f', '--train', dest='train', action='store', help='/path/to/trian_file', type=str)
    parser.add_argument('-t', '--test', dest='test', action='store', help='/path/to/test_file', type=str)
    parser.add_argument('-o', '--output', dest='output', action='store', help='/path/to/output_file', type=str)
    parser.add_argument('-we', '--word_embeds', dest='word_embeds', action='store', help='/path/to/embeds_file', type=str)
    parser.add_argument('-ce', '--char_embeds', dest='char_embeds', action='store', help='/path/to/embeds_file', type=str)
    parser.add_argument('-c','--config', dest='config', action='store', help='/path/to/config.json', type=str)
    parser.add_argument('-l', '--logger', dest='logger', action='store', help='/path/to/log_file', type=str, default=None)
    parser.add_argument('--mode', dest='mode', action='store', help='preprocess / train / validate / all', type=str, default='all')
    parser.add_argument('--max-words', dest='max_words', action='store', type=int, default=300000)
    parser.add_argument('--use-only-exists-words', dest='use_only_exists_words', action='store_true')
    parser.add_argument('--swear-words', dest='swear_words', action='store', help='/path/to/swear_words_file', type=str, default=None)
    parser.add_argument('--wrong-words', dest='wrong_words', action='store', help='/path/to/wrong_words_file', type=str, default=None)
    parser.add_argument('--format-embeds', dest='format_embeds', action='store', help='file | json | pickle | binary', type=str, default='raw')
    parser.add_argument('--output-dir', dest='output_dir', action='store', help='/path/to/dir', type=str, default='.')
    parser.add_argument('--norm-prob', dest='norm_prob', action='store_true')
    parser.add_argument('--norm-prob-koef', dest='norm_prob_koef', action='store', type=float, default=1)
    parser.add_argument('--gpus', dest='gpus', action='store', help='count GPUs', type=int, default=0)
    for key, value in iteritems(parser.parse_args().__dict__):
        kwargs[key] = value


def main(*kargs, **kwargs):
    get_kwargs(kwargs)
    train_fname = kwargs['train']
    test_fname = kwargs['test']
    result_fname = kwargs['output']
    word_embeds_fname = kwargs['word_embeds']
    char_embeds_fname = kwargs['char_embeds']
    logger_fname = kwargs['logger']
    mode = kwargs['mode']
    max_words = kwargs['max_words']
    use_only_exists_words = kwargs['use_only_exists_words']
    swear_words_fname = kwargs['swear_words']
    wrong_words_fname = kwargs['wrong_words']
    embeds_format = kwargs['format_embeds']
    config = kwargs['config']
    output_dir = kwargs['output_dir']
    norm_prob = kwargs['norm_prob']
    norm_prob_koef = kwargs['norm_prob_koef']
    gpus = kwargs['gpus']

    seq_col_name_words = 'comment_seq_lw_use_exist{}_{}k'.format(int(use_only_exists_words), int(max_words/1000))
    seq_col_name_ll3 = 'comment_seq_ll3_use_exist{}_{}k'.format(int(use_only_exists_words), int(max_words/1000))

    model_file = {
        'dense': os.path.join(output_dir, 'dense.h5'),
        'cnn': os.path.join(output_dir, 'cnn.h5'),
        'lstm': os.path.join(output_dir, 'lstm.h5'),
        'lr': os.path.join(output_dir, '{}_logreg.bin'),
        'catboost': os.path.join(output_dir, '{}_catboost.bin')
    }

    # ====Create logger====
    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)


    target_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    swear_words = load_data(swear_words_fname, func=lambda x: set(x.T[0]), header=None)
    wrong_words_dict = load_data(wrong_words_fname, func=lambda x: {val[0] : val[1] for val in x})

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    embeds_word = Embeds().load(word_embeds_fname, embeds_format)
    embeds_ll3 = Embeds().load(char_embeds_fname, embeds_format)

    # ====Clean texts====
    if mode in ('preprocess', 'all'):
        logger.info('Cleaning text...')
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'], wrong_words_dict, autocorrect=True)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'], wrong_words_dict, autocorrect=True)
        train_df.to_csv(os.path.join(output_dir, 'train_clear.csv'), index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear.csv'), index=False)

    # ====Calculate maximum seq length====
    logger.info('Calc text length...')
    train_df.fillna('__NA__', inplace=True)
    test_df.fillna('__NA__', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(lambda words: len(words.split()))
    max_seq_len = np.round(train_df['text_len'].mean() + 3*train_df['text_len'].std()).astype(int)
    max_char_seq_len = 2000 # empirical
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')

    if mode in ('preprocess', 'all'):
        train_df[seq_col_name_words], test_df[seq_col_name_words], word_index, train_df[seq_col_name_ll3], test_df[seq_col_name_ll3], ll3_index = convert_text2seq(
                                                                                 train_df['comment_text_clear'].tolist(),
                                                                                 test_df['comment_text_clear'].tolist(),
                                                                                 max_words,
                                                                                 max_seq_len,
                                                                                 max_char_seq_len,
                                                                                 embeds_word,
                                                                                 lower=True,
                                                                                 oov_token='__NA__',
                                                                                 uniq=False,
                                                                                 use_only_exists_words=use_only_exists_words)
        logger.debug('Dictionary size use_exist{} = {}'.format(int(use_only_exists_words), len(word_index)))
        logger.debug('Char dict size use_exist{} = {}'.format(int(use_only_exists_words), len(ll3_index)))

        logger.info('Preparing embedding matrix...')
        words_not_found = embeds_word.set_matrix(max_words, word_index)
        embeds_ll3.matrix = np.random.normal(size=(len(ll3_index), embeds_word.shape[1]))
        embeds_ll3.word_index = ll3_index
        embeds_ll3.word_index_reverse = {val: key for key, val in ll3_index.items()}
        embeds_ll3.shape = np.shape(embeds_ll3.matrix)
        embeds_word.save(os.path.join(output_dir, 'wiki.embeds_lw.{}k'.format(int(max_words/1000))))
        embeds_ll3.save(os.path.join(output_dir, 'wiki.embeds_ll3.{}k'.format(int(max_words/1000))))

        # ====Get text vector====
        pooling = {
                'max': {'func': np.max},
                'avg': {'func': np.sum, 'normalize': True},
                'sum': {'func': np.sum, 'normalize': False}
        }
        for p in ['max', 'avg', 'sum']:
            train_df['comment_vec_{}'.format(p)] = train_df[seq_col_name_words].apply(lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
            test_df['comment_vec_{}'.format(p)] = test_df[seq_col_name_words].apply(lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
        train_df.to_csv(os.path.join(output_dir, 'train_clear1.csv'), index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear1.csv'), index=False)
    else:
        for col in train_df.columns:
            if col.startswith('comment_seq'):
                train_df[col] = train_df[col].apply(lambda x: parse_seq(x, int))
                test_df[col] = test_df[col].apply(lambda x: parse_seq(x, int))
            elif col.startswith('comment_vec'):
                train_df[col] = train_df[col].apply(lambda x: parse_seq(x, float))
                test_df[col] = test_df[col].apply(lambda x: parse_seq(x, float))


    logger.debug('Embedding matrix shape = {}'.format(embeds_word.shape))
    logger.debug('Number of null word embeddings = {}'.format(np.sum(np.sum(embeds_word.matrix, axis=1) == 0)))

    # ====END OF `PREPROCESS`====
    if mode == 'preprocess':
        return True

    # ====Train/test split data====
    x = np.array(train_df[seq_col_name_words].values.tolist())
    y = np.array(train_df[target_labels].values.tolist())
    x_train_nn, x_val_nn, y_train, y_val, train_idxs, val_idxs = split_data(x, y, test_size=0.2, shuffle=True, random_state=42)
    x_test_nn = np.array(test_df[seq_col_name_words].values.tolist())

    x_char = np.array(train_df[seq_col_name_ll3].values.tolist())
    x_char_train_nn = x_char[train_idxs]
    x_char_val_nn = x_char[val_idxs]
    x_char_test_nn = np.array(test_df[seq_col_name_ll3].values.tolist())

    x_train_tfidf = train_df['comment_text_clear'].values[train_idxs]
    x_val_tfidf = train_df['comment_text_clear'].values[val_idxs]
    x_test_tfidf = test_df['comment_text_clear'].values

    catboost_cols = catboost_features(train_df, test_df)
    x_train_cb = train_df[catboost_cols].values[train_idxs].T
    x_val_cb = train_df[catboost_cols].values[val_idxs].T
    x_test_cb = test_df[catboost_cols].values.T

    # ====Train models====
    nn_models = {
        'cnn': cnn,
        'dense': dense,
        'rnn': rnn
    }

    params = Params(config)

    metrics = {}
    predictions = {}
    for param in params['models']:
        for model_label, model_params in param.items():
            if model_params.get('common', {}).get('warm_start', False) and os.path.exists(model_params.get('common', {}).get('model_file', '')):
                logger.info('{} warm starting...'.format(model_label))
                model = load_model(model_params.get('common', {}).get('model_file', None))
            elif model_label in nn_models:
                model = nn_models[model_label](
                            embeds_word.matrix,
                            embeds_ll3.matrix,
                            num_classes,
                            max_seq_len,
                            max_char_seq_len,
                            gpus=gpus,
                            **model_params['init'])
                model_alias = model_params.get('common', {}).get('alias', None)
                if model_alias is None or not model_alias:
                    model_alias = '{}_{}'.format(model_label, i)
                logger.info("training {} ...".format(model_label))
                if model_label == 'dense':
                    x_tr = [x_train_nn, x_char_train_nn]
                    x_val = [x_val_nn, x_char_val_nn]
                    x_test = [x_test_nn, x_char_test_nn]
                else:
                    x_tr = x_train_nn
                    x_val = x_val_nn
                    x_test = x_test_nn
                hist = train(x_tr,
                             y_train,
                             model,
                             logger=logger,
                             **model_params['train'])
                predictions[model_alias] = model.predict(x_val)
                save_predictions(test_df, model.predict(x_test), target_labels, model_alias)
            elif model_label == 'tfidf':
                model = TFIDF(target_labels, **model_params['init'])
                model.fit(x_train_tfidf, y_train, **model_params['train'])
                predictions[model_alias] = model.predict(x_val_tfidf)
                save_predictions(test_df, model.predict(x_test_tfidf), target_labels, model_alias)
            elif model_label == 'catboost':
                model = CatBoost(target_labels, **model_params['init'])
                model.fit(x_train_cb, y_train, eval_set=(x_val_cb, y_val), use_best_model=True)
                predictions[model_alias] = model.predict_proba(x_val_cb)
                save_predictions(test_df, model.predict_proba(x_test_cb), target_labels, model_alias)
            metrics[model_alias] = get_metrics(y_val, predictions[model_alias], target_labels)
            logger.debug('{} params:\n{}'.format(model_alias, model_params))
            logger.debug('{} metrics:\n{}'.format(model_alias, print_metrics(metrics[model_alias])))
            model.save(os.path.join(output_dir, model_params['common']['model_file']))

    logger.info('Saving metrics...')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as f:
        f.write(json.dumps(metrics))


    # ====END OF `VALIDATE`====
    if mode == 'validate':
        return True

    # Meta catboost
    logger.info('training catboost as metamodel...')

    x_meta = [predictions[model_alias] for model_alias in sorted(predictions.keys())]
    x_meta = np.array(x_train_meta).T

    x_train_meta, x_val_meta, y_train_meta, y_val_meta = train_test_split(x_meta, y_val, test_size=0.20, random_state=42)
    meta_model = CatBoost(target_labels,
                         loss_function='Logloss',
                         iterations=1000,
                         depth=6,
                         learning_rate=0.03,
                         rsm=1
    )
    meta_model.fit(x_train_meta, y_train_meta, eval_set=(x_val_meta, y_val_meta), use_best_model=True)
    y_hat_meta = meta_model.predict_proba(x_val_meta)
    metrics_meta = get_metrics(y_val_meta, y_hat_meta, target_labels)
    #model.save(os.path.join(output_dir, 'meta.catboost')
    logger.debug('{} metrics:\n{}'.format('META', print_metrics(metrics_meta)))

    # ====Predict====
    logger.info('Applying models...')
    test_cols = []
    for model_alias in sorted(predictions.keys()):
        for label in target_labels:
            test_cols.append('{}_{}'.format(model_alias, label))
    x_test = test_df[test_cols].values

    preds = meta_model.predict_proba(x_test)
    for i, label in enumerate(target_labels):
        test_df[label] = preds[:, i]

    # ====Normalize probabilities====
    if norm_prob:
        for label in target_labels:
            test_df[label] = norm_prob_koef * test_df[label]

    # ====Save results====
    logger.info('Saving results...')
    test_df[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].to_csv(result_fname, index=False, header=True)
    test_df.to_csv('{}_tmp'.format(result_fname), index=False, header=True)


if __name__=='__main__':
    main()