python source code of submission

import os
import time
import re
import string
import unicodedata
import pandas as pd
import numpy as np
from scipy import sparse
from contextlib import contextmanager
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression


"""
utils
"""


@contextmanager
def timer(name):
    """
    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')


"""
text cleaning
"""


def normalize_unicode(text):
    """
    unicode string normalization
    """
    return unicodedata.normalize('NFKD', text)


def remove_newline(text):
    """
    remove \n and  \t
    """
    text = re.sub('\n', ' ', text)
    text = re.sub('\t', ' ', text)
    text = re.sub('\b', ' ', text)
    text = re.sub('\r', ' ', text)
    return text


def spacing_punctuation(text):
    """
    add space before and after punctuation and symbols
    """
    re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤$&#‘’])')
    return re_tok.sub(r' \1 ', text)


def remove_punctuation(text):
    """
    remove punctuation from text
    """
    re_tok = re.compile(f'([{string.punctuation}])')
    return re_tok.sub(' ', text)


def spacing_number(text):
    """
    add space before and after numbers
    """
    re_tok = re.compile('([0-9]{1,})')
    return re_tok.sub(r' \1 ', text)


def decontracted(text):
    """
    de-contract the contraction
    """
    # specific
    text = re.sub(r"(W|w)on\'t", "will not", text)
    text = re.sub(r"(C|c)an\'t", "can not", text)

    # general
    text = re.sub(r"(I|i)\'m", "i am", text)
    text = re.sub(r"(A|a)in\'t", "is not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    return text


def clean_number(text):
    """
    replace number with hash
    """
    text = re.sub('[0-9]{5,}', '#####', text)
    text = re.sub('[0-9]{4}', '####', text)
    text = re.sub('[0-9]{3}', '###', text)
    text = re.sub('[0-9]{2}', '##', text)
    return text


def remove_number(text):
    """
    numbers are not toxic
    """
    return re.sub('\d+', ' ', text)


def remove_space(text):
    """
    remove extra spaces and ending space if any
    """
    text = re.sub('\s+', ' ', text)
    text = re.sub('\s+$', '', text)
    return text


def preprocess(text, remove_punct=False, remove_num=True):
    """
    preprocess text into clean text for tokenization
    """
    # 1. normalize
    text = normalize_unicode(text)
    # 2. to lower
    text = text.lower()
    # 3. space
    text = spacing_punctuation(text)
    text = spacing_number(text)
    # (optional)
    if remove_punct:
        text = remove_punctuation(text)
    # 4. de-contract
    text = decontracted(text)
    # 5. handle number
    if remove_num:
        text = remove_number(text)
    else:
        text = clean_number(text)
    # 6. remove space
    text = remove_space(text)
    return text


def word_tokenize(text, remove_punct=False, remove_num=True):
    """
    tokenize text into list of word tokens
    """
    # 1. preprocess
    text = preprocess(text, remove_punct, remove_num)
    # 2. tokenize
    tokens = text.split()
    return tokens


def char_tokenize(text, remove_punct=False, remove_num=True):
    """
    This is used to split strings in small lots
    I saw this in an article (I can't find the link anymore)
    so <talk> and <talking> would have <Tal> <alk> in common
    """
    tokens = word_tokenize(text, remove_punct, remove_num)
    return [token[i: i + 3] for token in tokens for i in range(len(token) - 2)]


"""
transformer
"""


def word_transformer(df_text, stop_words=None):
    """
    transform and extract word features from raw text dataframe

    Parameters
    ----------
    df_text: dataframe, single column with text

    stop_words: string {‘english’}, list, or None (default)

    Return
    ------
    df_features
    """
    def _tokenizer(text):
        return word_tokenize(text, remove_punct=False, remove_num=True)

    vectorizer = TfidfVectorizer(
        strip_accents='unicode',
        ngram_range=(1, 3),
        tokenizer=_tokenizer,
        analyzer='word',
        min_df=3, max_df=0.9, max_features=None,
        use_idf=True, smooth_idf=True, sublinear_tf=True,
        stop_words=stop_words)
    return vectorizer.fit_transform(df_text)


def char_transformer(df_text, stop_words=None):
    """
    transform and extract word features from raw text dataframe

    Parameters
    ----------
    df_text: dataframe, single column with text

    stop_words: string {‘english’}, list, or None (default)

    Return
    ------
    df_features
    """
    def _tokenizer(text):
        return char_tokenize(text, remove_punct=False, remove_num=True)

    vectorizer = TfidfVectorizer(
        strip_accents='unicode',
        ngram_range=(1, 1),
        tokenizer=_tokenizer,
        analyzer='word',
        min_df=3, max_df=0.9, max_features=None,
        use_idf=True, smooth_idf=True, sublinear_tf=True,
        stop_words=stop_words)
    return vectorizer.fit_transform(df_text)


def transform(df_text):
    """
    transform and extract features from raw text dataframe

    Parameters
    ----------
    df_text: dataframe, single column with text

    Return
    ------
    features: dataframe, or numpy, scipy
    """
    return sparse.hstack([word_transformer(df_text), char_transformer(df_text)]).tocsr()    # noqa


"""
model
"""


class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """
    Naive Bayes - Support Vector Machine
    """
    def __init__(self, C=1.0, dual=True, n_jobs=-1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, X):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(X.multiply(self._r))

    def predict_proba(self, X):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(X.multiply(self._r))[:, 1]

    def fit(self, X, y):
        # Check that X and y have correct shape
        y = y.values
        X, y = check_X_y(X, y, accept_sparse=True)

        def pr(X, y_i, y):
            p = X[y == y_i].sum(0)
            return (p+1) / ((y == y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(X, 1, y) / pr(X, 0, y)))
        X_nb = X.multiply(self._r)
        self._clf = LogisticRegression(
            C=self.C,
            dual=self.dual,
            n_jobs=self.n_jobs
        ).fit(X_nb, y)
        return self


def get_model():
    return NbSvmClassifier()


def load_and_preprocess(datapath):
    """
    load and preprocess
    Parameters
    ----------
    datapath: str, data directory that contains train.csv and test.csv

    Returns
    -------
    df_train, df_test: dataframe with raw text

    X_train, X_test: matrix with proper features
    """
    print("loading data ......")
    df_train = pd.read_csv(os.path.join(datapath, "train.csv"))
    df_test = pd.read_csv(os.path.join(datapath, "test.csv"))
    train_test_cut = df_train.shape[0]
    print("train data with shape : ", df_train.shape)
    print("test data with shape : ", df_test.shape)
    # concat text data into single dataframe
    df_all = pd.concat(
        [df_train[['question_text']], df_test[['question_text']]],
        axis=0).reset_index(drop=True)
    # transform
    X_features = transform(df_all['question_text'])
    X_train = X_features[:train_test_cut]
    X_test = X_features[train_test_cut:]
    return df_train, df_test, X_train, X_test


def create_submission(X_train, y_train, X_test, df_test, thres):
    """
    train model with entire training data, predict test data,
    and create submission file

    Parameters
    ----------
    X_train, y_train, X_test: features and targets

    df_test: dataframe, test data

    thres: float, a decision threshold for classification

    module: a python module

    Return
    ------
    df_summission
    """
    # get model
    model = get_model()
    # train model
    print('fitting model')
    model = model.fit(X_train, y_train)
    # predict
    print('predicting probas')
    y_pred = (model.predict_proba(X_test) > thres).astype('int')
    # create submission file
    return pd.DataFrame({'qid': df_test.qid, 'prediction': y_pred})


if __name__ == '__main__':
    # config
    # SHUFFLE = True
    DATA_PATH = '../input/'
    FILE_PATH = 'submission.csv'
    THRES = 0.23

    t0 = time.time()
    # 1. load and preprocess data
    with timer("Load and Preprocess"):
        df_train, df_test, X_train, X_test = load_and_preprocess(DATA_PATH)
    # 2. create submission file
    with timer('Trainning and Creating Submission'):
        df_submission = create_submission(
            X_train, df_train.target,
            X_test, df_test,
            THRES)
        df_submission.to_csv(FILE_PATH, index=False)
        print('Save submission file to {}'.format(FILE_PATH))
    # record time spent
    print('Entire program is done and it took {:.2f}s'.format(time.time() - t0)) # noqa