"""NB-SVM baseline: text cleaning, TF-IDF features and submission writer.

The script reads ``train.csv`` and ``test.csv`` from a data directory,
builds word- and character-trigram TF-IDF features from the
``question_text`` column, fits :class:`NbSvmClassifier` on the ``target``
column and writes thresholded predictions keyed by ``qid``.
"""
import os
import re
import string
import time
import unicodedata
from contextlib import contextmanager

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import check_X_y, check_is_fitted


# ---------------------------------------------------------------------------
# utils
# ---------------------------------------------------------------------------
@contextmanager
def timer(name):
    """Print the wall-clock time spent inside the ``with`` body.

    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s

    Parameters
    ----------
    name: str, label printed in front of the elapsed time

    Notes
    -----
    The message is printed only when the body finishes without raising.
    """
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')


# ---------------------------------------------------------------------------
# text cleaning
# ---------------------------------------------------------------------------
# Symbols that get spaced out in addition to ASCII punctuation.
_EXTRA_SYMBOLS = '“”¨«»®´·º½¾¿¡§£₤$&#‘’'

# ``string.punctuation`` contains ``\``, ``]``, ``^`` and ``-``.  They must be
# escaped before being placed in a character class, otherwise ``\]`` is read
# as an escaped bracket and the backslash itself is never matched.
_PUNCT_RE = re.compile('([' + re.escape(string.punctuation) + '])')
_PUNCT_AND_SYMBOL_RE = re.compile(
    '([' + re.escape(string.punctuation + _EXTRA_SYMBOLS) + '])')

# newline, tab, backspace (0x08) and carriage return
_CONTROL_CHAR_RE = re.compile('[\n\t\x08\r]')
_DIGIT_RUN_RE = re.compile(r'([0-9]{1,})')
_ANY_NUMBER_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_TRAILING_WHITESPACE_RE = re.compile(r'\s+$')

# Applied in order: the specific forms must run before the generic suffixes.
# Suffix rules require a word character before the apostrophe and a word
# boundary after the suffix so that an opening quote ('the, 'send) or a name
# such as o'donnell is not mistaken for a contraction.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # specific
        (r"\b[Ww]on't\b", "will not"),
        (r"\b[Cc]an't\b", "can not"),
        # general
        (r"\b[Ii]'m\b", "i am"),
        (r"\b[Aa]in't\b", "is not"),
        (r"(?<=\w)n't\b", " not"),
        (r"(?<=\w)'re\b", " are"),
        (r"(?<=\w)'s\b", " is"),
        (r"(?<=\w)'d\b", " would"),
        (r"(?<=\w)'ll\b", " will"),
        (r"(?<=\w)'t\b", " not"),
        (r"(?<=\w)'ve\b", " have"),
    )
)

# Longest runs first, so a long number is masked once instead of piecewise.
_NUMBER_MASK_RULES = (
    (re.compile(r'[0-9]{5,}'), '#####'),
    (re.compile(r'[0-9]{4}'), '####'),
    (re.compile(r'[0-9]{3}'), '###'),
    (re.compile(r'[0-9]{2}'), '##'),
)


def normalize_unicode(text: str) -> str:
    """
    unicode string normalization (NFKD form)
    """
    return unicodedata.normalize('NFKD', text)


def remove_newline(text: str) -> str:
    """
    replace every newline, tab, backspace and carriage return with a space
    """
    return _CONTROL_CHAR_RE.sub(' ', text)


def spacing_punctuation(text: str) -> str:
    """
    add a space before and after each punctuation mark or symbol
    (ASCII punctuation, including the backslash, plus a few typographic
    symbols)
    """
    return _PUNCT_AND_SYMBOL_RE.sub(r' \1 ', text)


def remove_punctuation(text: str) -> str:
    """
    replace each ASCII punctuation character with a space
    """
    return _PUNCT_RE.sub(' ', text)


def spacing_number(text: str) -> str:
    """
    add a space before and after each run of ASCII digits
    """
    return _DIGIT_RUN_RE.sub(r' \1 ', text)


def decontracted(text: str) -> str:
    """
    expand English contractions written with an ASCII apostrophe

    The replacements are lower-case whatever the case of the input
    ("Won't" becomes "will not").  A trailing 's is always expanded to
    " is", so possessives are expanded as well.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text


def clean_number(text: str) -> str:
    """
    replace runs of two or more digits with as many hashes, capped at five
    (single digits are left untouched)
    """
    for pattern, mask in _NUMBER_MASK_RULES:
        text = pattern.sub(mask, text)
    return text


def remove_number(text: str) -> str:
    """
    numbers are not toxic: replace every run of digits with a space
    """
    return _ANY_NUMBER_RE.sub(' ', text)


def remove_space(text: str) -> str:
    """
    collapse whitespace runs into one space and drop trailing whitespace
    (a single leading space, if any, is kept)
    """
    text = _WHITESPACE_RE.sub(' ', text)
    return _TRAILING_WHITESPACE_RE.sub('', text)


def preprocess(text: str, remove_punct: bool = False,
               remove_num: bool = True) -> str:
    """
    preprocess text into clean text for tokenization

    Parameters
    ----------
    text: str, raw text

    remove_punct: bool, replace ASCII punctuation with spaces when True

    remove_num: bool, drop digit runs when True, otherwise mask them
        with hashes

    Return
    ------
    str, cleaned text
    """
    # 1. normalize
    text = normalize_unicode(text)
    # 2. to lower
    text = text.lower()
    # 3. de-contract; this has to happen before the punctuation is spaced
    #    out, otherwise every apostrophe is already surrounded by spaces and
    #    no contraction can match
    text = decontracted(text)
    # 4. space
    text = spacing_punctuation(text)
    text = spacing_number(text)
    # (optional)
    if remove_punct:
        text = remove_punctuation(text)
    # 5. handle number
    text = remove_number(text) if remove_num else clean_number(text)
    # 6. remove space
    return remove_space(text)


def word_tokenize(text: str, remove_punct: bool = False,
                  remove_num: bool = True) -> list:
    """
    tokenize text into a list of word tokens (preprocess, then split on
    whitespace)
    """
    return preprocess(text, remove_punct, remove_num).split()


def char_tokenize(text: str, remove_punct: bool = False,
                  remove_num: bool = True) -> list:
    """
    tokenize text into overlapping character trigrams of each word token,
    so that <talk> and <talking> have <tal> and <alk> in common

    Tokens shorter than three characters contribute nothing.
    """
    tokens = word_tokenize(text, remove_punct, remove_num)
    return [
        token[start: start + 3]
        for token in tokens
        for start in range(len(token) - 2)
    ]


# ---------------------------------------------------------------------------
# transformer
# ---------------------------------------------------------------------------
def _tfidf_features(df_text, tokenizer, ngram_range, stop_words):
    """Fit a TF-IDF vectorizer on ``df_text`` and return its sparse matrix."""
    vectorizer = TfidfVectorizer(
        strip_accents='unicode',
        ngram_range=ngram_range,
        tokenizer=tokenizer,
        analyzer='word',
        min_df=3,
        max_df=0.9,
        max_features=None,
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        stop_words=stop_words)
    return vectorizer.fit_transform(df_text)


def word_transformer(df_text, stop_words=None):
    """
    transform and extract word features (1- to 3-grams of word tokens)
    from raw text

    Parameters
    ----------
    df_text: dataframe, single column with text

    stop_words: string {'english'}, list, or None (default)

    Return
    ------
    df_features: sparse TF-IDF matrix
    """
    # word_tokenize's defaults are remove_punct=False, remove_num=True
    return _tfidf_features(df_text, word_tokenize, (1, 3), stop_words)


def char_transformer(df_text, stop_words=None):
    """
    transform and extract character-trigram features from raw text

    Parameters
    ----------
    df_text: dataframe, single column with text

    stop_words: string {'english'}, list, or None (default)

    Return
    ------
    df_features: sparse TF-IDF matrix
    """
    # char_tokenize's defaults are remove_punct=False, remove_num=True
    return _tfidf_features(df_text, char_tokenize, (1, 1), stop_words)


def transform(df_text):
    """
    transform and extract features from raw text

    Parameters
    ----------
    df_text: dataframe, single column with text

    Return
    ------
    features: scipy CSR matrix, word features followed by char features
    """
    blocks = [word_transformer(df_text), char_transformer(df_text)]
    return sparse.hstack(blocks).tocsr()


# ---------------------------------------------------------------------------
# model
# ---------------------------------------------------------------------------
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """
    Naive Bayes - Support Vector Machine

    Features are scaled by the naive-Bayes log-count ratio between the
    rows labelled 1 and the rows labelled 0, then a logistic regression
    is fitted on the scaled features.  Labels are expected to be 0 / 1.

    Parameters
    ----------
    C: float, inverse regularization strength of the logistic regression

    dual: bool, solve the dual formulation

    n_jobs: int, forwarded to the logistic regression
    """

    def __init__(self, C=1.0, dual=True, n_jobs=-1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, X):
        """Return the predicted labels for sparse feature matrix ``X``."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(X.multiply(self._r))

    def predict_proba(self, X):
        """Return a 1-D array: column 1 of the fitted model's probabilities.

        Unlike the scikit-learn convention this is not a two-column
        matrix; callers compare it directly against a threshold.
        """
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(X.multiply(self._r))[:, 1]

    def fit(self, X, y):
        """Fit the log-count ratio and the logistic regression.

        Parameters
        ----------
        X: scipy sparse matrix, features

        y: pandas Series or array-like of 0 / 1 labels

        Return
        ------
        self
        """
        # np.asarray accepts a Series as well as a plain array or list.
        # CSR is requested because boolean row indexing is used below.
        X, y = check_X_y(X, np.asarray(y), accept_sparse='csr')

        def smoothed_rate(label):
            # add-one smoothed per-feature sum over the rows of one class
            rows = y == label
            return (X[rows].sum(0) + 1) / (rows.sum() + 1)

        log_ratio = np.log(smoothed_rate(1) / smoothed_rate(0))
        self._r = sparse.csr_matrix(log_ratio)
        X_nb = X.multiply(self._r)
        # The dual formulation is only implemented by liblinear, so the
        # solver is named explicitly instead of relying on the default.
        self._clf = LogisticRegression(
            C=self.C,
            dual=self.dual,
            solver='liblinear',
            n_jobs=self.n_jobs
        ).fit(X_nb, y)
        return self


def get_model():
    """Return an unfitted :class:`NbSvmClassifier` with default settings."""
    return NbSvmClassifier()


def load_and_preprocess(datapath):
    """
    load and preprocess

    Parameters
    ----------
    datapath: str, data directory that contains train.csv and test.csv

    Returns
    -------
    df_train, df_test: dataframe with raw text

    X_train, X_test: matrix with proper features
    """
    print("loading data ......")
    df_train = pd.read_csv(os.path.join(datapath, "train.csv"))
    df_test = pd.read_csv(os.path.join(datapath, "test.csv"))
    train_test_cut = df_train.shape[0]
    print("train data with shape : ", df_train.shape)
    print("test data with shape : ", df_test.shape)
    # concat text data into single dataframe so that train and test share
    # one vocabulary
    df_all = pd.concat(
        [df_train[['question_text']], df_test[['question_text']]],
        axis=0).reset_index(drop=True)
    # transform
    X_features = transform(df_all['question_text'])
    X_train = X_features[:train_test_cut]
    X_test = X_features[train_test_cut:]
    return df_train, df_test, X_train, X_test


def create_submission(X_train, y_train, X_test, df_test, thres):
    """
    train model with entire training data, predict test data,
    and create submission file

    Parameters
    ----------
    X_train, y_train, X_test: features and targets

    df_test: dataframe, test data (its qid column is copied to the output)

    thres: float, a decision threshold for classification

    Return
    ------
    df_submission: dataframe with columns qid and prediction (0 / 1)
    """
    # get model
    model = get_model()
    # train model
    print('fitting model')
    model = model.fit(X_train, y_train)
    # predict
    print('predicting probas')
    y_pred = (model.predict_proba(X_test) > thres).astype('int')
    # create submission file
    return pd.DataFrame({'qid': df_test.qid, 'prediction': y_pred})


def main():
    """Run the whole pipeline and write the submission file."""
    # config
    data_path = '../input/'
    file_path = 'submission.csv'
    thres = 0.23

    t0 = time.time()
    # 1. load and preprocess data
    with timer("Load and Preprocess"):
        df_train, df_test, X_train, X_test = load_and_preprocess(data_path)
    # 2. create submission file
    with timer('Trainning and Creating Submission'):
        df_submission = create_submission(
            X_train, df_train.target,
            X_test, df_test, thres)
        df_submission.to_csv(file_path, index=False)
        print('Save submission file to {}'.format(file_path))
    # record time spent
    print('Entire program is done and it took {:.2f}s'.format(
        time.time() - t0))


if __name__ == '__main__':
    main()