from __future__ import absolute_import

import numpy as np
from scipy import sparse
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from keras import regularizers
from keras.regularizers import l1, l2, l1_l2
from keras.utils import multi_gpu_model
from keras.models import Sequential, Model
from keras.layers import Input, Dropout, Concatenate, Embedding, BatchNormalization
from keras.layers import Dense, Bidirectional, LSTM, GRU, CuDNNLSTM, CuDNNGRU
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D

try:
    from utils import GlobalZeroMaskedAveragePooling1D, GlobalSumPooling1D
except ImportError:
    from .utils import GlobalZeroMaskedAveragePooling1D, GlobalSumPooling1D


def cnn(embedding_matrix, char_matrix, num_classes, max_seq_len, max_ll3_seq_len,
        num_filters=64, l2_weight_decay=0.0001, dropout_val=0.5, dense_dim=32,
        add_sigmoid=True, train_embeds=False, gpus=0, n_cnn_layers=1, pool='max',
        add_embeds=False):
    """Word-level 1D CNN over pretrained embeddings."""
    if pool == 'max':
        Pooling = MaxPooling1D
        GlobalPooling = GlobalMaxPooling1D
    elif pool == 'avg':
        Pooling = AveragePooling1D
        GlobalPooling = GlobalAveragePooling1D
    else:
        raise ValueError("pool must be 'max' or 'avg', got {!r}".format(pool))
    input_ = Input(shape=(max_seq_len,))
    embeds = Embedding(embedding_matrix.shape[0],
                       embedding_matrix.shape[1],
                       weights=[embedding_matrix],
                       input_length=max_seq_len,
                       trainable=train_embeds)(input_)
    x = embeds
    # Stacked conv blocks; the last block uses global pooling instead of local.
    for i in range(n_cnn_layers - 1):
        x = Conv1D(num_filters, 7, activation='relu', padding='same')(x)
        x = Pooling(2)(x)
    x = Conv1D(num_filters, 7, activation='relu', padding='same')(x)
    x = GlobalPooling()(x)
    if add_embeds:
        # Optional shortcut branch: a single conv applied directly to the embeddings.
        x1 = Conv1D(num_filters, 7, activation='relu', padding='same')(embeds)
        x1 = GlobalPooling()(x1)
        x = Concatenate()([x, x1])
    x = BatchNormalization()(x)
    x = Dropout(dropout_val)(x)
    x = Dense(dense_dim, activation='relu', kernel_regularizer=regularizers.l2(l2_weight_decay))(x)
    if add_sigmoid:
        x = Dense(num_classes, activation='sigmoid')(x)
    model = Model(inputs=input_, outputs=x)
    if gpus > 0:
        model = multi_gpu_model(model, gpus=gpus)
    return model
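# --- Usage sketch (illustrative, not part of the original pipeline) ---
# A minimal example of building the CNN model above. The embedding matrix is a
# random stand-in for pretrained word vectors, and num_classes=6 assumes a
# six-label multi-label setup; both are assumptions made for illustration.
def _example_cnn():
    embedding_matrix = np.random.rand(10000, 300)    # hypothetical 10k-word vocab, 300-d vectors
    model = cnn(embedding_matrix, char_matrix=None,  # char_matrix is unused by cnn()
                num_classes=6, max_seq_len=100, max_ll3_seq_len=0,
                n_cnn_layers=2, add_embeds=True)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model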
def _get_regularizer(regularizer_name, weight):
    """Map a regularizer name to a keras regularizer instance."""
    if regularizer_name is None:
        return None
    if regularizer_name == 'l1':
        return l1(weight)
    if regularizer_name == 'l2':
        return l2(weight)
    if regularizer_name == 'l1_l2':
        return l1_l2(weight, weight)  # same weight for both penalties
    return None


def rnn(embedding_matrix, char_matrix, num_classes, max_seq_len, max_ll3_seq_len,
        l2_weight_decay=0.0001, rnn_dim=100, dropout_val=0.3, dense_dim=32,
        n_rnn_layers=1, n_dense_layers=1, add_sigmoid=True, train_embeds=False,
        gpus=0, rnn_type='lstm', mask_zero=True, kernel_regularizer=None,
        recurrent_regularizer=None, activity_regularizer=None, dropout=0.0,
        recurrent_dropout=0.0, pool='max', add_embeds=True, return_state=False):
    """Bidirectional RNN over pretrained embeddings, optionally concatenated
    with globally pooled embeddings from a second, unmasked branch."""
    GlobalPool = {
        'avg': GlobalZeroMaskedAveragePooling1D,
        'max': GlobalMaxPooling1D,
        'sum': GlobalSumPooling1D
    }
    rnn_regularizers = {'kernel_regularizer': _get_regularizer(kernel_regularizer, l2_weight_decay),
                        'recurrent_regularizer': _get_regularizer(recurrent_regularizer, l2_weight_decay),
                        'activity_regularizer': _get_regularizer(activity_regularizer, l2_weight_decay)}
    if gpus == 0:
        # The CuDNN implementations do not support these arguments.
        rnn_regularizers['dropout'] = dropout
        rnn_regularizers['recurrent_dropout'] = recurrent_dropout
    if rnn_type == 'lstm':
        RNN = LSTM  # CuDNNLSTM if gpus > 0 else LSTM
    elif rnn_type == 'gru':
        RNN = GRU  # CuDNNGRU if gpus > 0 else GRU
    else:
        raise ValueError("rnn_type must be 'lstm' or 'gru', got {!r}".format(rnn_type))
    # CuDNN layers do not support masking.
    mask_zero = mask_zero and gpus == 0
    input_ = Input(shape=(max_seq_len,))
    embeds = Embedding(embedding_matrix.shape[0],
                       embedding_matrix.shape[1],
                       weights=[embedding_matrix],
                       input_length=max_seq_len,
                       mask_zero=mask_zero,
                       trainable=train_embeds)(input_)
    x = embeds
    for _ in range(n_rnn_layers - 1):
        x = Bidirectional(RNN(rnn_dim, return_sequences=True, **rnn_regularizers))(x)
    x = Bidirectional(RNN(rnn_dim, return_sequences=False, return_state=return_state, **rnn_regularizers))(x)
    if return_state:
        # Bidirectional returns [output, forward states..., backward states...].
        x = Concatenate()(x)
    if add_embeds:
        # Second, unmasked embedding branch for global pooling.
        embeds2 = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=max_seq_len,
                            mask_zero=False,
                            trainable=train_embeds)(input_)
        if isinstance(pool, list) and len(pool) > 1:
            to_concat = []
            for p in pool:
                to_concat.append(GlobalPool[p]()(embeds2))
            x1 = Concatenate()(to_concat)
        else:
            x1 = GlobalPool[pool]()(embeds2)
        x = Concatenate()([x, x1])
    x = BatchNormalization()(x)
    x = Dropout(dropout_val)(x)
    for _ in range(n_dense_layers - 1):
        x = Dense(dense_dim, activation="relu")(x)
        x = Dropout(dropout_val)(x)
    x = Dense(dense_dim, activation="relu", kernel_regularizer=regularizers.l2(l2_weight_decay))(x)
    if add_sigmoid:
        x = Dense(num_classes, activation="sigmoid")(x)
    model = Model(inputs=input_, outputs=x)
    if gpus > 0:
        model = multi_gpu_model(model, gpus=gpus)
    return model


def dense(embedding_matrix, ll3_matrix, num_classes, max_seq_len, max_ll3_seq_len,
          dense_dim=100, n_layers=10, concat=0, dropout_val=0.5,
          l2_weight_decay=0.0001, pool='max', add_sigmoid=True,
          train_embeds=False, gpus=0, add_ll3=True):
    """Deep dense network over pooled word and letter-trigram (ll3) embeddings."""
    GlobalPool = {
        'avg': GlobalZeroMaskedAveragePooling1D,
        'max': GlobalMaxPooling1D,
        'sum': GlobalSumPooling1D
    }
    input_ = Input(shape=(max_seq_len,))
    input2_ = Input(shape=(max_ll3_seq_len,))
    embeds = Embedding(embedding_matrix.shape[0],
                       embedding_matrix.shape[1],
                       weights=[embedding_matrix],
                       input_length=max_seq_len,
                       trainable=train_embeds)(input_)
    ll3_embeds = Embedding(ll3_matrix.shape[0],
                           ll3_matrix.shape[1],
                           weights=[ll3_matrix],
                           input_length=max_ll3_seq_len,
                           trainable=True)(input2_)
    if isinstance(pool, list) and len(pool) > 1:
        to_concat = []
        for p in pool:
            to_concat.append(GlobalPool[p]()(embeds))
            if add_ll3:
                to_concat.append(GlobalPool[p]()(ll3_embeds))
        x = Concatenate()(to_concat)
    else:
        x = GlobalPool[pool]()(embeds)
        if add_ll3:
            x1 = GlobalPool[pool]()(ll3_embeds)
            x = Concatenate()([x, x1])
    x = BatchNormalization()(x)
    prev = []
    for i in range(n_layers):
        if concat > 0:
            # Every `concat` layers, concatenate the saved intermediate
            # activations (a DenseNet-style shortcut).
            if i == 0:
                prev.append(x)
                continue
            elif i % concat == 0:
                prev.append(x)
                x = Concatenate(axis=-1)(prev)
        x = Dense(dense_dim, activation="relu")(x)
        x = BatchNormalization()(x)
        x = Dropout(dropout_val)(x)
    output_ = Dense(dense_dim, activation="relu", kernel_regularizer=regularizers.l2(l2_weight_decay))(x)
    if add_sigmoid:
        output_ = Dense(num_classes, activation="sigmoid")(output_)
    if add_ll3:
        model = Model(inputs=[input_, input2_], outputs=output_)
    else:
        model = Model(inputs=input_, outputs=output_)
    if gpus > 0:
        model = multi_gpu_model(model, gpus=gpus)
    return model
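# --- Usage sketches (illustrative, not part of the original pipeline) ---
# How `rnn` and `dense` might be called. All matrices are random stand-ins for
# the pretrained word vectors and letter-trigram (ll3) embeddings; the shapes
# and hyperparameters are assumptions made for illustration only.
def _example_rnn():
    embedding_matrix = np.random.rand(10000, 300)
    model = rnn(embedding_matrix, char_matrix=None,  # char_matrix is unused by rnn()
                num_classes=6, max_seq_len=100, max_ll3_seq_len=0,
                rnn_type='gru', n_rnn_layers=2, pool=['max', 'avg'])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model


def _example_dense():
    embedding_matrix = np.random.rand(10000, 300)
    ll3_matrix = np.random.rand(5000, 64)  # hypothetical letter-trigram vocabulary
    model = dense(embedding_matrix, ll3_matrix,
                  num_classes=6, max_seq_len=100, max_ll3_seq_len=400,
                  n_layers=5, concat=2)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model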
class TFIDF(object):
    """One-vs-rest logistic regression over word and char TF-IDF features."""

    def __init__(self, target_labels, *args, **kwargs):
        self.target_labels = target_labels
        self.n_classes = len(target_labels)
        params = {
            'C': 4.0,
            'solver': 'sag',
            'max_iter': 1000,
            'n_jobs': 16
        }
        params.update(kwargs)
        self.models = [LogisticRegression(*args, **params) for _ in range(self.n_classes)]
        self.word_tfidf = None
        self.char_tfidf = None

    def fit(self, X, y, max_features=50000):
        assert np.shape(y)[1] == self.n_classes
        x_tfidf = self.fit_tfidf(X, max_features)
        for i, model in enumerate(self.models):
            model.fit(x_tfidf, y[:, i])

    def predict(self, X):
        y = []
        x_tfidf = self.transform_tfidf(X)
        for model in self.models:
            y.append(model.predict(x_tfidf))
        return np.transpose(y)

    def fit_tfidf(self, X, max_features):
        self.word_tfidf = TfidfVectorizer(max_features=max_features,
                                          analyzer='word',
                                          lowercase=True,
                                          ngram_range=(1, 3),
                                          token_pattern='[a-zA-Z0-9]')
        self.char_tfidf = TfidfVectorizer(max_features=max_features,
                                          analyzer='char',
                                          lowercase=True,
                                          ngram_range=(1, 5),
                                          token_pattern='[a-zA-Z0-9]')
        tfidf_word = self.word_tfidf.fit_transform(X)
        tfidf_char = self.char_tfidf.fit_transform(X)
        return sparse.hstack([tfidf_word, tfidf_char])

    def transform_tfidf(self, X):
        assert self.word_tfidf is not None and self.char_tfidf is not None
        tfidf_word = self.word_tfidf.transform(X)
        tfidf_char = self.char_tfidf.transform(X)
        return sparse.hstack([tfidf_word, tfidf_char])


class CatBoost(object):
    """One-vs-rest CatBoost classifiers, one per target label."""

    def __init__(self, target_labels, *args, **kwargs):
        self.target_labels = target_labels
        self.n_classes = len(target_labels)
        self.models = [CatBoostClassifier(*args, **kwargs) for _ in range(self.n_classes)]

    def fit(self, X, y, eval_set=None, use_best_model=True):
        assert np.shape(y)[1] == self.n_classes
        for i, model in enumerate(self.models):
            if eval_set is not None:
                eval_set_i = (eval_set[0], eval_set[1][:, i])
            else:
                eval_set_i = None
            model.fit(X, y[:, i], eval_set=eval_set_i, use_best_model=use_best_model)

    def predict(self, X):
        y = []
        for model in self.models:
            y.append(model.predict(X))
        return np.transpose(y)

    def predict_proba(self, X):
        y = []
        for model in self.models:
            y.append(model.predict_proba(X)[:, 1])  # probability of the positive class
        return np.transpose(y)


def save_predictions(df, predictions, target_labels, additional_name=None):
    """Write one prediction column per target label into `df` (in place)."""
    for i, label in enumerate(target_labels):
        if additional_name is not None:
            label = '{}_{}'.format(additional_name, label)
        df[label] = predictions[:, i]
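# --- Usage sketches (illustrative, not part of the original pipeline) ---
# Toy end-to-end run of the TFIDF wrapper plus save_predictions. The label
# names, texts, and targets below are made-up stand-ins; the real pipeline
# feeds preprocessed comments and a multi-label target matrix.
def _example_tfidf():
    import pandas as pd  # assumed available; only needed for this sketch
    labels = ['toxic', 'obscene']
    texts = ['you are great', 'you are awful'] * 50
    y = np.array([[0, 0], [1, 1]] * 50)          # alternating multi-label targets
    clf = TFIDF(labels, max_iter=200, n_jobs=1)  # kwargs override the defaults
    clf.fit(texts, y, max_features=1000)
    preds = clf.predict(texts)                   # shape (n_samples, n_labels)
    df = pd.DataFrame({'text': texts})
    save_predictions(df, preds, labels, additional_name='tfidf')
    return df


# The CatBoost wrapper is driven the same way, but over dense feature matrices
# (e.g. out-of-fold predictions from the neural models). Note that
# use_best_model must be False when no eval_set is supplied.
def _example_catboost():
    X = np.random.rand(200, 10)
    y = np.random.randint(0, 2, size=(200, 2))
    clf = CatBoost(['toxic', 'obscene'], iterations=20)
    clf.fit(X, y, use_best_model=False)
    return clf.predict_proba(X)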