from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Dropout, Embedding, SpatialDropout1D, Bidirectional, InputSpec
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle
import numpy as np
from keras.engine.topology import Layer
from keras import initializers
from keras import backend as K


class KerasTextClassifier(BaseEstimator, TransformerMixin):
    '''Wrapper class for Keras text classification models that takes raw text as input.'''

    def __init__(self, max_words=30000, input_length=50, emb_dim=50, n_classes=10):
        self.max_words = max_words
        self.input_length = input_length
        self.emb_dim = emb_dim
        self.n_classes = n_classes
        self.return_attention = True
        self.model = self._get_model()
        self.encoder = LabelEncoder()
        # num_words caps the vocabulary; index max_words + 1 is reserved for the OOV token.
        self.tokenizer = Tokenizer(num_words=self.max_words + 1,
                                   filters='!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n',
                                   lower=True,
                                   split=' ',
                                   oov_token="UNK")

    def _get_model(self):
        d = 0.5   # dropout on the LSTM inputs
        rd = 0.5  # recurrent dropout inside the LSTM
        rnn_units = 128
        input_text = Input((self.input_length,))
        # input_dim = max_words + 2 leaves room for the padding index (0) and the OOV index.
        text_embedding = Embedding(input_dim=self.max_words + 2,
                                   output_dim=self.emb_dim,
                                   input_length=self.input_length,
                                   mask_zero=True)(input_text)
        text_embedding = SpatialDropout1D(0.5)(text_embedding)
        bilstm = Bidirectional(LSTM(units=rnn_units,
                                    return_sequences=True,
                                    dropout=d,
                                    recurrent_dropout=rd))(text_embedding)
        x, attn = AttentionWeightedAverage(return_attention=True)(bilstm)
        x = Dropout(0.5)(x)
        out = Dense(units=self.n_classes, activation="softmax")(x)
        model = Model(input_text, out)
        return model

    def _get_attention_map(self, texts):
        # Rebuild a sub-model that stops at the attention layer; its predict()
        # returns [h_star, alpha], and we keep only the attention weights alpha.
        # No compile step is needed, since the sub-model is used only for inference.
        att_layers = self.model.layers[0:-2]
        att_model = Model(att_layers[0].input, att_layers[-1].output)
        return att_model.predict(self._get_sequences(texts))[1]

    def _get_sequences(self, texts):
        seqs = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(seqs, maxlen=self.input_length,
                             value=0, padding='post', truncating='post')

    def _labels(self, labels):
        return self.encoder.transform(labels)

    def fit(self, X, y, X_val=None, y_val=None, lr=0.001,
            resume=False, epochs=10, batch_size=32):
        '''Fit the vocabulary and the model.

        :params:
        X: list of texts
        y: labels
        X_val: list of texts for validation
        y_val: labels for validation
        '''
        self.model.compile(optimizer=RMSprop(clipnorm=10., lr=lr),
                           loss="sparse_categorical_crossentropy",
                           metrics=["accuracy"])
        if not resume:
            self.tokenizer.fit_on_texts(X)
            self.encoder.fit(y)
            # The Keras Tokenizer does not truncate word_index to num_words on its
            # own, so cap it manually and pin the OOV token to index max_words + 1.
            self.tokenizer.word_index = {e: i for e, i in self.tokenizer.word_index.items()
                                         if i <= self.max_words}
            self.tokenizer.word_index[self.tokenizer.oov_token] = self.max_words + 1
        else:
            print("Resuming training...")
        seqs = self._get_sequences(X)
        categorical_y = self._labels(y)
        print("Fit text model with {} classes".format(len(self.encoder.classes_)))
        if X_val is not None:
            val_seqs = self._get_sequences(X_val)
            categorical_y_val = self._labels(y_val)
            self.model.fit(seqs, categorical_y, batch_size=batch_size,
                           epochs=epochs,
                           validation_data=(val_seqs, categorical_y_val))
        else:
            self.model.fit(seqs, categorical_y, batch_size=batch_size,
                           epochs=epochs, validation_split=0.1)

    def predict_proba(self, X, y=None):
        return self.model.predict(self._get_sequences(X))

    def predict(self, X, y=None):
        return np.argmax(self.predict_proba(X), axis=1)

    def save(self, path="model"):
        self.model.save_weights('{}_weights.h5'.format(path))
        with open("{}_index.pkl".format(path), "wb") as f:
            pickle.dump([self.encoder, self.tokenizer, self.max_words,
                         self.emb_dim, self.input_length, self.n_classes], f)

    def load(self, path="model"):
        with open("{}_index.pkl".format(path), "rb") as f:
            self.encoder, self.tokenizer, self.max_words, self.emb_dim, \
                self.input_length, self.n_classes = pickle.load(f)
        self.model = self._get_model()
        self.model.load_weights('{}_weights.h5'.format(path))


class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average over the timesteps, with attention weights
    learned by a single trainable vector, following:

    Zhou, Peng, Wei Shi, Jun Tian, Zhenyu Qi, Bingchen Li, Hongwei Hao and Bo Xu.
    "Attention-Based Bidirectional Long Short-Term Memory Networks for
    Relation Classification." ACL (2016).
    http://www.aclweb.org/anthology/P16-2034

    How to use: see [BLOGPOST]
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3
        # add_weight registers self.w as a trainable weight of the layer.
        self.w = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_w'.format(self.name),
                                 initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, h, mask=None):
        h_shape = K.shape(h)
        d_w, T = h_shape[0], h_shape[1]  # batch size, number of timesteps
        logits = K.dot(h, self.w)        # w^T h_t for every timestep
        logits = K.reshape(logits, (d_w, T))
        # numerically stable softmax over the timesteps
        alpha = K.exp(logits - K.max(logits, axis=-1, keepdims=True))
        # masked timesteps get zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            alpha = alpha * mask
        alpha = alpha / (K.sum(alpha, axis=1, keepdims=True) + K.epsilon())
        r = K.sum(h * K.expand_dims(alpha), axis=1)  # r = sum_t alpha_t * h_t
        h_star = K.tanh(r)                           # h^* = tanh(r)
        if self.return_attention:
            return [h_star, alpha]
        return h_star

    def get_output_shape_for(self, input_shape):
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len),
                    (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, inputs, input_mask=None):
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        return None
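

# Minimal usage sketch (not part of the original module): a tiny end-to-end
# run on made-up data. The texts, labels, file prefix, and shrunken
# hyperparameters below are all illustrative only.
if __name__ == "__main__":
    texts = ["the food was great and the staff was friendly",
             "terrible service and a cold, bland meal",
             "lovely atmosphere, i would come back again",
             "never again, the worst experience of my life",
             "excellent dinner, generous portions",
             "rude waiters and an hour-long wait"]
    labels = ["pos", "neg", "pos", "neg", "pos", "neg"]

    clf = KerasTextClassifier(max_words=100, input_length=10,
                              emb_dim=16, n_classes=2)
    clf.fit(texts, labels, epochs=2, batch_size=2)

    print(clf.predict(texts))        # class indices from the LabelEncoder
    print(clf.predict_proba(texts))  # softmax probabilities, shape (6, 2)

    # Per-token attention weights, shape (n_samples, input_length); padded
    # positions receive zero weight thanks to mask_zero=True.
    attention = clf._get_attention_map(texts)
    print(attention.shape)

    # Round-trip the model through save/load and check predictions agree.
    clf.save("demo_model")
    restored = KerasTextClassifier()
    restored.load("demo_model")
    assert (restored.predict(texts) == clf.predict(texts)).all()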