import os os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID' os.environ['CUDA_VISIBLE_DEVICES']='0' import numpy as np import pandas as pd import gc import keras.backend as K from keras.models import Model from keras.layers import Input, Dense, Flatten, Embedding, Dropout, PReLU,ReLU from keras.layers import Bidirectional, SpatialDropout1D, CuDNNGRU,CuDNNLSTM, Conv1D,Conv2D,MaxPool2D,Reshape from keras.layers import GlobalAvgPool1D, GlobalMaxPool1D, concatenate,GlobalMaxPooling1D,GlobalAveragePooling1D from keras.regularizers import l2,l1 from keras.layers.normalization import BatchNormalization from keras.engine import Layer from keras.layers.core import Flatten from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau,EarlyStopping from keras.callbacks import Callback from keras.optimizers import Optimizer from keras import backend as K, initializers, regularizers, constraints from keras.engine.topology import Layer from keras.datasets import reuters from keras.models import Sequential from keras.layers import Dense, Dropout, Activation,BatchNormalization from keras.preprocessing.text import Tokenizer from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer def squash(x, axis=-1): s_squared_norm = K.sum(K.square(x), axis, keepdims=True) scale = K.sqrt(s_squared_norm + K.epsilon()) return x / scale def dot_product(x, kernel): """ Wrapper for dot product operation, in order to be compatible with both Theano and Tensorflow Args: x (): input kernel (): weights Returns: """ if K.backend() == 'tensorflow': return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1) else: return K.dot(x, kernel) class AttentionWithContext(Layer): """ Attention operation, with a context/query vector, for temporal data. Supports Masking. Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] "Hierarchical Attention Networks for Document Classification" by using a context vector to assist the attention # Input shape 3D tensor with shape: `(samples, steps, features)`. # Output shape 2D tensor with shape: `(samples, features)`. How to use: Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. The dimensions are inferred based on the output shape of the RNN. Note: The layer has been tested with Keras 2.0.6 Example: model.add(LSTM(64, return_sequences=True)) model.add(AttentionWithContext()) # next add a Dense layer (for classification/regression) or whatever... """ def __init__(self, W_regularizer=None, u_regularizer=None, b_regularizer=None, W_constraint=None, u_constraint=None, b_constraint=None, bias=True, **kwargs): self.supports_masking = True self.init = initializers.get('glorot_uniform') self.W_regularizer = regularizers.get(W_regularizer) self.u_regularizer = regularizers.get(u_regularizer) self.b_regularizer = regularizers.get(b_regularizer) self.W_constraint = constraints.get(W_constraint) self.u_constraint = constraints.get(u_constraint) self.b_constraint = constraints.get(b_constraint) self.bias = bias super(AttentionWithContext, self).__init__(**kwargs) def build(self, input_shape): assert len(input_shape) == 3 self.W = self.add_weight((input_shape[-1], input_shape[-1],), initializer=self.init, name='{}_W'.format(self.name), regularizer=self.W_regularizer, constraint=self.W_constraint) if self.bias: self.b = self.add_weight((input_shape[-1],), initializer='zero', name='{}_b'.format(self.name), regularizer=self.b_regularizer, constraint=self.b_constraint) self.u = self.add_weight((input_shape[-1],), initializer=self.init, name='{}_u'.format(self.name), regularizer=self.u_regularizer, constraint=self.u_constraint) super(AttentionWithContext, self).build(input_shape) def compute_mask(self, input, input_mask=None): # do not pass the mask to the next layers return None def call(self, x, mask=None): uit = dot_product(x, self.W) if self.bias: uit += self.b uit = K.tanh(uit) ait = dot_product(uit, self.u) a = K.exp(ait) # apply mask after the exp. will be re-normalized next if mask is not None: # Cast the mask to floatX to avoid float64 upcasting in theano a *= K.cast(mask, K.floatx()) # in some cases especially in the early stages of training the sum may be almost zero # and this results in NaN's. A workaround is to add a very small positive number ε to the sum. # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx()) a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) a = K.expand_dims(a) weighted_input = x * a return K.sum(weighted_input, axis=1) def compute_output_shape(self, input_shape): return input_shape[0], input_shape[-1] def RnnVersion1( n_recurrent=50, n_filters=30, dropout_rate=0.2, l2_penalty=0.0001,n_capsule = 10, n_routings = 5, capsule_dim = 16): K.clear_session() def conv_block(x, n, kernel_size): x = Conv1D(n, kernel_size, activation='relu') (x) x = Conv1D(n_filters, kernel_size, activation='relu') (x) x_att = AttentionWithContext()(x) x_avg = GlobalAveragePooling1D()(x) x_max = GlobalMaxPooling1D()(x) return concatenate([x_att, x_avg, x_max]) def att_max_avg_pooling(x): x_att = AttentionWithContext()(x) x_avg = GlobalAveragePooling1D()(x) x_max = GlobalMaxPooling1D()(x) return concatenate([x_att, x_avg, x_max]) inputs = Input(shape=(170,)) emb = Embedding(21099, 300, trainable=True)(inputs) # model 0 x0 = BatchNormalization()(emb) x0 = SpatialDropout1D(dropout_rate)(x0) x0 = Bidirectional( CuDNNGRU(n_recurrent, return_sequences=True, kernel_regularizer=l2(l2_penalty), recurrent_regularizer=l2(l2_penalty)))(x0) x0 = Conv1D(n_filters, kernel_size=3)(x0) x0 = PReLU()(x0) # x0 = Dropout(dropout_rate)(x0) x0 = att_max_avg_pooling(x0) # model 1 x1 = SpatialDropout1D(dropout_rate)(emb) x1 = Bidirectional( CuDNNGRU(2*n_recurrent, return_sequences=True, kernel_regularizer=l2(l2_penalty), recurrent_regularizer=l2(l2_penalty)))(x1) x1 = Conv1D(2*n_filters, kernel_size=2)(x1) x1 = PReLU()(x1) # x1 = Dropout(dropout_rate)(x1) x1 = att_max_avg_pooling(x1) x = concatenate([x0, x1],name='concatenate') # fc = Dense(128, activation='sigmoid')(x) outputs = Dense(6, activation='softmax')(x)# , kernel_regularizer=l2(l2_penalty), activity_regularizer=l2(l2_penalty) model = Model(inputs=inputs, outputs=outputs) model.compile(loss='categorical_crossentropy', optimizer='Nadam',metrics =['accuracy']) return model def mlp_v2(): model = Sequential() model.add(Dense(2048, input_shape=(21099,))) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(BatchNormalization()) # model.add(Dense(1024)) # model.add(Activation('relu')) # model.add(Dropout(0.5)) # model.add(BatchNormalization()) model.add(Dense(512)) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(BatchNormalization()) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(BatchNormalization()) model.add(Dense(6)) model.add(Activation('softmax')) model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy']) return model def RnnVersion2(n_recurrent=50, n_dense=50, word_embedding_matrix= None, n_filters=50,dropout_rate=0.2, l2_penalty=0.0001, n_capsule = 10, n_routings = 5, capsule_dim = 16,max_len = 170, emb_size = 21099): K.clear_session() def conv_block(x, n, kernel_size): x = Conv1D(n, kernel_size, activation='relu') (x) x = Conv1D(n_filters, kernel_size, activation='relu') (x) x_att = AttentionWithContext()(x) x_avg = GlobalAvgPool1D()(x) x_max = GlobalMaxPool1D()(x) return concatenate([x_att, x_avg, x_max]) def att_max_avg_pooling(x): x_att = AttentionWithContext()(x) x_avg = GlobalAvgPool1D()(x) x_max = GlobalMaxPool1D()(x) return concatenate([x_att, x_avg, x_max]) inputs = Input(shape=(max_len,)) emb = Embedding(emb_size, 300,trainable=True)(inputs) # model 0 x0 = SpatialDropout1D(dropout_rate)(emb) s0 = Bidirectional( CuDNNGRU(2*n_recurrent, return_sequences=True, kernel_regularizer=l2(l2_penalty), recurrent_regularizer=l2(l2_penalty)))(x0) x0 = att_max_avg_pooling(s0) # model 1 x1 = SpatialDropout1D(dropout_rate)(emb) s1 = Bidirectional( CuDNNGRU(2*n_recurrent, return_sequences=True, kernel_regularizer=l2(l2_penalty), recurrent_regularizer=l2(l2_penalty)))(x1) x1 = att_max_avg_pooling(s1) # combine sequence output x = concatenate([s0, s1]) # x = att_max_avg_pooling(x) x = Bidirectional( CuDNNGRU(n_recurrent, return_sequences=True, kernel_regularizer=l2(l2_penalty), recurrent_regularizer=l2(l2_penalty)))(x) x = att_max_avg_pooling(x) # combine it all x = concatenate([x,x0, x1],name = 'concatenate') outputs = Dense(6, activation='softmax')(x) model = Model(inputs=inputs, outputs=outputs) model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics =['accuracy']) return model def RnnVersion3(n_recurrent=50, n_dense=50, word_embedding_matrix= None, n_filters=50,dropout_rate=0.2, l2_penalty=0.0001, n_capsule = 10, n_routings = 5, capsule_dim = 16,): K.clear_session() def conv_block(x, n, kernel_size): x = Conv1D(n, kernel_size, activation='relu') (x) x = Conv1D(n_filters, kernel_size, activation='relu') (x) x_att = AttentionWithContext()(x) x_avg = GlobalAvgPool1D()(x) x_max = GlobalMaxPool1D()(x) return concatenate([x_att, x_avg, x_max]) def att_max_avg_pooling(x): x_att = AttentionWithContext()(x) x_avg = GlobalAvgPool1D()(x) x_max = GlobalMaxPool1D()(x) return concatenate([x_att, x_avg, x_max]) input1_= Input(shape=(170, ), name='input1') input2_ = Input(shape=(433, ), name='input2') emb = Embedding(21099, 300,trainable=True)(input1_) # model 0 x0 = SpatialDropout1D(dropout_rate)(emb) s0 = Bidirectional( CuDNNGRU(2*n_recurrent, return_sequences=True, kernel_regularizer=l2(l2_penalty), recurrent_regularizer=l2(l2_penalty)))(x0) x0 = att_max_avg_pooling(s0) # model 1 x1 = SpatialDropout1D(dropout_rate)(emb) s1 = Bidirectional( CuDNNGRU(2*n_recurrent, return_sequences=True, kernel_regularizer=l2(l2_penalty), recurrent_regularizer=l2(l2_penalty)))(x1) x1 = att_max_avg_pooling(s1) # combine sequence output x = concatenate([s0, s1]) # x = att_max_avg_pooling(x) x = Bidirectional( CuDNNGRU(n_recurrent, return_sequences=True, kernel_regularizer=l2(l2_penalty), recurrent_regularizer=l2(l2_penalty)))(x) x = att_max_avg_pooling(x) # combine it all x = concatenate([x,x0, x1,input2_],name = 'concatenate') x = Dense(1024, activation='relu')(x) x = Dropout(dropout_rate)(x) x = Dense(256, activation='relu')(x) x = Dropout(dropout_rate)(x) x = Dense(128, activation='relu')(x) x = Dropout(dropout_rate)(x) # fc = Dense(120, activation='relu')(x) outputs = Dense(6, activation='softmax')(x) model = Model(inputs=[input1_,input2_], outputs=outputs) model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics =['accuracy']) return model class Capsule(Layer): def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True, activation='default', **kwargs): super(Capsule, self).__init__(**kwargs) self.num_capsule = num_capsule self.dim_capsule = dim_capsule self.routings = routings self.kernel_size = kernel_size self.share_weights = share_weights if activation == 'default': self.activation = squash else: self.activation = Activation(activation) def build(self, input_shape): super(Capsule, self).build(input_shape) input_dim_capsule = input_shape[-1] if self.share_weights: self.W = self.add_weight(name='capsule_kernel', shape=(1, input_dim_capsule, self.num_capsule * self.dim_capsule), # shape=self.kernel_size, initializer='glorot_uniform', trainable=True) else: input_num_capsule = input_shape[-2] self.W = self.add_weight(name='capsule_kernel', shape=(input_num_capsule, input_dim_capsule, self.num_capsule * self.dim_capsule), initializer='glorot_uniform', trainable=True) def call(self, u_vecs): if self.share_weights: u_hat_vecs = K.conv1d(u_vecs, self.W) else: u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1]) batch_size = K.shape(u_vecs)[0] input_num_capsule = K.shape(u_vecs)[1] u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule, self.num_capsule, self.dim_capsule)) u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3)) b = K.zeros_like(u_hat_vecs[:, :, :, 0]) # shape = [None, num_capsule, input_num_capsule] for i in range(self.routings): b = K.permute_dimensions(b, (0, 2, 1)) # shape = [None, input_num_capsule, num_capsule] c = K.softmax(b) c = K.permute_dimensions(c, (0, 2, 1)) b = K.permute_dimensions(b, (0, 2, 1)) outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2])) if i < self.routings - 1: b = K.batch_dot(outputs, u_hat_vecs, [2, 3]) return outputs def compute_output_shape(self, input_shape): return (None, self.num_capsule, self.dim_capsule) def dot_product(x, kernel): """ Wrapper for dot product operation, in order to be compatible with both Theano and Tensorflow Args: x (): input kernel (): weights Returns: """ if K.backend() == 'tensorflow': return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1) else: return K.dot(x, kernel) class AttentionWithContext(Layer): """ Attention operation, with a context/query vector, for temporal data. Supports Masking. Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] "Hierarchical Attention Networks for Document Classification" by using a context vector to assist the attention # Input shape 3D tensor with shape: `(samples, steps, features)`. # Output shape 2D tensor with shape: `(samples, features)`. How to use: Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. The dimensions are inferred based on the output shape of the RNN. Note: The layer has been tested with Keras 2.0.6 Example: model.add(LSTM(64, return_sequences=True)) model.add(AttentionWithContext()) # next add a Dense layer (for classification/regression) or whatever... """ def __init__(self, W_regularizer=None, u_regularizer=None, b_regularizer=None, W_constraint=None, u_constraint=None, b_constraint=None, bias=True, **kwargs): self.supports_masking = True self.init = initializers.get('glorot_uniform') self.W_regularizer = regularizers.get(W_regularizer) self.u_regularizer = regularizers.get(u_regularizer) self.b_regularizer = regularizers.get(b_regularizer) self.W_constraint = constraints.get(W_constraint) self.u_constraint = constraints.get(u_constraint) self.b_constraint = constraints.get(b_constraint) self.bias = bias super(AttentionWithContext, self).__init__(**kwargs) def build(self, input_shape): assert len(input_shape) == 3 self.W = self.add_weight((input_shape[-1], input_shape[-1],), initializer=self.init, name='{}_W'.format(self.name), regularizer=self.W_regularizer, constraint=self.W_constraint) if self.bias: self.b = self.add_weight((input_shape[-1],), initializer='zero', name='{}_b'.format(self.name), regularizer=self.b_regularizer, constraint=self.b_constraint) self.u = self.add_weight((input_shape[-1],), initializer=self.init, name='{}_u'.format(self.name), regularizer=self.u_regularizer, constraint=self.u_constraint) super(AttentionWithContext, self).build(input_shape) def compute_mask(self, input, input_mask=None): # do not pass the mask to the next layers return None def call(self, x, mask=None): uit = dot_product(x, self.W) if self.bias: uit += self.b uit = K.tanh(uit) ait = dot_product(uit, self.u) a = K.exp(ait) # apply mask after the exp. will be re-normalized next if mask is not None: # Cast the mask to floatX to avoid float64 upcasting in theano a *= K.cast(mask, K.floatx()) # in some cases especially in the early stages of training the sum may be almost zero # and this results in NaN's. A workaround is to add a very small positive number ε to the sum. # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx()) a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) a = K.expand_dims(a) weighted_input = x * a return K.sum(weighted_input, axis=1) def compute_output_shape(self, input_shape): return input_shape[0], input_shape[-1] class GetBest(Callback): """Get the best model at the end of training. # Arguments monitor: quantity to monitor. verbose: verbosity mode, 0 or 1. mode: one of {auto, min, max}. The decision to overwrite the current stored weights is made based on either the maximization or the minimization of the monitored quantity. For `val_acc`, this should be `max`, for `val_loss` this should be `min`, etc. In `auto` mode, the direction is automatically inferred from the name of the monitored quantity. period: Interval (number of epochs) between checkpoints. # Example callbacks = [GetBest(monitor='val_acc', verbose=1, mode='max')] mode.fit(X, y, validation_data=(X_eval, Y_eval), callbacks=callbacks) """ def __init__(self, monitor='val_loss', verbose=0, mode='auto', period=1): super(GetBest, self).__init__() self.monitor = monitor self.verbose = verbose self.period = period self.best_epochs = 0 self.epochs_since_last_save = 0 if mode not in ['auto', 'min', 'max']: warnings.warn('GetBest mode %s is unknown, ' 'fallback to auto mode.' % (mode), RuntimeWarning) mode = 'auto' if mode == 'min': self.monitor_op = np.less self.best = np.Inf elif mode == 'max': self.monitor_op = np.greater self.best = -np.Inf else: if 'acc' in self.monitor or self.monitor.startswith('fmeasure'): self.monitor_op = np.greater self.best = -np.Inf else: self.monitor_op = np.less self.best = np.Inf def on_train_begin(self, logs=None): self.best_weights = self.model.get_weights() def on_epoch_end(self, epoch, logs=None): logs = logs or {} self.epochs_since_last_save += 1 if self.epochs_since_last_save >= self.period: self.epochs_since_last_save = 0 #filepath = self.filepath.format(epoch=epoch + 1, **logs) current = logs.get(self.monitor) if current is None: warnings.warn('Can pick best model only with %s available, ' 'skipping.' % (self.monitor), RuntimeWarning) else: if self.monitor_op(current, self.best): if self.verbose > 0: print('\nEpoch %05d: %s improved from %0.5f to %0.5f,' ' storing weights.' % (epoch + 1, self.monitor, self.best, current)) self.best = current self.best_epochs = epoch + 1 self.best_weights = self.model.get_weights() else: if self.verbose > 0: print('\nEpoch %05d: %s did not improve' % (epoch + 1, self.monitor)) def on_train_end(self, logs=None): if self.verbose > 0: print('Using epoch %05d with %s: %0.5f' % (self.best_epochs, self.monitor, self.best)) self.model.set_weights(self.best_weights) class AMSgrad(Optimizer): """AMSGrad optimizer. Default parameters follow those provided in the Adam paper. # Arguments lr: float >= 0. Learning rate. beta_1: float, 0 < beta < 1. Generally close to 1. beta_2: float, 0 < beta < 1. Generally close to 1. epsilon: float >= 0. Fuzz factor. decay: float >= 0. Learning rate decay over each update. # References - [On the Convergence of Adam and Beyond](https://openreview.net/forum?id=ryQu7f-RZ) - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8) """ def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0., **kwargs): super(AMSgrad, self).__init__(**kwargs) with K.name_scope(self.__class__.__name__): self.iterations = K.variable(0, dtype='int64', name='iterations') self.lr = K.variable(lr, name='lr') self.beta_1 = K.variable(beta_1, name='beta_1') self.beta_2 = K.variable(beta_2, name='beta_2') self.decay = K.variable(decay, name='decay') self.epsilon = epsilon self.initial_decay = decay def get_updates(self, loss, params): grads = self.get_gradients(loss, params) self.updates = [K.update_add(self.iterations, 1)] lr = self.lr if self.initial_decay > 0: lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay)))) t = K.cast(self.iterations, K.floatx()) + 1 lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /(1. - K.pow(self.beta_1, t))) ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] self.weights = [self.iterations] + ms + vs + vhats for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats): m_t = (self.beta_1 * m) + (1. - self.beta_1) * g v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) vhat_t = K.maximum(vhat, v_t) p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon) self.updates.append(K.update(m, m_t)) self.updates.append(K.update(v, v_t)) self.updates.append(K.update(vhat, vhat_t)) new_p = p_t # Apply constraints. if getattr(p, 'constraint', None) is not None: new_p = p.constraint(new_p) self.updates.append(K.update(p, new_p)) return self.updates def get_config(self): config = {'lr': float(K.get_value(self.lr)), 'beta_1': float(K.get_value(self.beta_1)), 'beta_2': float(K.get_value(self.beta_2)), 'decay': float(K.get_value(self.decay)), 'epsilon': self.epsilon} base_config = super(AMSgrad, self).get_config() return dict(list(base_config.items()) + list(config.items())) def CapsuleNet(n_capsule = 10, n_routings = 5, capsule_dim = 16, n_recurrent=100, dropout_rate=0.2, l2_penalty=0.0001): K.clear_session() inputs = Input(shape=(170,)) x = Embedding(21099, 300, trainable=True)(inputs) x = SpatialDropout1D(dropout_rate)(x) x = Bidirectional( CuDNNGRU(n_recurrent, return_sequences=True, kernel_regularizer=l2(l2_penalty), recurrent_regularizer=l2(l2_penalty)))(x) x = PReLU()(x) x = Capsule( num_capsule=n_capsule, dim_capsule=capsule_dim, routings=n_routings, share_weights=True)(x) x = Flatten(name = 'concatenate')(x) x = Dropout(dropout_rate)(x) # fc = Dense(128, activation='sigmoid')(x) outputs = Dense(6, activation='softmax')(x) model = Model(inputs=inputs, outputs=outputs) model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy']) return model def CapsuleNet_v2(n_capsule = 10, n_routings = 5, capsule_dim = 16, n_recurrent=100, dropout_rate=0.2, l2_penalty=0.0001): K.clear_session() inputs = Input(shape=(200,)) x = Embedding(20000, 300, trainable=True)(inputs) x = SpatialDropout1D(dropout_rate)(x) x = Bidirectional( CuDNNGRU(n_recurrent, return_sequences=True, kernel_regularizer=l2(l2_penalty), recurrent_regularizer=l2(l2_penalty)))(x) x = PReLU()(x) x = Capsule( num_capsule=n_capsule, dim_capsule=capsule_dim, routings=n_routings, share_weights=True)(x) x = Flatten(name = 'concatenate')(x) x = Dropout(dropout_rate)(x) # fc = Dense(128, activation='sigmoid')(x) outputs = Dense(6, activation='softmax')(x) model = Model(inputs=inputs, outputs=outputs) model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy']) return model