Python keras.backend.softmax() Examples

The following code examples show how to use keras.backend.softmax(). They are taken from open source Python projects.
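
Before the project examples, here is a minimal standalone sketch (not taken from any of the projects listed below) of the basic call and of the `axis` argument that newer Keras versions accept; several examples below either use this argument or define their own softmax to work around its absence in older releases.

import numpy as np
from keras import backend as K

# Minimal standalone sketch; assumes a Keras version whose K.softmax accepts `axis`.
logits = K.constant(np.array([[1.0, 2.0, 3.0],
                              [1.0, 1.0, 1.0]]))
row_probs = K.softmax(logits)            # softmax over the last axis (default)
col_probs = K.softmax(logits, axis=0)    # softmax down each column, where supported
print(K.eval(row_probs))                 # each row sums to 1
print(K.eval(col_probs))                 # each column sums to 1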

Example 1
Project: keras-utility-layer-collection   Author: zimmerrol   File: attention.py   MIT License
def step(self, x, states):   
        h = states[0]
        # states[1] necessary?

        # equals K.dot(X, self._W1) + self._b2 with X.shape=[bs, T, input_dim]
        total_x_prod = states[-1]
        # comes from the constants (equals the input sequence)
        X = states[-2]
        
        # expand dims to add the vector which is only valid for this time step
        # to total_x_prod which is valid for all time steps
        hw = K.expand_dims(K.dot(h, self._W2), 1)
        additive_atn = total_x_prod + hw
        attention = K.softmax(K.dot(additive_atn, self._V), axis=1)
        x_weighted = K.sum(attention * X, [1])

        x = K.dot(K.concatenate([x, x_weighted], 1), self._W3) + self._b3
        
        h, new_states = self.layer.cell.call(x, states[:-2])
        
        return h, new_states 
Example 2
Project: keras-utility-layer-collection   Author: zimmerrol   File: attention.py   MIT License
def step(self, x, states):  
        h = states[0]
        # states[1] necessary?
        
        # comes from the constants
        X_static = states[-2]
        # equals K.dot(static_x, self._W1) + self._b2 with X.shape=[bs, L, static_input_dim]
        total_x_static_prod = states[-1]

        # expand dims to add the vector which is only valid for this time step
        # to total_x_prod which is valid for all time steps
        hw = K.expand_dims(K.dot(h, self._W2), 1)
        additive_atn = total_x_static_prod + hw
        attention = K.softmax(K.dot(additive_atn, self._V), axis=1)
        static_x_weighted = K.sum(attention * X_static, [1])
        
        x = K.dot(K.concatenate([x, static_x_weighted], 1), self._W3) + self._b3

        h, new_states = self.layer.cell.call(x, states[:-2])
        
        # append attention to the states to "smuggle" it out of the RNN wrapper
        attention = K.squeeze(attention, -1)
        h = K.concatenate([h, attention])

        return h, new_states 
Example 3
Project: lmtc-eurlex57k   Author: iliaschalkidis   File: attention.py   Apache License 2.0
def call(self, x, mask=None):

        a = dot_product(x, self.Wa)

        def label_wise_attention(values):
            doc_repi, ai = values
            ai = K.softmax(K.transpose(ai))
            label_aware_doc_rep = K.dot(ai, doc_repi)
            if self.return_attention:
                return [label_aware_doc_rep, ai]
            else:
                return [label_aware_doc_rep, label_aware_doc_rep]

        label_aware_doc_reprs, attention_scores = K.tf.map_fn(label_wise_attention, [x, a])

        # Compute label-scores
        label_aware_doc_reprs = K.sum(label_aware_doc_reprs * self.Wo, axis=-1) + self.bo
        label_aware_doc_reprs = K.sigmoid(label_aware_doc_reprs)

        if self.return_attention:
            return [label_aware_doc_reprs, attention_scores]

        return label_aware_doc_reprs 
Example 4
Project: spektral   Author: danielegrattarola   File: pooling.py   MIT License
def call(self, inputs):
        if self.data_mode == 'graph':
            X, I = inputs
            if K.ndim(I) == 2:
                I = I[:, 0]
        else:
            X = inputs
        attn_coeff = K.dot(X, self.attn_kernel)
        attn_coeff = K.squeeze(attn_coeff, -1)
        attn_coeff = K.softmax(attn_coeff)
        if self.data_mode == 'single':
            output = K.dot(attn_coeff[None, ...], X)
        elif self.data_mode == 'batch':
            output = K.batch_dot(attn_coeff, X)
        else:
            output = attn_coeff[:, None] * X
            output = tf.segment_sum(output, I)

        return output 
Example 5
Project: NTM-Keras   Author: SigmaQuan   File: memory.py   MIT License
def content_addressing(memory_t,  key_vector_t, key_strength_t):
    '''
    Focusing by content.
    :param memory_t: external memory.
    :param key_vector_t: key vector.
    :param key_strength_t: the strength of key.
    :return:
    '''
    # print("content addressing:")
    # print(">>memory_t")
    # print(key_vector_t)
    # print(">>key_vector_t")
    # print(key_vector_t)
    # print(">>key_strength_t")
    # print(key_strength_t)
    _weight_content_t = \
        key_strength_t * cosine_similarity_group(key_vector_t, memory_t)
    weight_content_t = softmax(_weight_content_t)
    # print("_weight_content_t")
    # print(_weight_content_t)
    return weight_content_t 
Example 6
Project: keras-CF-NADE   Author: AlexGidiotis   File: test_module.py   MIT License
def rating_cost_lambda_func(args):
	alpha=1.
	std=0.01
	"""
	"""
	pred_score,true_ratings,input_masks,output_masks,D,d = args
	pred_score_cum = K.cumsum(pred_score, axis=2)

	prob_item_ratings = K.softmax(pred_score_cum)
	accu_prob_1N = K.cumsum(prob_item_ratings, axis=2)
	accu_prob_N1 = K.cumsum(prob_item_ratings[:, :, ::-1], axis=2)[:, :, ::-1]
	mask1N = K.cumsum(true_ratings[:, :, ::-1], axis=2)[:, :, ::-1]
	maskN1 = K.cumsum(true_ratings, axis=2)
	cost_ordinal_1N = -K.sum((K.log(prob_item_ratings) - K.log(accu_prob_1N)) * mask1N, axis=2)
	cost_ordinal_N1 = -K.sum((K.log(prob_item_ratings) - K.log(accu_prob_N1)) * maskN1, axis=2)
	cost_ordinal = cost_ordinal_1N + cost_ordinal_N1
	nll_item_ratings = K.sum(-(true_ratings * K.log(prob_item_ratings)),axis=2)
	nll = std * K.sum(nll_item_ratings,axis=1) * 1.0 * D / (D - d + 1e-6) + alpha * K.sum(cost_ordinal,axis=1) * 1.0 * D / (D - d + 1e-6)
	cost = K.mean(nll)
	cost = K.expand_dims(cost, 0)


	return cost 
Example 7
Project: onnx-keras   Author: leodestiny   File: backend.py   MIT License
def handle_softmax(cls, node, input_dict):
        x = input_dict[node.inputs[0]]
        shape = K.int_shape(x)
        if "axis" in node.attrs.keys() \
                and (node.attrs['axis'] == -1 or node.attrs["axis"] == len(shape) - 1):
            return [Lambda(lambda a: K.softmax(a))(x)]
        if "axis" in node.attrs.keys():
            axis = node.attrs["axis"]
            axis = (axis if axis >= 0 else
                    len(shape) + axis)
        else:
            axis = 1
        if axis == 1:
            cal_shape = [np.prod(shape[1:])]
        else:
            cal_shape = (np.prod(shape[1:axis]), np.prod(shape[axis:]))
        x = keras.layers.Reshape(cal_shape)(x)
        x = Lambda(lambda _x: K.softmax(_x))(x)
        x = keras.layers.Reshape(shape[1:])(x)
        return [x] 
Example 8
Project: Keras-TextClassification   Author: yongzhuo   File: capsule.py   MIT License
def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        outputs = None
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs 
Example 9
Project: keras-transformer   Author: kpot   File: attention.py   MIT License
def mask_attention_if_needed(self, dot_product):
        """
        Makes sure that (when enabled) each position
        (of a decoder's self-attention) cannot attend to subsequent positions.
        This is achieved by assigning -inf (or some large negative number)
        to all invalid connections. Later softmax will turn them into zeros.
        We need this to guarantee that decoder's predictions are based
        on what has happened before the position, not after.
        The method does nothing if masking is turned off.
        :param dot_product: scaled dot-product of Q and K after reshaping them
        to 3D tensors (batch * num_heads, rows, cols)
        """
        if not self.use_masking:
            return dot_product
        last_dims = K.int_shape(dot_product)[-2:]
        low_triangle_ones = (
            np.tril(np.ones(last_dims))
            # to ensure proper broadcasting
            .reshape((1,) + last_dims))
        inverse_low_triangle = 1 - low_triangle_ones
        close_to_negative_inf = -1e9
        result = (
            K.constant(low_triangle_ones, dtype=K.floatx()) * dot_product +
            K.constant(close_to_negative_inf * inverse_low_triangle))
        return result 
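
For readers new to this trick, the short standalone NumPy sketch below (independent of the keras-transformer code above) shows how multiplying by the lower triangle and adding a large negative number to the rest drives the post-softmax weights for future positions to (almost) zero.

# Standalone NumPy illustration of causal masking before softmax;
# mirrors the idea above but is not part of keras-transformer.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)

scores = np.random.randn(4, 4)             # attention scores for 4 positions
low_triangle = np.tril(np.ones((4, 4)))    # ones on and below the diagonal
masked = scores * low_triangle + (1.0 - low_triangle) * -1e9
weights = softmax(masked, axis=-1)
print(np.round(np.triu(weights, k=1), 6))  # upper triangle is ~0: no attention to future positions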
Example 10
Project: stock-price-predict   Author: kaka-lin   File: seq2seq_attention_2.py   MIT License
def softmax(x, axis=1):
    """Softmax activation function.
    # Arguments
        x : Tensor.
        axis: Integer, axis along which the softmax normalization is applied.
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D') 
Example 11
Project: stock-price-predict   Author: kaka-lin   File: seq2seq_attention.py   MIT License
def softmax(x, axis=1):
    """Softmax activation function.
    # Arguments
        x : Tensor.
        axis: Integer, axis along which the softmax normalization is applied.
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D') 
Example 12
Project: deep-colorization   Author: tomasmikeska   File: self_attention.py   MIT License
def call(self, x):
        f = K.conv2d(x,
                     kernel=self.kernel_f,
                     strides=(1, 1), padding='same')  # [bs, h, w, c']
        g = K.conv2d(x,
                     kernel=self.kernel_g,
                     strides=(1, 1), padding='same')  # [bs, h, w, c']
        h = K.conv2d(x,
                     kernel=self.kernel_h,
                     strides=(1, 1), padding='same')  # [bs, h, w, c]

        s = K.batch_dot(_hw_flatten(g), K.permute_dimensions(_hw_flatten(f), (0, 2, 1)))  # # [bs, N, N]

        beta = K.softmax(s, axis=-1)  # Attention map

        o = K.batch_dot(beta, _hw_flatten(h))  # [bs, N, C]

        o = K.reshape(o, shape=K.shape(x))  # [bs, h, w, C]
        x = self.gamma * o + x

        return x 
Example 13
Project: experiments   Author: Octavian-ai   File: adjacency_layer.py   MIT License
def call_dot_softmax(self, x):
		pr = self.product
		pe = self.person

		pr = K.softmax(self.product)
		pe = K.softmax(self.person)

		m = K.dot(pr, K.transpose(pe))
		m = (self.w3 * m) + self.b3
		m = K.relu(m, alpha=0.1)

		m = m * x

		return m

	# 100pc test accuracy 
Example 14
Project: experiments   Author: Octavian-ai   File: adjacency_layer.py   MIT License
def call_dense_conv(self, x):
		self.jitter(idx=[0,1])

		pr = self.product
		pe = self.person

		pr = K.softmax(pr)
		pe = K.softmax(pe)

		all_pairs = self.cartesian_product_matrix(pr, pe)

		flat = K.reshape(all_pairs, (self.product_count * self.person_count * self.style_width, 2))
		m = K.dot(flat, self.wc1)
		m = K.tanh(m)

		m = K.reshape(m, (self.product_count * self.person_count, self.style_width))
		m = K.dot(m, self.w2)
		m = K.relu(m, alpha=0.1)

		m = K.reshape(m, (1, self.product_count, self.person_count))
		masked = m * x
		return masked 
Example 15
Project: ntm_keras   Author: flomlo   File: ntm.py   BSD 3-Clause "New" or "Revised" License
def _get_weight_vector(self, M, w_tm1, k, beta, g, s, gamma):
#        M = tf.Print(M, [M, w_tm1, k], message='get weights beg1: ')
#        M = tf.Print(M, [beta, g, s, gamma], message='get weights beg2: ')
        # Content adressing, see Chapter 3.3.1:
        num = beta * _cosine_distance(M, k)
        w_c  = K.softmax(num) # It turns out that equation (5) is just softmax.
        # Location adressing, see Chapter 3.3.2:
        # Equation 7:
        w_g = (g * w_c) + (1-g)*w_tm1
        # C_s is the circular convolution
        #C_w = K.sum((self.C[None, :, :, :] * w_g[:, None, None, :]),axis=3)
        # Equation 8:
        # TODO: Explain
        C_s = K.sum(K.repeat_elements(self.C[None, :, :, :], self.batch_size, axis=0) * s[:,:,None,None], axis=1)
        w_tilda = K.batch_dot(C_s, w_g)
        # Equation 9:
        w_out = _renorm(w_tilda ** gamma)

        return w_out 
Example 16
Project: Quora   Author: KevinLiao159   File: neural_networks.py   MIT License
def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))    # noqa
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]  # noqa

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]  # noqa
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]    # noqa
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))    # noqa
            if i < self.routings - 1:
                b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])
        return outputs 
Example 17
Project: Quora   Author: KevinLiao159   File: submission_v50.py   MIT License
def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))    # noqa
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]  # noqa

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]  # noqa
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]    # noqa
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))    # noqa
            if i < self.routings - 1:
                b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])
        return outputs 
Example 18
Project: CDRextraction   Author: Xls1994   File: customize_layer.py   Apache License 2.0
def call(self, inputs,mask=None):
        aspect =inputs[0]
        memory =inputs[1]
        print(K.int_shape(aspect))
        aspect =K.reshape(aspect,(-1,K.int_shape(aspect)[2]))
        # (-1,100) ->(-1,n,100)
        vaspect =K.repeat(aspect,K.int_shape(memory)[1])
        print(K.int_shape(aspect))
        print(' vaspect', K.int_shape(vaspect))
        print(' memory', K.int_shape(memory))
        x =concatenate(inputs=[memory,vaspect],axis=-1)
        print('x...shape', K.int_shape(x))
        gi =K.tanh(K.dot(x,self.W)+self.b)  #32 *6 *1
        gi =K.sum(gi,axis=-1)   # 32 *6
        alfa =K.softmax(gi)
        self.alfa =alfa
        output =K.sum(memory*K.expand_dims(alfa,axis=-1),axis=1) #sum(32 *6 *310)
        print('output..shape', K.int_shape(output))
        return output 
Example 19
Project: nlp_toolkit   Author: stevewyl   File: multi_dim_attention.py   MIT License
def call(self, x, mask=None):
        uit = K.tanh(K.dot(x, self.Ws1))
        ait = K.dot(uit, self.Ws2)
        ait = K.permute_dimensions(ait, (0, 2, 1))
        A = K.softmax(ait, axis=1)
        M = K.batch_dot(A, x)
        if self.punish:
            A_T = K.permute_dimensions(A, (0, 2, 1))
            tile_eye = K.tile(K.eye(self.weight_ws2), [self.batch_size, 1])
            tile_eye = K.reshape(
                tile_eye, shape=[-1, self.weight_ws2, self.weight_ws2])
            AA_T = K.batch_dot(A, A_T) - tile_eye
            P = K.l2_normalize(AA_T, axis=(1, 2))
            return M, P
        else:
            return M 
Example 20
Project: nlp_toolkit   Author: stevewyl   File: self_attention.py   MIT License
def Mask(self, inputs, seq_len, mode='mul'):
        """
        # Arguments:
            inputs: input tensor with shape (batch_size, seq_len, input_size)
            seq_len: Each sequence's actual length with shape (batch_size,)
            mode:
                mul: mask the rest dim with zero, used before fully-connected layer
                add: subtract a big constant from the rest, used before softmax layer
        # Returns:
            Masked tensors with the same shape of input tensor
        """
        if seq_len is None:
            return inputs
        else:
            mask = K.one_hot(seq_len[:, 0], K.shape(inputs)[1])
            mask = 1 - K.cumsum(mask, 1)
            for _ in range(len(inputs.shape) - 2):
                mask = K.expand_dims(mask, 2)
            if mode == 'mul':
                return inputs * mask
            if mode == 'add':
                return inputs - (1 - mask) * 1e12 
Example 21
Project: knowledge_distillation   Author: wmpauli   File: kd_squeezenet.py   MIT License
def knowledge_distillation_loss(y_true, y_pred, temperature):    
    
    # split in 
    #    true targets
    #    logits from xception
    y_true, logits = y_true[:, :256], y_true[:, 256:]
    
    # convert logits to soft targets
    y_soft = K.softmax(logits/temperature)
    
    # split in 
    #    usual output probabilities
    #    probabilities made softer with temperature
    y_pred, y_pred_soft = y_pred[:, :256], y_pred[:, 256:]    
    
    return K.in_train_phase(logloss(y_soft, y_pred_soft), logloss(y_true, y_pred))
    

# # For testing use usual output probabilities (without temperature) 
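
Independently of the project above, the effect of the temperature can be seen in a few lines of standalone NumPy: dividing the logits by a temperature larger than 1 before the softmax produces softer (higher-entropy) targets.

# Standalone sketch of temperature-softened targets; not from the project above.
import numpy as np

def softmax(x):
    e = np.exp(x - np.max(x))
    return e / np.sum(e)

logits = np.array([4.0, 1.0, 0.0])
print(softmax(logits))          # peaked distribution, roughly [0.94, 0.05, 0.02]
print(softmax(logits / 5.0))    # temperature 5: noticeably flatter soft targets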
Example 22
Project: knowledge_distillation   Author: wmpauli   File: kd_squeezenet.py   MIT License
def knowledge_distillation_loss(y_true, y_pred, lambda_const, temperature):    
    
    # split in 
    #    onehot hard true targets
    #    logits from xception
    y_true, logits = y_true[:, :256], y_true[:, 256:]
    
    # convert logits to soft targets
    y_soft = K.softmax(logits/temperature)
    
    # split in 
    #    usual output probabilities
    #    probabilities made softer with temperature
    y_pred, y_pred_soft = y_pred[:, :256], y_pred[:, 256:]    
    
    return K.in_train_phase(logloss(y_true, y_pred), logloss(y_soft, y_pred_soft))
    # return lambda_const*logloss(y_true, y_pred) + logloss(y_soft, y_pred_soft)
    # return logloss(y_soft, y_pred_soft)


# # For testing use usual output probabilities (without temperature)

# In[13]: 
Example 23
Project: tying-wv-and-wc   Author: icoxfog417   File: augmented_model.py   MIT License
def __init__(self, 
        vocab_size, 
        sequence_size,
        setting=None,
        checkpoint_path="",
        temperature=10,
        tying=False):

        super().__init__(vocab_size, sequence_size, setting, checkpoint_path)
        self.temperature = temperature
        self.tying = tying
        self.gamma = self.setting.gamma

        if tying:
            self.model.pop()  # remove activation
            self.model.pop()  # remove projection (use self embedding)
            self.model.add(Lambda(lambda x: K.dot(x, K.transpose(self.embedding.embeddings))))
            self.model.add(Activation("softmax")) 
Example 24
Project: tying-wv-and-wc   Author: icoxfog417   File: augmented_model.py   MIT License
def augmented_loss(self, y_true, y_pred):
        _y_pred = Activation("softmax")(y_pred)
        loss = K.categorical_crossentropy(_y_pred, y_true)

        # y is (batch x seq x vocab)
        y_indexes = K.argmax(y_true, axis=2)  # turn one hot to index. (batch x seq)
        y_vectors = self.embedding(y_indexes)  # lookup the vector (batch x seq x vector_length)

        #v_length = self.setting.vector_length
        #y_vectors = K.reshape(y_vectors, (-1, v_length))
        #y_t = K.map_fn(lambda v: K.dot(self.embedding.embeddings, K.reshape(v, (-1, 1))), y_vectors)
        #y_t = K.squeeze(y_t, axis=2)  # unknown but necessary operation
        #y_t = K.reshape(y_t, (-1, self.sequence_size, self.vocab_size))

        # vector x embedding dot products (batch x seq x vocab)
        y_t = tf.tensordot(y_vectors, K.transpose(self.embedding.embeddings), 1)
        y_t = K.reshape(y_t, (-1, self.sequence_size, self.vocab_size))  # explicitly set shape
        y_t = K.softmax(y_t / self.temperature)
        _y_pred_t = Activation("softmax")(y_pred / self.temperature)
        aug_loss = kullback_leibler_divergence(y_t, _y_pred_t)
        loss += (self.gamma * self.temperature) * aug_loss
        return loss 
Example 25
Project: VisualNN   Author: angelhunt   File: cifar10_cnn_capsule.py   GNU General Public License v3.0
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    scale = K.sqrt(s_squared_norm) / (0.5 + s_squared_norm)
    return scale * x


# define our own softmax function instead of K.softmax
# because K.softmax can not specify axis. 
Example 26
Project: VisualNN   Author: angelhunt   File: cifar10_cnn_capsule.py   GNU General Public License v3.0
def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex / K.sum(ex, axis=axis, keepdims=True)


# define the margin loss like hinge loss 
Example 27
Project: VisualNN   Author: angelhunt   File: cifar10_cnn_capsule.py   GNU General Public License v3.0
def call(self, inputs):
        """Following the routing algorithm from Hinton's paper,
        but replace b = b + <u,v> with b = <u,v>.

        This change can improve the feature representation of Capsule.

        However, you can replace
            b = K.batch_dot(outputs, hat_inputs, [2, 3])
        with
            b += K.batch_dot(outputs, hat_inputs, [2, 3])
        to realize a standard routing.
        """

        if self.share_weights:
            hat_inputs = K.conv1d(inputs, self.kernel)
        else:
            hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])

        batch_size = K.shape(inputs)[0]
        input_num_capsule = K.shape(inputs)[1]
        hat_inputs = K.reshape(hat_inputs,
                               (batch_size, input_num_capsule,
                                self.num_capsule, self.dim_capsule))
        hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))

        b = K.zeros_like(hat_inputs[:, :, :, 0])
        for i in range(self.routings):
            c = softmax(b, 1)
            o = self.activation(K.batch_dot(c, hat_inputs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(o, hat_inputs, [2, 3])
                if K.backend() == 'theano':
                    o = K.sum(o, axis=1)

        return o 
Example 28
Project: keras-utility-layer-collection   Author: zimmerrol   File: attention.py   MIT License
def call(self, x, mask=None):
        q, k, v = x
        d_k = q.shape.as_list()[2]

        # in pure tensorflow:
        # weights = tf.matmul(x_batch, tf.transpose(y_batch, perm=[0, 2, 1]))
        # normalized_weights = tf.nn.softmax(weights/scaling)
        # output = tf.matmul(normalized_weights, x_batch)
        
        weights = K.batch_dot(q,  k, axes=[2, 2])

        if mask is not None:
            # add mask weights
            if isinstance(mask, (list, tuple)):
                if len(mask) != 1:
                    raise ValueError("mask can only be a Tensor or a list of length 1 containing a tensor.")

                mask = mask[0]

            weights += -1e10*(1-mask)

        normalized_weights = K.softmax(weights / np.sqrt(d_k))
        output = K.batch_dot(normalized_weights, v)
        
        if self._return_attention:
            return [output, normalized_weights]
        else:
            return output 
Example 29
Project: keras-utility-layer-collection   Author: zimmerrol   File: attention.py   MIT License
def call(self, x):
        source, query = x
        
        similarity = self._similarity(source, query)
        expected_similarity_shape = [source.shape.as_list()[0], source.shape.as_list()[1], source.shape.as_list()[1]]
       
        if similarity.shape.as_list() != expected_similarity_shape:
            raise RuntimeError("The similarity function has returned a similarity with shape {0}, but expected {1}".format(similarity.shape.as_list()[:2], expected_similarity_shape))
        
        score = K.softmax(similarity)
        output = K.batch_dot(score, source, axes=[1, 1])
        
        return output 
Example 30
Project: blackbox-attacks   Author: sunblaze-ucb   File: tf_utils.py   MIT License
def tf_test_error_rate(logits, x, X_test, y_test):
    """
    Compute test error.
    """
    assert len(X_test) == len(y_test)

    # Predictions for the test set
    eval_prediction = K.softmax(logits)

    predictions = batch_eval([x], [eval_prediction], [X_test])[0]

    return error_rate(predictions, y_test) 
Example 31
Project: blackbox-attacks   Author: sunblaze-ucb   File: tf_utils.py   MIT License
def tf_test_error_rate(model, x, X_test, y_test):
    """
    Compute test error.
    """
    assert len(X_test) == len(y_test)

    # Predictions for the test set
    eval_prediction = K.softmax(model(x))

    predictions = batch_eval([x], [eval_prediction], [X_test])[0]

    return error_rate(predictions, y_test) 
Example 32
Project: lmtc-eurlex57k   Author: iliaschalkidis   File: attention.py   Apache License 2.0
def call(self, x, mask=None):
        # Unfold inputs (document representations, label representations)
        doc_reps, label_reps = x

        doc2_reps = K.tanh(dot_product(doc_reps, self.W_d) + self.b_d)

        # Compute Attention Scores
        doc_a = dot_product(doc2_reps, label_reps)

        def label_wise_attention(values):
            doc_repi, ai = values
            ai = K.softmax(K.transpose(ai))
            label_aware_doc_rep = K.dot(ai, doc_repi)
            if self.return_attention:
                return [label_aware_doc_rep, ai]
            else:
                return [label_aware_doc_rep, label_aware_doc_rep]

        label_aware_doc_reprs, attention_scores = K.tf.map_fn(label_wise_attention, [doc_reps, doc_a])

        label_aware_doc_reprs = K.sum(label_aware_doc_reprs * label_reps, axis=-1)
        label_aware_doc_reprs = K.sigmoid(label_aware_doc_reprs)

        if self.return_attention:
            return [label_aware_doc_reprs, attention_scores]

        return label_aware_doc_reprs 
Example 33
Project: Document-Classifier-LSTM   Author: AlexGidiotis   File: attention.py   MIT License
def call(self, x):
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.softmax(ait)
        a = K.expand_dims(a)
        weighted_input = x * a

        return K.sum(weighted_input, axis=1) 
Example 34
Project: AI2-Reasoning-Challenge-ARC   Author: SebiSebi   File: keras_custom_layers.py   GNU General Public License v3.0
def call(self, inputs):
        if not isinstance(inputs, list):
            raise ValueError('Linked Attention layer expects a list '
                             'of tensors as inputs.')
        if len(inputs) != 2:
            raise ValueError('Linked Attention layer expects two tensors as '
                             'input, {} were given'.format(len(inputs)))
        input_states = inputs[0]
        last_state = inputs[1]

        # Each LSTM state is a row vector in "input_states".
        # Apply a linear transformation to each hidden state.
        # The same transformation to all states.
        # hs.shape = (batch_size, timestamps, self.dim)
        hs = K.dot(input_states, self.Wy)

        # Apply a linear function to last_state and expand
        # it to each row vector.
        # aux3.shape = (batch_size, timestamps, size_LSTM_2)
        # aux4.shape = (batch_size, timestamps, self.dim)
        aux1 = K.expand_dims(last_state, -1)
        aux2 = K.dot(aux1, K.ones(shape=(1, self.num_timestamps)))
        aux3 = K.permute_dimensions(aux2, (0, 2, 1))
        aux4 = K.dot(aux3, self.Wh)
        assert(aux3.shape[1] == hs.shape[1])
        assert(aux3.shape[2] == last_state.shape[1])
        assert(aux4.shape[1] == hs.shape[1])
        assert(aux4.shape[2] == hs.shape[2])
        assert(aux4.shape[2] == self.dim)

        m = K.relu(hs + aux4)
        alpha = K.expand_dims(K.softmax(K.squeeze(K.dot(m, self.w), -1)), 1)

        # r.shape = (batch_size, 1, size_LSTM_1)
        r = K.batch_dot(alpha, input_states)

        output_1 = K.dot(r, self.Wp)
        output_2 = K.dot(K.expand_dims(last_state, 1), self.Wx)
        output_3 = K.squeeze(output_1, 1) + K.squeeze(output_2, 1)
        return K.relu(output_3) 
Example 35
Project: gccaps   Author: tqbl   File: capsules.py   MIT License
def call(self, inputs, training=None):
        """Apply transformation followed by capsule routing."""
        # Create dimension for output capsules and tile along this dim
        # (None, *n_capsules*, n_input_capsules, dim_input_capsules)
        inputs_tiled = K.tile(K.expand_dims(inputs, 1),
                              [1, self.n_capsules, 1, 1])

        # Apply linear transformation to compute prediction vectors
        inputs_hat = K.map_fn(lambda x: K.batch_dot(x, self.W, [2, 3]),
                              elems=inputs_tiled)
        # Add bias to prediction vectors if specified
        if self.use_bias:
            inputs_hat = K.bias_add(inputs_hat, self.bias,
                                    data_format='channels_first')

        # Initialize logit variables to zero
        b = K.zeros(shape=[K.shape(inputs_hat)[0],
                           self.n_capsules,
                           self.n_input_capsules])

        # Apply routing algorithm
        for i in range(self.routings):
            # Compute coupling coefficients
            c = K.softmax(b, axis=1)
            # Apply squashing function
            outputs = squash(K.batch_dot(c, inputs_hat, [2, 2]))
            # Update logits by computing agreement
            if i < self.routings - 1:
                b += K.batch_dot(outputs, inputs_hat, [2, 3])

        return outputs 
Example 36
Project: dockerizeme   Author: dockerizeme   File: snippet.py   Apache License 2.0
def step(self, x, states):
        # This is based on [tensorflows implementation](https://github.com/tensorflow/tensorflow/blob/c8a45a8e236776bed1d14fd71f3b6755bd63cc58/tensorflow/python/ops/seq2seq.py#L506).
        # First, we calculate new attention masks:
        #   attn = softmax(V^T * tanh(W2 * X +b2 + W1 * h))
        # and we make the input as a concatenation of the input and weighted inputs which is then
        # transformed back to the shape x of using W3
        #   x = W3*(x+X*attn)+b3
        # Then, we run the cell on a combination of the input and previous attention masks:
        #   h, state = cell(x, h).
        
        nb_samples, nb_time, input_dim = self.input_spec[0].shape
        h = states[0]
        X = states[-1]
        xW1 = states[-2]
        
        Xr = K.reshape(X,(-1,nb_time,1,input_dim))
        hW2 = K.dot(h,self.W2)+self.b2
        hW2 = K.reshape(hW2,(-1,1,1,input_dim)) 
        u = K.tanh(xW1+hW2)
        a = K.sum(self.V*u,[2,3])
        a = K.softmax(a)
        a = K.reshape(a,(-1, nb_time, 1, 1))
        
        # Weight attention vector by attention
        Xa = K.sum(a*Xr,[1,2])
        Xa = K.reshape(Xa,(-1,input_dim))
        
        # Merge input and attention weighted inputs into one vector of the right size.
        x = K.dot(K.concatenate([x,Xa],1),self.W3)+self.b3    
        
        h, new_states = self.layer.step(x, states)
        return h, new_states 
Example 37
Project: BERT   Author: yyht   File: funcs.py   Apache License 2.0
def scaled_dot_product_attention_tf(q, k, v, attn_mask, attention_dropout: float):
    w = K.batch_dot(q, k)  # w is B, H, L, L
    w = w / K.sqrt(K.cast(shape_list(v)[-1], K.floatx()))
    if attn_mask is not None:
        w = attn_mask * w + (1.0 - attn_mask) * -1e9
    w = K.softmax(w)
    w = Dropout(attention_dropout)(w)
    return K.batch_dot(w, v)  # it is B, H, L, C//H [like v] 
Example 38
Project: squeezedet-keras   Author: omni-us   File: utils.py   MIT License
def softmax(x, axis=-1):
    """Compute softmax values for each sets of scores in x."""

    e_x = np.exp(x - np.max(x))
    return e_x / np.expand_dims(np.sum(e_x,axis=axis), axis=axis) 
Example 39
Project: Coloring-greyscale-images   Author: emilwallner   File: attention.py   MIT License
def call(self, x):
        def hw_flatten(x):
            return K.reshape(x, shape=[K.shape(x)[0], K.shape(x)[1]*K.shape(x)[2], K.shape(x)[-1]])

        f = K.conv2d(x,
                     kernel=self.kernel_f,
                     strides=(1, 1), padding='same')  # [bs, h, w, c']
        f = K.bias_add(f, self.bias_f)
        g = K.conv2d(x,
                     kernel=self.kernel_g,
                     strides=(1, 1), padding='same')  # [bs, h, w, c']
        g = K.bias_add(g, self.bias_g)
        h = K.conv2d(x,
                     kernel=self.kernel_h,
                     strides=(1, 1), padding='same')  # [bs, h, w, c]
        h = K.bias_add(h, self.bias_h)

        s = tf.matmul(hw_flatten(g), hw_flatten(f), transpose_b=True)  # # [bs, N, N]

        beta = K.softmax(s, axis=-1)  # attention map

        o = K.batch_dot(beta, hw_flatten(h))  # [bs, N, C]

        o = K.reshape(o, shape=K.shape(x))  # [bs, h, w, C]
        x = self.gamma * o + x

        return x 
Example 40
Project: NTM-Keras   Author: SigmaQuan   File: memory.py   MIT License
def softmax(x):
    # print("x")
    # print(x)
    _softmax = K.softmax(x)
    # print("softmax(x)")
    # print(_softmax)
    return _softmax 
Example 41
Project: Keras-MDN   Author: omimo   File: mdn.py   MIT License
def __init__(self, output_dim, num_mix, kernel='unigaussian', **kwargs):
        self.output_dim = output_dim
        self.kernel = kernel
        self.num_mix = num_mix
        
        with tf.name_scope('MDNLayer'):
            # self.inputs      = Input(shape=(input_dim,), dtype='float32', name='msn_input')
            self.mdn_mus     = Dense(self.num_mix * self.output_dim, name='mdn_mus')#(self.inputs)
            self.mdn_sigmas  = Dense(self.num_mix, activation=K.exp, name='mdn_sigmas')#(self.inputs)
            self.mdn_pi      = Dense(self.num_mix, activation=K.softmax, name='mdn_pi')#(self.inputs)
            # self.mdn_out     = merge([self.mdn_mus, self.mdn_sigmas, self.mdn_pi], mode='concat', name='mdn_out')

        super(MDN, self).__init__(**kwargs) 
Example 42
Project: RPGOne   Author: RTHMaK   File: masked_operations.py   Apache License 2.0
def masked_softmax(vector, mask):
    """
    `K.softmax(vector)` does not work if some elements of `vector` should be masked.  This performs
    a softmax on just the non-masked portions of `vector` (passing None in for the mask is also
    acceptable; you'll just get a regular softmax).

    We assume that both `vector` and `mask` (if given) have shape (batch_size, vector_dim).

    In the case that the input vector is completely masked, this function returns an array
    of ``0.0``. This behavior may cause ``NaN`` if this is used as the last layer of a model
    that uses categorical cross-entropy loss.
    """
    # We calculate masked softmax in a numerically stable fashion, as done
    # in https://github.com/rkadlec/asreader/blob/master/asreader/custombricks/softmax_mask_bricks.py
    if mask is not None:
        # Here we get normalized log probabilities for
        # enhanced numerical stability.
        mask = K.cast(mask, "float32")
        input_masked = mask * vector
        shifted = mask * (input_masked - K.max(input_masked, axis=1,
                                               keepdims=True))
        # We add epsilon to avoid numerical instability when
        # the sum in the log yields 0.
        normalization_constant = K.log(K.sum(mask * K.exp(shifted), axis=1,
                                             keepdims=True) + K.epsilon())
        normalized_log_probabilities = mask * (shifted - normalization_constant)
        unmasked_probabilities = K.exp(normalized_log_probabilities)
        return switch(mask, unmasked_probabilities, K.zeros_like(unmasked_probabilities))
    else:
        # There is no mask, so we use the provided ``K.softmax`` function.
        return K.softmax(vector) 
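
As a quick standalone check of the behaviour described in the docstring (this sketch does not use the RPGOne helpers themselves), masked positions get probability zero, the unmasked positions still form a proper softmax, and a fully masked row collapses to all zeros.

# Standalone NumPy illustration of the masked-softmax idea above;
# not the project's own implementation.
import numpy as np

def masked_softmax_np(vector, mask):
    # Shift by the max of the mask-zeroed scores for numerical stability,
    # zero out masked entries, and renormalise; fully masked rows return 0.
    shifted = mask * (vector - np.max(vector * mask, axis=1, keepdims=True))
    exp = mask * np.exp(shifted)
    norm = np.sum(exp, axis=1, keepdims=True)
    return np.where(norm > 0, exp / np.maximum(norm, 1e-13), 0.0)

v = np.array([[1.0, 2.0, 3.0],
              [1.0, 2.0, 3.0]])
m = np.array([[1.0, 1.0, 0.0],   # last position masked out
              [0.0, 0.0, 0.0]])  # completely masked row
print(masked_softmax_np(v, m))   # row 0 ~ [0.27, 0.73, 0.0]; row 1 is all zeros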
Example 43
Project: RPGOne   Author: RTHMaK   File: word_alignment.py   Apache License 2.0
def _align(source_embedding, target_embedding, source_mask, target_mask, normalize_alignment=True):
        '''
        Takes source and target sequence embeddings and returns a source-to-target alignment weights.
        That is, for each word in the source sentence, returns a probability distribution over target_sequence
        that shows how well each target word aligns (i.e. is similar) to it.

        source_embedding: (batch_size, source_length, embed_dim)
        target_embedding: (batch_size, target_length, embed_dim)
        source_mask: None or (batch_size, source_length, 1)
        target_mask: None or (batch_size, target_length, 1)
        normalize_alignment (bool): Will apply a (masked) softmax over alignments if True.

        Returns:
        s2t_attention: (batch_size, source_length, target_length)
        '''
        source_dot_target = masked_batch_dot(source_embedding, target_embedding, source_mask, target_mask)
        if normalize_alignment:
            alignment_shape = K.shape(source_dot_target)
            flattened_products_with_source = last_dim_flatten(source_dot_target)
            if source_mask is None and target_mask is None:
                flattened_s2t_attention = K.softmax(flattened_products_with_source)
            elif source_mask is not None and target_mask is not None:
                float_source_mask = K.cast(source_mask, 'float32')
                float_target_mask = K.cast(target_mask, 'float32')
                # (batch_size, source_length, target_length)
                s2t_mask = K.expand_dims(float_source_mask, axis=-1) * K.expand_dims(float_target_mask, axis=1)
                flattened_s2t_mask = last_dim_flatten(s2t_mask)
                flattened_s2t_attention = masked_softmax(flattened_products_with_source, flattened_s2t_mask)
            else:
                # One of the two inputs is masked, and the other isn't. How did this happen??
                raise NotImplementedError('Cannot handle only one of the inputs being masked.')
            # (batch_size, source_length, target_length)
            s2t_attention = K.reshape(flattened_s2t_attention, alignment_shape)
            return s2t_attention
        else:
            return source_dot_target 
Example 44
Project: RPGOne   Author: RTHMaK   File: knowledge_backed_lstm.py   Apache License 2.0
def step(self, inputs, states):
        # While the actual input to the layer is of
        # shape (batch_size, time, knowledge_length, token_dim+knowledge_dim), inputs in this function
        # is of shape (batch_size, knowledge_length, token_dim+knowledge_dim) as Keras iterates over
        # the time dimension and calls this function once per timestep.

        # TODO(matt): this variable is not used.  Should we be using the previous hidden state in
        # here?
        h_tm1 = states[0]  # Output from previous (t-1) timestep; pylint: disable=unused-variable
        token_t = inputs[:, 0, :self.token_dim]  # Current token (batch_size, token_dim)

        # Repeated along knowledge_len (batch_size, knowledge_len, token_dim)
        tiled_token_t = inputs[:, :, :self.token_dim]
        knowledge_t = inputs[:, :, self.token_dim:]  # Current knowledge (batch_size, knowledge_len, knowledge_dim)

        # TODO(pradeep): Try out other kinds of interactions between knowledge and tokens.
        # Candidates: dot product, difference, element wise product, inner product ..
        projected_combination = self.attention_activation(
                K.dot(knowledge_t, self.knowledge_projector) +
                K.dot(tiled_token_t, self.token_projector)) # (batch_size, knowledge_len, proj_dim)
        # Shape: (batch_size, knowledge_len)
        attention_scores = K.softmax(K.dot(projected_combination, self.attention_scorer))

        # Add a dimension at the end for attention scores to make the number of
        # dimensions the same as that of knowledge_t, multiply and compute sum along knowledge_len to
        # get a weighted average of all pieces of background information.
        # Shape: (batch_size, knowledge_dim)
        attended_knowledge = K.sum(knowledge_t * K.expand_dims(attention_scores, -1), axis=1)
        lstm_input_t = K.concatenate([token_t, attended_knowledge])  # (batch_size, tok_dim+knowledge_dim)

        # Now pass the concatenated input to LSTM's step function like nothing ever happened.
        return super(KnowledgeBackedLSTM, self).step(lstm_input_t, states) 
Example 45
Project: keras-CF-NADE   Author: AlexGidiotis   File: test_module.py   MIT License
def prediction_layer(x):
	# x.shape = (?,6040,5)
	x_cumsum = K.cumsum(x, axis=2)
	# x_cumsum.shape = (?,6040,5)
	
	output = K.softmax(x_cumsum)
	# output = (?,6040,5)
	return output 
Example 46
Project: deeplogic   Author: nuric   File: ilp.py   BSD 3-Clause "New" or "Revised" License
def call(self, inputs):
    """Return the rule weights."""
    h = K.constant(self.goal) # (1, 1, pred_len, embed_size)
    b = K.softmax(self.body)
    r = K.concatenate([h, b], axis=1)
    r = K.expand_dims(r, axis=0)
    r = K.tile(r, [K.shape(inputs)[0], 1, 1, 1, 1])
    return r 
Example 47
Project: gcnet_stereo   Author: hmarechal   File: gcnet_builder.py   MIT License
def _soft_arg_min(input_, dmax):
    """softargmin = Sum(d x softmax(-cost[d])) for d=0,...,D
    input_ has shape [DxHxWx1]"""
    x = K.squeeze(input_, axis=1)
    x = K.permute_dimensions(x, (0, 2, 3, 1))
    softmax = K.softmax(x)
    softmax = K.permute_dimensions(softmax, (0, 1, 3, 2))
    disparities = K.expand_dims(K.arange(dmax, dtype='float32'), axis=0)
    output = K.dot(disparities, softmax)
    return K.squeeze(output, axis=0) 
Example 48
Project: pycorrector   Author: shibing624   File: seq2seq_attn_model.py   Apache License 2.0
def call(self, inputs):
        q, v, v_mask = inputs
        k = v
        mv = K.max(v - (1. - v_mask) * 1e10, axis=1, keepdims=True)  # maxpooling1d
        mv = mv + K.zeros_like(q[:, :, :1])  # broadcast mv to every timestep of q
        # the next few lines implement a simple multiplicative attention
        qw = K.dot(q, self.kernel)
        a = K.batch_dot(qw, k, [2, 2]) / 10.
        a -= (1. - K.permute_dimensions(v_mask, [0, 2, 1])) * 1e10
        a = K.softmax(a)
        o = K.batch_dot(a, v, [2, 1])
        # concatenate the intermediate results
        return K.concatenate([o, q, mv], 2) 
Example 49
Project: BERT-keras   Author: Separius   File: funcs.py   GNU General Public License v3.0
def scaled_dot_product_attention_tf(q, k, v, attn_mask, attention_dropout: float, neg_inf: float):
    w = K.batch_dot(q, k)  # w is B, H, L, L
    w = w / K.sqrt(K.cast(shape_list(v)[-1], K.floatx()))
    if attn_mask is not None:
        w = attn_mask * w + (1.0 - attn_mask) * neg_inf
    w = K.softmax(w)
    w = Dropout(attention_dropout)(w)
    return K.batch_dot(w, v)  # it is B, H, L, C//H [like v] 
Example 50
Project: dynamic_memory_networks_with_keras   Author: vchudinov   File: episodic_memory_module.py   GNU General Public License v3.0
def compute_attention_gate(self, fact, question, memory):
        """Computes an attention score over a single fact vector,
        question and memory
        """
        f_i = [ fact * question,
                fact * memory,
                K.abs(
                    fact - question),
                K.abs(
                    fact - memory),
                ]

        g_t_i = K.tanh(K.dot(K.concatenate(f_i, axis=1), self.l_1) + self.bias_l1)
        g_t_i = K.softmax(K.dot(g_t_i, self.l_2) + self.bias_l2)
        return g_t_i 
Example 51
Project: applications   Author: geomstats   File: cifar10_cnn_capsule.py   MIT License
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    scale = K.sqrt(s_squared_norm) / (0.5 + s_squared_norm)
    return scale * x


# define our own softmax function instead of K.softmax
# because K.softmax can not specify axis. 
Example 52
Project: applications   Author: geomstats   File: cifar10_cnn_capsule.py   MIT License
def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex / K.sum(ex, axis=axis, keepdims=True)


# define the margin loss like hinge loss 
Example 53
Project: applications   Author: geomstats   File: cifar10_cnn_capsule.py   MIT License
def call(self, inputs):
        """Following the routing algorithm from Hinton's paper,
        but replace b = b + <u,v> with b = <u,v>.

        This change can improve the feature representation of Capsule.

        However, you can replace
            b = K.batch_dot(outputs, hat_inputs, [2, 3])
        with
            b += K.batch_dot(outputs, hat_inputs, [2, 3])
        to realize a standard routing.
        """

        if self.share_weights:
            hat_inputs = K.conv1d(inputs, self.kernel)
        else:
            hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])

        batch_size = K.shape(inputs)[0]
        input_num_capsule = K.shape(inputs)[1]
        hat_inputs = K.reshape(hat_inputs,
                               (batch_size, input_num_capsule,
                                self.num_capsule, self.dim_capsule))
        hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))

        b = K.zeros_like(hat_inputs[:, :, :, 0])
        for i in range(self.routings):
            c = softmax(b, 1)
            o = self.activation(K.batch_dot(c, hat_inputs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(o, hat_inputs, [2, 3])
                if K.backend() == 'theano':
                    o = K.sum(o, axis=1)

        return o 
Example 54
Project: Keras-TextClassification   Author: yongzhuo   File: attention_self.py   MIT License
def call(self, x):
        WQ = K.dot(x, self.kernel[0])
        WK = K.dot(x, self.kernel[1])
        WV = K.dot(x, self.kernel[2])
        print("WQ.shape",WQ.shape)
        print("K.permute_dimensions(WK, [0, 2, 1]).shape",K.permute_dimensions(WK, [0, 2, 1]).shape)
        QK = K.batch_dot(WQ,K.permute_dimensions(WK, [0, 2, 1]))
        QK = QK / (64**0.5)
        QK = K.softmax(QK)
        print("QK.shape",QK.shape)
        V = K.batch_dot(QK,WV)
        return V 
Example 55
Project: keras-transformer   Author: kpot   File: attention.py   MIT License
def __init__(self, num_heads: int, use_masking: bool,
                 dropout: float = 0.0,
                 compression_window_size: int = None,
                 **kwargs):
        """
        :param num_heads: number of attention heads
        :param use_masking: when True, forbids the attention to see the further
          elements in the sequence (particularly important in language
          modelling).
        :param dropout: dropout that should be applied to the attention
          (after the softmax).
        :param compression_window_size: an integer value >= 1 controlling
          how much we should compress the attention. For more details,
          read about memory-compressed self-attention in
          "Generating Wikipedia by summarizing long sequences"
          (https://arxiv.org/pdf/1801.10198.pdf).
        :param kwargs: any extra arguments typical for a Keras layer,
          such as name, etc.
        """
        self.num_heads = num_heads
        self.use_masking = use_masking
        self.dropout = dropout
        if (compression_window_size is not None
                and compression_window_size <= 0):
            raise ValueError(
                f"Too small compression window ({compression_window_size})")
        self.compression_window_size = compression_window_size
        super().__init__(**kwargs) 
Example 56
Project: stock-price-predict   Author: kaka-lin   File: seq2seq_attention_2.py   MIT License
def seq2seq_attention(feature_len=1, after_day=1, input_shape=(20, 1), time_step=20):
    # Define the inputs of your model with a shape (Tx, feature)
    X = Input(shape=input_shape)

    # Initialize empty list of outputs
    all_outputs = []

    # Encoder: pre-attention LSTM
    encoder = LSTM(units=100, return_state=True, return_sequences=True, name='encoder')
    # Decoder: post-attention LSTM
    decoder = LSTM(units=100, return_state=True, name='decoder')
    # Output
    decoder_output = Dense(units=feature_len, activation='linear', name='output')
    model_output = Reshape((1, feature_len))

    # Attention
    repeator = RepeatVector(time_step)
    concatenator = Concatenate(axis=-1)
    densor = Dense(1, activation = "relu")
    activator = Activation(softmax, name='attention_weights')
    dotor =  Dot(axes = 1)

    encoder_outputs, s, c = encoder(X)

    for t in range(after_day):
        context = one_step_attention(encoder_outputs, s, repeator, concatenator, densor, activator, dotor)

        a, s, c = decoder(context, initial_state=[s, c])

        outputs = decoder_output(a)
        outputs = model_output(outputs)
        all_outputs.append(outputs)

    all_outputs = Lambda(lambda x: K.concatenate(x, axis=1))(all_outputs)
    model = Model(inputs=X, outputs=all_outputs)

    return model 
Example 57
Project: stock-price-predict   Author: kaka-lin   File: seq2seq_attention.py   MIT License
def seq2seq_attention(feature_len=1, after_day=1, input_shape=(20, 1), time_step=20):
    # Define the inputs of your model with a shape (Tx, feature)
    X = Input(shape=input_shape)
    s0 = Input(shape=(100, ), name='s0')
    c0 = Input(shape=(100, ), name='c0')
    s = s0
    c = c0

    # Initialize empty list of outputs
    all_outputs = []

    # Encoder: pre-attention LSTM
    encoder = LSTM(units=100, return_state=False, return_sequences=True, name='encoder')
    # Decoder: post-attention LSTM
    decoder = LSTM(units=100, return_state=True, name='decoder')
    # Output
    decoder_output = Dense(units=feature_len, activation='linear', name='output')
    model_output = Reshape((1, feature_len))

    # Attention
    repeator = RepeatVector(time_step)
    concatenator = Concatenate(axis=-1)
    densor = Dense(1, activation = "relu")
    activator = Activation(softmax, name='attention_weights')
    dotor =  Dot(axes = 1)

    encoder_outputs = encoder(X)

    for t in range(after_day):
        context = one_step_attention(encoder_outputs, s, repeator, concatenator, densor, activator, dotor)

        a, s, c = decoder(context, initial_state=[s, c])

        outputs = decoder_output(a)
        outputs = model_output(outputs)
        all_outputs.append(outputs)

    all_outputs = Lambda(lambda x: K.concatenate(x, axis=1))(all_outputs)
    model = Model(inputs=[X, s0, c0], outputs=all_outputs)

    return model 
Example 58
Project: keras-SiameseRPN   Author: Alexlastname   File: eval_graph.py   MIT License
def eval_graph(*args, config=None):
    '''
    Input:
        box_map: [batch, 19, 19, 5*4]. Float32. 
        class_map: [batch, 19, 19, 5*2]. Float32.
        anchors: [batch, 19, 19, 5, 4]. Int16. Absolute coordinates with respect to the input shape
    Return:
        max_delta: [dx, dy, dw, dh]
        max_anchor: [x, y, w, h]
    '''
    box_map = args[0]
    class_map = args[1]
    anchors = args[2]
    
    # When evaluating batch size must be 1.!!!!!!
    # Change pytorch type data to tensorflow data
    # reshape to -1, so argmax can be used
    box_map = K.reshape(box_map, (-1,4,5))
    box_map = tf.transpose(box_map,(0,2,1))
    box_map = tf.reshape(box_map,(1,-1,4))
    
    class_map = K.reshape(class_map, (-1,2,5))
    class_map = tf.transpose(class_map,(0,2,1))
    class_map = tf.reshape(class_map,(1,-1,2))
    
    anchors = K.reshape(anchors, (1,-1,4))
    
    refined_box = refine_boxes(box_map, anchors)
    # Softmax activation
    class_map = K.softmax(class_map, -1)
    class_map = class_map[...,1]
    return [refined_box,class_map] 
Example 59
Project: keras-contrib   Author: keras-team   File: capsule.py   MIT License
def call(self, inputs):
        if self.share_weights:
            u_hat_vectors = K.conv1d(inputs, self.W)
        else:
            u_hat_vectors = K.local_conv1d(inputs, self.W, [1], [1])

        # u_hat_vectors : The spatially transformed input vectors (with local_conv_1d)

        batch_size = K.shape(inputs)[0]
        input_num_capsule = K.shape(inputs)[1]
        u_hat_vectors = K.reshape(u_hat_vectors, (batch_size,
                                                  input_num_capsule,
                                                  self.num_capsule,
                                                  self.dim_capsule))

        u_hat_vectors = K.permute_dimensions(u_hat_vectors, (0, 2, 1, 3))
        routing_weights = K.zeros_like(u_hat_vectors[:, :, :, 0])

        for i in range(self.routings):
            capsule_weights = K.softmax(routing_weights, 1)
            outputs = K.batch_dot(capsule_weights, u_hat_vectors, [2, 2])
            if K.ndim(outputs) == 4:
                outputs = K.sum(outputs, axis=1)
            if i < self.routings - 1:
                outputs = K.l2_normalize(outputs, -1)
                routing_weights = K.batch_dot(outputs, u_hat_vectors, [2, 3])
                if K.ndim(routing_weights) == 4:
                    routing_weights = K.sum(routing_weights, axis=1)

        return self.activation(outputs) 
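In the routing loop above, K.softmax(routing_weights, 1) normalizes the coupling coefficients over the capsule axis rather than the last axis. A standalone illustration of the axis argument (assuming a Keras version, 2.2 or later, in which keras.backend.softmax accepts axis):

import numpy as np
from keras import backend as K

x = K.constant(np.random.randn(1, 2, 3))
print(K.eval(K.softmax(x, axis=1)).sum(axis=1))    # all ones: normalized over axis 1
print(K.eval(K.softmax(x, axis=-1)).sum(axis=-1))  # all ones: normalized over the last axis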
Example 60
Project: experiments   Author: Octavian-ai   File: adjacency_layer.py    MIT License 5 votes vote down vote up
def call_dense(self, x):
		self.jitter(idx=[0,1], var=0.1)

		pr = self.product
		pe = self.person

		pr = K.softmax(pr)
		pe = K.softmax(pe)

		all_pairs = self.cartesian_product_matrix(pr, pe)
		flat = K.reshape(all_pairs, (self.product_count * self.person_count, self.style_width * 2))

		m = K.dot(flat, self.w1)
		# m = K.bias_add(m, self.b1)
		m = K.relu(m, alpha=0.1)

		m = K.dropout(m, level=0.1)

		m = K.dot(m, self.w2)
		m = K.relu(m, alpha=0.1)

		m = K.reshape(m, (1, self.product_count, self.person_count))
		masked = m * x
		return masked



	# 100pc test accuracy 
Example 61
Project: keras_cbof   Author: passalis   File: cbof.py    MIT License 5 votes vote down vote up
def call(self, x):

        # Calculate the pairwise distances between the codewords and the feature vectors
        x_square = K.sum(x ** 2, axis=3, keepdims=True)
        y_square = K.sum(self.V ** 2, axis=2, keepdims=True)
        dists = x_square + y_square - 2 * K.conv2d(x, self.V, strides=(1, 1), padding='valid')
        dists = K.maximum(dists, 0)

        # Quantize the feature vectors
        quantized_features = K.softmax(- dists / (self.sigmas ** 2))

        # Compile the histogram
        if self.spatial_level == 0:
            histogram = K.mean(quantized_features, [1, 2])
        elif self.spatial_level == 1:
            shape = K.shape(quantized_features)
            mid_1 = K.cast(shape[1] / 2, 'int32')
            mid_2 = K.cast(shape[2] / 2, 'int32')
            histogram1 = K.mean(quantized_features[:, :mid_1, :mid_2, :], [1, 2])
            histogram2 = K.mean(quantized_features[:, mid_1:, :mid_2, :], [1, 2])
            histogram3 = K.mean(quantized_features[:, :mid_1, mid_2:, :], [1, 2])
            histogram4 = K.mean(quantized_features[:, mid_1:, mid_2:, :], [1, 2])
            histogram = K.stack([histogram1, histogram2, histogram3, histogram4], 1)
            histogram = K.reshape(histogram, (-1, 4 * self.N_k))
        else:
            # No other spatial level is currently supported (it is trivial to extend the code)
            assert False

        # Simple trick to avoid rescaling issues
        return histogram * self.N_k 
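The dists computation above relies on the expansion ||x - v||^2 = ||x||^2 + ||v||^2 - 2 x.v, with the cross term obtained through a convolution against the codewords. A quick NumPy check of that identity (standalone, not part of the project):

import numpy as np

x = np.random.randn(4)   # a feature vector
v = np.random.randn(4)   # a codeword
lhs = np.sum((x - v) ** 2)
rhs = np.sum(x ** 2) + np.sum(v ** 2) - 2 * np.dot(x, v)
assert np.isclose(lhs, rhs)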
Example 62
Project: CDRextraction   Author: Xls1994   File: customize_layer.py    Apache License 2.0 5 votes vote down vote up
def call(self, inputs, mask=None):

        memory = inputs
        print('memory shape', K.int_shape(memory))
        gi = K.tanh(K.dot(memory, self.W) + self.b)  # 32 * 6 * 1
        gi = K.sum(gi, axis=-1)   # 32 * 6
        alfa = K.softmax(gi)
        self.alfa = alfa
        output = K.sum(memory * K.expand_dims(alfa, axis=-1), axis=1)  # weighted sum over (32, 6, 310)
        print('output shape', K.int_shape(output))
        return output 
Example 63
Project: nlp_toolkit   Author: stevewyl   File: self_attention.py    MIT License 5 votes vote down vote up
def call(self, x):
        # if only [Q_seq, K_seq, V_seq] is passed in, no mask is applied
        # if [Q_len, V_len] are also passed in, the mask is applied to the padded (redundant) positions
        if len(x) == 3:
            Q_seq, K_seq, V_seq = x
            Q_len, V_len = None, None
        elif len(x) == 5:
            Q_seq, K_seq, V_seq, Q_len, V_len = x
        # linear transformation of Q, K, V
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(
            Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(
            K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(
            V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))
        # compute inner product, then mask, then softmax
        A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
        A = K.permute_dimensions(A, (0, 3, 2, 1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0, 3, 2, 1))
        A = K.softmax(A)
        # output and mask
        O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
        O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq 
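Stripped of the multi-head reshaping and masking, the core of the layer above is scaled dot-product attention: softmax(Q K^T / sqrt(d)) V. A minimal single-head NumPy sketch (standalone, mask handling omitted):

import numpy as np

def scaled_dot_product_attention(Q, K_, V):
    # K_ is named with an underscore to avoid clashing with the Keras backend alias
    d = Q.shape[-1]
    logits = Q @ K_.T / np.sqrt(d)
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)   # row-wise softmax
    return weights @ V

Q = np.random.randn(5, 8)
K_ = np.random.randn(5, 8)
V = np.random.randn(5, 8)
out = scaled_dot_product_attention(Q, K_, V)         # shape (5, 8)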
Example 64
Project: knowledge_distillation   Author: wmpauli   File: kd_squeezenet.py    MIT License 5 votes vote down vote up
def softmax(x):
    return np.exp(x)/np.exp(x).sum()


# get a random batch 
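A side note (not from the original project): the NumPy softmax above can overflow for large logits; subtracting the maximum first is the usual numerically stable variant:

import numpy as np

def stable_softmax(x):
    z = x - np.max(x)        # shift so np.exp never overflows
    e = np.exp(z)
    return e / e.sum()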
Example 65
Project: knowledge_distillation   Author: wmpauli   File: kd_squeezenet.py    MIT License 5 votes vote down vote up
def soft_logloss(y_true, y_pred):     
    logits = y_true[:, 256:]
    y_soft = K.softmax(logits/temperature)
    y_pred_soft = y_pred[:, 256:]    
    return logloss(y_soft, y_pred_soft) 
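Dividing the logits by a temperature greater than 1, as done above, flattens the teacher distribution before it is compared with the student. A quick NumPy illustration of the effect (standalone, hypothetical values):

import numpy as np

def softmax_t(x, t):
    z = x / t
    e = np.exp(z - np.max(z))
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1])
print(softmax_t(logits, 1.0))   # sharp:   ~[0.66, 0.24, 0.10]
print(softmax_t(logits, 5.0))   # flatter: ~[0.40, 0.33, 0.27]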
Example 66
Project: knowledge_distillation   Author: wmpauli   File: kd_squeezenet.py    MIT License 5 votes vote down vote up
def softmax(x):
    return np.exp(x)/np.exp(x).sum()


# In[7]:


# get a random batch 
Example 67
Project: tying-wv-and-wc   Author: icoxfog417   File: augmented_model.py    MIT License 5 votes vote down vote up
def perplexity(cls, y_true, y_pred):
        _y_pred = Activation("softmax")(y_pred)
        return super(AugmentedModel, cls).perplexity(y_true, _y_pred) 
Example 68
Project: keras-attention-augmented-convs   Author: titu1994   File: attn_augconv.py    MIT License 4 votes vote down vote up
def call(self, inputs, **kwargs):
        if self.axis == 1:
            # If channels first, force it to be channels last for these ops
            inputs = K.permute_dimensions(inputs, [0, 2, 3, 1])

        q, k, v = tf.split(inputs, [self.depth_k, self.depth_k, self.depth_v], axis=-1)

        q = self.split_heads_2d(q)
        k = self.split_heads_2d(k)
        v = self.split_heads_2d(v)

        # scale query
        depth_k_heads = self.depth_k / self.num_heads
        q *= (depth_k_heads ** -0.5)

        # [Batch, num_heads, height * width, depth_k or depth_v] if axis == -1
        qk_shape = [self._batch, self.num_heads, self._height * self._width, self.depth_k // self.num_heads]
        v_shape = [self._batch, self.num_heads, self._height * self._width, self.depth_v // self.num_heads]
        flat_q = K.reshape(q, K.stack(qk_shape))
        flat_k = K.reshape(k, K.stack(qk_shape))
        flat_v = K.reshape(v, K.stack(v_shape))

        # [Batch, num_heads, HW, HW]
        logits = tf.matmul(flat_q, flat_k, transpose_b=True)

        # Apply relative encodings
        if self.relative:
            h_rel_logits, w_rel_logits = self.relative_logits(q)
            logits += h_rel_logits
            logits += w_rel_logits

        weights = K.softmax(logits, axis=-1)
        attn_out = tf.matmul(weights, flat_v)

        attn_out_shape = [self._batch, self.num_heads, self._height, self._width, self.depth_v // self.num_heads]
        attn_out_shape = K.stack(attn_out_shape)
        attn_out = K.reshape(attn_out, attn_out_shape)
        attn_out = self.combine_heads_2d(attn_out)
        # [batch, height, width, depth_v]

        if self.axis == 1:
            # return to [batch, depth_v, height, width] for channels first
            attn_out = K.permute_dimensions(attn_out, [0, 3, 1, 2])

        return attn_out 
Example 69
Project: AI2-Reasoning-Challenge-ARC   Author: SebiSebi   File: arch.py    GNU General Public License v3.0 4 votes vote down vote up
def simple_LSTM(num_words, embeddings_matrix, ce_loader,
                scope, embedding_dim=64):
    # (batch, input_len) => (batch, input_len, embedding_dim)
    q_input = Input(shape=(QUESTION_LEN,), name="q_input")
    a_input = Input(shape=(ANSWER_LEN,), name="a_input")
    c_input = Input(shape=(CONTEXT_LEN,), name="c_input")

    q_emb = Embedding(input_dim=num_words + 1,  # word 0 used for padding
                      output_dim=embedding_dim,
                      weights=[embeddings_matrix],
                      input_length=QUESTION_LEN,
                      name="embedding_q_" + scope,
                      mask_zero=False,
                      trainable=False)
    a_emb = Embedding(input_dim=num_words + 1,  # word 0 used for padding
                      output_dim=embedding_dim,
                      weights=[embeddings_matrix],
                      input_length=ANSWER_LEN,
                      name="embedding_a_" + scope,
                      mask_zero=False,
                      trainable=False)
    c_emb = Embedding(input_dim=num_words + 1,  # word 0 used for padding
                      output_dim=embedding_dim,
                      weights=[embeddings_matrix],
                      input_length=CONTEXT_LEN,
                      name="embedding_c_" + scope,
                      mask_zero=False,
                      trainable=False)

    q = q_emb(q_input)
    a = a_emb(a_input)
    c = c_emb(c_input)

    q_lstm = LSTM(150, recurrent_dropout=0.15)(q)
    a_lstm = LSTM(150, recurrent_dropout=0.15)(a)
    c_lstm = LSTM(150, recurrent_dropout=0.15)(c)

    cqa = concatenate([c_lstm, q_lstm, a_lstm], axis=1)
    cqa = Dropout(0.25)(cqa)

    output_1 = Dense(250, activation='relu')(cqa)
    output_1 = Dropout(0.25)(output_1)
    output_2 = Dense(350, activation='relu')(output_1)

    output = Dense(2, activation='softmax')(output_2)
    model = Model(inputs=[q_input,
                          a_input, c_input], outputs=[output])
    model.compile(loss=categorical_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy'])
    plot_model(model, to_file='2way_model.png', show_shapes=True)
    return model 
Example 70
Project: AI2-Reasoning-Challenge-ARC   Author: SebiSebi   File: arch.py    GNU General Public License v3.0 4 votes vote down vote up
def attention_heatmap(num_words, embeddings_matrix, scope, embedding_dim=64):
    # (batch, input_len) => (batch, input_len, embedding_dim)
    q_input = Input(shape=(QUESTION_LEN,), name="q_input")
    a_input = Input(shape=(ANSWER_LEN,), name="a_input")
    c_input = Input(shape=(CONTEXT_LEN,), name="c_input")

    q_emb = Embedding(input_dim=num_words + 1,  # word 0 used for padding
                      output_dim=embedding_dim,
                      weights=[embeddings_matrix],
                      input_length=QUESTION_LEN,
                      name="embedding_q_" + scope,
                      mask_zero=False,
                      trainable=False)
    a_emb = Embedding(input_dim=num_words + 1,  # word 0 used for padding
                      output_dim=embedding_dim,
                      weights=[embeddings_matrix],
                      input_length=ANSWER_LEN,
                      name="embedding_a_" + scope,
                      mask_zero=False,
                      trainable=False)
    c_emb = Embedding(input_dim=num_words + 1,  # word 0 used for padding
                      output_dim=embedding_dim,
                      weights=[embeddings_matrix],
                      input_length=CONTEXT_LEN,
                      name="embedding_c_" + scope,
                      mask_zero=False,
                      trainable=False)

    q = q_emb(q_input)
    a = a_emb(a_input)
    c = c_emb(c_input)

    q = TimeDistributed(Dense(300, activation='tanh'))(q)
    a = TimeDistributed(Dense(300, activation='tanh'))(a)
    c = TimeDistributed(Dense(300, activation='tanh'))(c)

    q_lstm = Bidirectional(LSTM(50, recurrent_dropout=0.35))(q)
    c_lstm = Bidirectional(LSTM(50, recurrent_dropout=0.35,
                                return_sequences=True))(c)

    aux1 = TimeDistributed(Dense(200, activation=None,
                                 use_bias=False))(c_lstm)

    aux2 = Dense(200, activation=None, use_bias=False)(q_lstm)
    aux2 = RepeatVector(CONTEXT_LEN)(aux2)

    mt = Add()([aux1, aux2])
    mt = TimeDistributed(Activation('tanh'))(mt)

    st = TimeDistributed(Dense(1, activation=None, use_bias=False))(mt)
    st = Reshape((CONTEXT_LEN,))(st)
    st = Activation('softmax')(st)
    st = Reshape((CONTEXT_LEN, 1))(st)

    model = Model(inputs=[q_input, a_input, c_input], outputs=[st])
    model.compile(loss=categorical_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy'])
    return model 
Example 71
Project: AI2-Reasoning-Challenge-ARC   Author: SebiSebi   File: arch.py    GNU General Public License v3.0 4 votes vote down vote up
def linked_attention(num_words, embeddings_matrix, scope, embedding_dim=64):
    # (batch, input_len) => (batch, input_len, embedding_dim)
    premise_input = Input(shape=(PREMISE_LEN,), name="p_input")
    premise_emb = Embedding(input_dim=num_words + 1,  # word 0 used for padding
                            output_dim=embedding_dim,
                            weights=[embeddings_matrix],
                            input_length=PREMISE_LEN,
                            name="embedding_prem_" + scope,
                            mask_zero=False,
                            trainable=False)(premise_input)
    hypothesis_input = Input(shape=(HYPOTHESIS_LEN,), name="h_input")
    hypothesis_emb = Embedding(input_dim=num_words + 1,
                               output_dim=embedding_dim,
                               weights=[embeddings_matrix],
                               input_length=HYPOTHESIS_LEN,
                               name="embeding_hypo_" + scope,
                               mask_zero=False,
                               trainable=False)(hypothesis_input)

    premise_emb = TimeDistributed(
                Dense(200, activation='relu'),
                input_shape=(PREMISE_LEN, embedding_dim))(premise_emb)
    hypothesis_emb = TimeDistributed(
                Dense(200, activation='relu'),
                input_shape=(HYPOTHESIS_LEN, embedding_dim))(hypothesis_emb)
    premise_emb = Dropout(0.4)(premise_emb)
    hypothesis_emb = Dropout(0.4)(hypothesis_emb)

    output_1, state_h, state_c = LSTM(175, recurrent_dropout=0.12,
                                      return_sequences=True,
                                      return_state=True)(premise_emb)
    output_2 = LSTM(175)(hypothesis_emb, initial_state=[state_h, state_c])

    output_3 = LinkedAttention(350)([output_1, output_2])
    output_4 = Dropout(0.4)(output_3)

    output_5 = Dense(512)(output_4)
    output_5 = BatchNormalization()(output_5)
    output_5 = Activation('relu')(output_5)
    output_5 = Dropout(0.5)(output_5)

    output_6 = Dense(512)(output_5)
    output_6 = Activation('relu')(output_6)
    output_6 = Dropout(0.4)(output_6)

    output = Dense(3, activation='softmax')(output_6)

    model = Model(inputs=[premise_input, hypothesis_input], outputs=[output])
    model.compile(loss=categorical_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy'])
    return model 
Example 72
Project: spektral   Author: danielegrattarola   File: convolutional.py    MIT License 4 votes vote down vote up
def call(self, inputs):
        X = inputs[0]
        A = inputs[1]

        outputs = []
        output_attn = []
        for head in range(self.attn_heads):
            kernel = self.kernels[head]
            attention_kernel = self.attn_kernels[head]  # Attention kernel a in the paper (2F' x 1)

            # Compute inputs to attention network
            features = K.dot(X, kernel)

            # Compute attention coefficients
            # [[a_1], [a_2]]^T [[Wh_i], [Wh_2]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
            attn_for_self = K.dot(features, attention_kernel[0])    # [a_1]^T [Wh_i]
            attn_for_neighs = K.dot(features, attention_kernel[1])  # [a_2]^T [Wh_j]
            if len(K.int_shape(features)) == 2:
                # Single / mixed mode
                attn_for_neighs_T = K.transpose(attn_for_neighs)
            else:
                # Batch mode
                attn_for_neighs_T = K.permute_dimensions(attn_for_neighs, (0, 2, 1))
            attn_coef = attn_for_self + attn_for_neighs_T
            attn_coef = LeakyReLU(alpha=0.2)(attn_coef)

            # Mask values before activation (Vaswani et al., 2017)
            mask = -10e9 * (1.0 - A)
            attn_coef += mask

            # Apply softmax to get attention coefficients
            attn_coef = K.softmax(attn_coef)
            output_attn.append(attn_coef)

            # Apply dropout to attention coefficients
            attn_coef_drop = Dropout(self.dropout_rate)(attn_coef)

            # Convolution
            features = filter_dot(attn_coef_drop, features)
            if self.use_bias:
                features = K.bias_add(features, self.biases[head])

            # Add output of attention head to final output
            outputs.append(features)

        # Aggregate the heads' output according to the reduction method
        if self.concat_heads:
            output = K.concatenate(outputs)
        else:
            output = K.mean(K.stack(outputs), axis=0)

        output = self.activation(output)

        if self.return_attn_coef:
            return output, output_attn
        else:
            return output 
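The mask = -10e9 * (1.0 - A) trick above pushes the logits of non-neighbours to a very large negative value before the softmax, so their attention weights vanish. A small NumPy illustration (standalone):

import numpy as np

logits = np.array([1.0, 2.0, 3.0])
adjacency = np.array([1.0, 0.0, 1.0])       # the second node is not a neighbour
masked = logits + -10e9 * (1.0 - adjacency)

e = np.exp(masked - masked.max())
print(e / e.sum())                          # ~[0.12, 0.0, 0.88]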
Example 73
Project: squeezedet-keras   Author: omni-us   File: utils.py    MIT License 4 votes vote down vote up
def slice_predictions(y_pred, config):
    """

    :param y_pred: network output
    :param config: config file
    :return: unpadded and sliced predictions
    """
    
    # calculate non padded entries
    n_outputs = config.CLASSES + 1 + 4
    # slice and reshape network output
    y_pred = y_pred[:, :, 0:n_outputs]
    y_pred = K.reshape(y_pred, (config.BATCH_SIZE, config.N_ANCHORS_HEIGHT, config.N_ANCHORS_WIDTH, -1))
    
    # number of class probabilities, n classes for each anchor
    
    num_class_probs = config.ANCHOR_PER_GRID * config.CLASSES

    # slice pred tensor to extract class pred scores and then normalize them
    pred_class_probs = K.reshape(
        K.softmax(
            K.reshape(
                y_pred[:, :, :, :num_class_probs],
                [-1, config.CLASSES]
            )
        ),
        [config.BATCH_SIZE, config.ANCHORS, config.CLASSES],
    )

    # number of confidence scores, one for each anchor + class probs
    num_confidence_scores = config.ANCHOR_PER_GRID + num_class_probs

    # slice the confidence scores and put them through a sigmoid for probabilities
    pred_conf = K.sigmoid(
        K.reshape(
            y_pred[:, :, :, num_class_probs:num_confidence_scores],
            [config.BATCH_SIZE, config.ANCHORS]
        )
    )

    # slice remaining bounding box_deltas
    pred_box_delta = K.reshape(
        y_pred[:, :, :, num_confidence_scores:],
        [config.BATCH_SIZE, config.ANCHORS, 4]
    )
    
    return [pred_class_probs, pred_conf, pred_box_delta] 
Example 74
Project: squeezedet-keras   Author: omni-us   File: utils.py    MIT License 4 votes vote down vote up
def slice_predictions_np(y_pred, config):
    """
    does the same as above, only uses numpy
    :param y_pred: network output
    :param config: config file
    :return: unpadded and sliced predictions
    """

    # calculate non padded entries
    n_outputs = config.CLASSES + 1 + 4
    # slice and reshape network output
    y_pred = y_pred[:, :, 0:n_outputs]
    y_pred = np.reshape(y_pred, (config.BATCH_SIZE, config.N_ANCHORS_HEIGHT, config.N_ANCHORS_WIDTH, -1))

    # number of class probabilities, n classes for each anchor

    num_class_probs = config.ANCHOR_PER_GRID * config.CLASSES

    # slice pred tensor to extract class pred scores and then normalize them
    pred_class_probs = np.reshape(
        softmax(
            np.reshape(
                y_pred[:, :, :, :num_class_probs],
                [-1, config.CLASSES]
            )
        ),
        [config.BATCH_SIZE, config.ANCHORS, config.CLASSES],
    )

    # number of confidence scores, one for each anchor + class probs
    num_confidence_scores = config.ANCHOR_PER_GRID + num_class_probs

    # slice the confidence scores and put them through a sigmoid for probabilities
    pred_conf = sigmoid(
        np.reshape(
            y_pred[:, :, :, num_class_probs:num_confidence_scores],
            [config.BATCH_SIZE, config.ANCHORS]
        )
    )

    # slice remaining bounding box_deltas
    pred_box_delta = np.reshape(
        y_pred[:, :, :, num_confidence_scores:],
        [config.BATCH_SIZE, config.ANCHORS, 4]
    )

    return [pred_class_probs, pred_conf, pred_box_delta] 
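The NumPy softmax and sigmoid helpers called above are not shown in this snippet. Plausible definitions consistent with the Keras version of the function (assumed, not copied from the project):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))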
Example 75
Project: recipe-summarization   Author: rtlee9   File: model.py    MIT License 4 votes vote down vote up
def create_model(vocab_size, embedding_size, LR, rnn_layers, rnn_size, embedding=None):
    """Construct and compile LSTM model."""
    # create a standard stacked LSTM
    if embedding is not None:
        embedding = [embedding]
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size,
                        input_length=maxlen,
                        W_regularizer=regularizer, dropout=p_emb, weights=embedding, mask_zero=True,
                        name='embedding_1'))
    for i in range(rnn_layers):
        lstm = LSTM(rnn_size, return_sequences=True,
                    W_regularizer=regularizer, U_regularizer=regularizer,
                    b_regularizer=regularizer, dropout_W=p_W, dropout_U=p_U,
                    name='lstm_{}'.format(i + 1))
        model.add(lstm)
        model.add(Dropout(p_dense, name='dropout_{}'.format(i + 1)))

    def simple_context(X, mask, n=activation_rnn_size):
        """Reduce the input just to its headline part (second half).

        For each word in this part, it concatenates the output of the previous layer (RNN)
        with a weighted average of the outputs of the description part.
        Only the last `rnn_size - activation_rnn_size` units of each output are used here;
        the first `activation_rnn_size` units are used to compute the weights for the averaging.
        """
        desc, head = X[:, :maxlend, :], X[:, maxlend:, :]
        head_activations, head_words = head[:, :, :n], head[:, :, n:]
        desc_activations, desc_words = desc[:, :, :n], desc[:, :, n:]

        # RTFM http://deeplearning.net/software/theano/library/tensor/basic.html#theano.tensor.batched_tensordot
        # activation for every head word and every desc word
        activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2))
        # make sure we don't use description words that are masked out
        activation_energies = activation_energies + -1e20 * K.expand_dims(
            1. - K.cast(mask[:, :maxlend], 'float32'), 1)

        # for every head word compute weights for every desc word
        activation_energies = K.reshape(activation_energies, (-1, maxlend))
        activation_weights = K.softmax(activation_energies)
        activation_weights = K.reshape(activation_weights, (-1, maxlenh, maxlend))

        # for every head word compute weighted average of desc words
        desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1))
        return K.concatenate((desc_avg_word, head_words))

    if activation_rnn_size:
        model.add(SimpleContext(simple_context, rnn_size, name='simplecontext_1'))

    model.add(TimeDistributed(Dense(
        vocab_size,
        W_regularizer=regularizer,
        b_regularizer=regularizer,
        name='timedistributed_1')))
    model.add(Activation('softmax', name='activation_1'))

    # opt = Adam(lr=LR)  # keep calm and reduce learning rate
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    K.set_value(model.optimizer.lr, np.float32(LR))
    return model 
Example 76
Project: pycorrector   Author: shibing624   File: seq2seq_attn_model.py    Apache License 2.0 4 votes vote down vote up
def build_model(self):
        # Build the seq2seq model
        x_in = Input(shape=(None,))
        y_in = Input(shape=(None,))
        x = x_in
        y = y_in
        x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x)
        y_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(y)

        x_one_hot = Lambda(self._one_hot)([x, x_mask])
        x_prior = ScaleShift()(x_one_hot)  # learn a prior over the output distribution (target tokens are likely to appear in the input)

        # embedding
        embedding = Embedding(self.vocab_size, self.hidden_dim)
        x = embedding(x)
        y = embedding(y)

        # encoder: two-layer bidirectional GRU; decoder: two-layer unidirectional GRU
        if self.use_gpu:
            print("use GPU")
            # encoder
            x = Bidirectional(CuDNNGRU(int(self.hidden_dim / 2), return_sequences=True))(x)
            x = Bidirectional(CuDNNGRU(int(self.hidden_dim / 2), return_sequences=True))(x)
            # decoder
            y = CuDNNGRU(self.hidden_dim, return_sequences=True)(y)
            y = CuDNNGRU(self.hidden_dim, return_sequences=True)(y)
        else:
            print("use CPU")
            # encoder
            x = Bidirectional(GRU(int(self.hidden_dim / 2), return_sequences=True, dropout=self.dropout))(x)
            x = Bidirectional(GRU(int(self.hidden_dim / 2), return_sequences=True, dropout=self.dropout))(x)
            # decoder
            y = GRU(self.hidden_dim, return_sequences=True, dropout=self.dropout)(y)
            y = GRU(self.hidden_dim, return_sequences=True, dropout=self.dropout)(y)

        xy = Interact()([y, x, x_mask])
        xy = Dense(512, activation='relu')(xy)
        xy = Dense(self.vocab_size)(xy)
        xy = Lambda(lambda x: (x[0] + x[1]) / 2)([xy, x_prior])  # average with the prior
        xy = Activation('softmax')(xy)

        # cross-entropy loss, with the padding positions masked out
        cross_entropy = K.sparse_categorical_crossentropy(y_in[:, 1:], xy[:, :-1])
        loss = K.sum(cross_entropy * y_mask[:, 1:, 0]) / K.sum(y_mask[:, 1:, 0])

        model = Model([x_in, y_in], xy)
        model.add_loss(loss)
        model.compile(optimizer=Adam(1e-3))
        if os.path.exists(self.model_path):
            model.load_weights(self.model_path)
        return model 
Example 77
Project: Keras-TextClassification   Author: yongzhuo   File: capsule.py    MIT License 4 votes vote down vote up
def call(self, inputs, training=None):
        # inputs.shape=[None, input_num_capsule, input_dim_capsule]
        # inputs_expand.shape=[None, 1, input_num_capsule, input_dim_capsule]
        inputs_expand = K.expand_dims(inputs, 1)

        # Replicate num_capsule dimension to prepare being multiplied by W
        # inputs_tiled.shape=[None, num_capsule, input_num_capsule, input_dim_capsule]
        inputs_tiled = K.tile(inputs_expand, [1, self.num_capsule, 1, 1])

        # Compute `inputs * W` by scanning inputs_tiled on dimension 0.
        # x.shape=[num_capsule, input_num_capsule, input_dim_capsule]
        # W.shape=[num_capsule, input_num_capsule, dim_capsule, input_dim_capsule]
        # Regard the first two dimensions as `batch` dimension,
        # then matmul: [input_dim_capsule] x [dim_capsule, input_dim_capsule]^T -> [dim_capsule].
        # inputs_hat.shape = [None, num_capsule, input_num_capsule, dim_capsule]
        inputs_hat = K.map_fn(lambda x: K.batch_dot(x, self.W, [2, 3]), elems=inputs_tiled)

        # Begin: Routing algorithm ---------------------------------------------------------------------#
        # The prior for coupling coefficient, initialized as zeros.
        # b.shape = [None, self.num_capsule, self.input_num_capsule].
        b = tf.zeros(shape=[K.shape(inputs_hat)[0], self.num_capsule, self.input_num_capsule])

        assert self.routings > 0, 'The routings should be > 0.'
        for i in range(self.routings):
            # c.shape=[batch_size, num_capsule, input_num_capsule]
            c = tf.nn.softmax(b, axis=1)  # 'dim' was renamed to 'axis' in newer TensorFlow

            # c.shape =  [batch_size, num_capsule, input_num_capsule]
            # inputs_hat.shape=[None, num_capsule, input_num_capsule, dim_capsule]
            # The first two dimensions as `batch` dimension,
            # then matmul: [input_num_capsule] x [input_num_capsule, dim_capsule] -> [dim_capsule].
            # outputs.shape=[None, num_capsule, dim_capsule]
            outputs = squash(K.batch_dot(c, inputs_hat, [2, 2]))  # [None, 10, 16]

            if i < self.routings - 1:
                # outputs.shape =  [None, num_capsule, dim_capsule]
                # inputs_hat.shape=[None, num_capsule, input_num_capsule, dim_capsule]
                # The first two dimensions as `batch` dimension,
                # then matmul: [dim_capsule] x [input_num_capsule, dim_capsule]^T -> [input_num_capsule].
                # b.shape=[batch_size, num_capsule, input_num_capsule]
                b += K.batch_dot(outputs, inputs_hat, [2, 3])
        # End: Routing algorithm -----------------------------------------------------------------------#

        return outputs 
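The squash non-linearity used inside the routing loop is not defined in this snippet. The standard CapsNet squashing function (Sabour et al., 2017) is commonly written as:

import keras.backend as K

def squash(vectors, axis=-1):
    # shrink short vectors toward zero and keep long vectors just below unit length
    s_squared_norm = K.sum(K.square(vectors), axis, keepdims=True)
    scale = s_squared_norm / (1 + s_squared_norm) / K.sqrt(s_squared_norm + K.epsilon())
    return scale * vectors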
Example 78
Project: keras-extras   Author: kuza55   File: DiffForest.py    Apache License 2.0 4 votes vote down vote up
def call(self, x, mask=None):
        N_DECISION = (2 ** (self.n_depth)) - 1  # Number of decision nodes
        N_LEAF  = 2 ** (self.n_depth + 1)  # Number of leaf nodes

        flat_decision_p_e = []
        leaf_p_e = []
        for w_d, w_l in zip(self.w_d_ensemble, self.w_l_ensemble):

            decision_p = K.sigmoid((K.dot(x, w_d)))
            leaf_p = K.softmax(w_l)

            decision_p_comp = 1 - decision_p

            decision_p_pack = K.concatenate([decision_p, decision_p_comp])

            flat_decision_p_e.append(decision_p_pack)
            leaf_p_e.append(leaf_p)

        # Construct the tiling pattern for the decision probability matrix.
        # Could be done in TF, but it is simpler to build it statically.
        tiling_pattern = np.zeros((N_LEAF, self.n_depth), dtype=np.int32)
        comp_offset = N_DECISION
        dec_idx = 0
        for n in range(self.n_depth):
            j = 0
            for depth_idx in range(2 ** n):
                repeat_times = 2 ** (self.n_depth - n)
                for _ in range(repeat_times):
                    tiling_pattern[j][n] = dec_idx
                    j = j + 1

                for _ in range(repeat_times):
                    tiling_pattern[j][n] = comp_offset + dec_idx
                    j = j + 1

                dec_idx = dec_idx + 1

        flat_pattern = tiling_pattern.flatten()

        # iterate over each tree
        tree_ret = None
        for flat_decision_p, leaf_p in zip(flat_decision_p_e, leaf_p_e):
            flat_mu = tf.transpose(tf.gather(tf.transpose(flat_decision_p), flat_pattern))
            
            batch_size = tf.shape(flat_decision_p)[0]
            shape = tf.stack([batch_size, N_LEAF, self.n_depth])

            mu = K.reshape(flat_mu, shape)
            leaf_prob = K.prod(mu, [2])
            prob_label = K.dot(leaf_prob, leaf_p)

            if tree_ret is None:
              tree_ret = prob_label
            else:
              tree_ret = tree_ret + prob_label

        return tree_ret/self.n_trees 
Example 79
Project: ntm_keras   Author: flomlo   File: ntm.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def _split_and_apply_activations(self, controller_output):
        """ This takes the controller output, splits it in ntm_output, read and wright adressing data.
            It returns a triple of ntm_output, controller_instructions_read, controller_instructions_write.
            ntm_output is a tensor, controller_instructions_read and controller_instructions_write are lists containing
            the adressing instruction (k, beta, g, shift, gamma) and in case of write also the writing constructions,
            consisting of an erase and an add vector. 

            As it is necesseary for stable results,
            k and add_vector is activated via tanh, erase_vector via sigmoid (this is critical!),
            shift via softmax,
            gamma is sigmoided, inversed and clipped (probably not ideal)
            g is sigmoided,
            beta is linear (probably not ideal!) """
        
        # splitting
        ntm_output, controller_instructions_read, controller_instructions_write = tf.split(
                    controller_output,
                    np.asarray([self.output_dim,
                                self.read_heads * self.controller_read_head_emitting_dim,
                                self.write_heads * self.controller_write_head_emitting_dim]),
                    axis=1)

        controller_instructions_read = tf.split(controller_instructions_read, self.read_heads, axis=1)
        controller_instructions_write = tf.split(controller_instructions_write, self.write_heads, axis=1)

        controller_instructions_read = [
                tf.split(single_head_data, np.asarray([self.m_depth, 1, 1, 3, 1]), axis=1) for 
                single_head_data in controller_instructions_read]
        
        controller_instructions_write = [
                tf.split(single_head_data, np.asarray([self.m_depth, 1, 1, 3, 1, self.m_depth, self.m_depth]), axis=1) for 
                single_head_data in controller_instructions_write]
        
        #activation
        ntm_output = self.activation(ntm_output)
        controller_instructions_read = [(tanh(k), hard_sigmoid(beta)+0.5, sigmoid(g), softmax(shift), 1 + 9*sigmoid(gamma)) for
                (k, beta, g, shift, gamma) in controller_instructions_read]
        controller_instructions_write = [
                (tanh(k), hard_sigmoid(beta)+0.5, sigmoid(g), softmax(shift), 1 + 9*sigmoid(gamma), hard_sigmoid(erase_vector), tanh(add_vector))  for 
                (k, beta, g, shift, gamma, erase_vector, add_vector) in controller_instructions_write]
       
        return (ntm_output, controller_instructions_read, controller_instructions_write) 
Example 80
Project: keras-gat   Author: danielegrattarola   File: graph_attention_layer.py    MIT License 4 votes vote down vote up
def call(self, inputs):
        X = inputs[0]  # Node features (N x F)
        A = inputs[1]  # Adjacency matrix (N x N)

        outputs = []
        for head in range(self.attn_heads):
            kernel = self.kernels[head]  # W in the paper (F x F')
            attention_kernel = self.attn_kernels[head]  # Attention kernel a in the paper (2F' x 1)

            # Compute inputs to attention network
            features = K.dot(X, kernel)  # (N x F')

            # Compute feature combinations
            # Note: [[a_1], [a_2]]^T [[Wh_i], [Wh_2]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
            attn_for_self = K.dot(features, attention_kernel[0])    # (N x 1), [a_1]^T [Wh_i]
            attn_for_neighs = K.dot(features, attention_kernel[1])  # (N x 1), [a_2]^T [Wh_j]

            # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]
            dense = attn_for_self + K.transpose(attn_for_neighs)  # (N x N) via broadcasting

            # Add nonlinearity
            dense = LeakyReLU(alpha=0.2)(dense)

            # Mask values before activation (Vaswani et al., 2017)
            mask = -10e9 * (1.0 - A)
            dense += mask

            # Apply softmax to get attention coefficients
            dense = K.softmax(dense)  # (N x N)

            # Apply dropout to features and attention coefficients
            dropout_attn = Dropout(self.dropout_rate)(dense)  # (N x N)
            dropout_feat = Dropout(self.dropout_rate)(features)  # (N x F')

            # Linear combination with neighbors' features
            node_features = K.dot(dropout_attn, dropout_feat)  # (N x F')

            if self.use_bias:
                node_features = K.bias_add(node_features, self.biases[head])

            # Add output of attention head to final output
            outputs.append(node_features)

        # Aggregate the heads' output according to the reduction method
        if self.attn_heads_reduction == 'concat':
            output = K.concatenate(outputs)  # (N x KF')
        else:
            output = K.mean(K.stack(outputs), axis=0)  # (N x F')

        output = self.activation(output)
        return output