Python keras.initializers.Constant() Examples

The following are 21 code examples of keras.initializers.Constant(), collected from open-source projects. Each example lists its source file, originating project, and license. You may also want to check out the other available functions and classes of the keras.initializers module.
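Before the project examples, here is a minimal, self-contained sketch of the basic API: Constant(value) fills a weight tensor with a single scalar and is typically passed as a kernel_initializer or bias_initializer (the layer sizes below are illustrative only).

from keras.models import Sequential
from keras.layers import Dense
from keras.initializers import Constant

# Every bias starts at 0.5; the kernel keeps the default glorot_uniform.
model = Sequential([
    Dense(16, activation='relu', input_shape=(8,),
          bias_initializer=Constant(value=0.5)),
    Dense(1),
])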
Example #1
Source File: hadamard.py    From landmark-recognition-challenge with GNU General Public License v3.0
def build(self, input_shape):

        hadamard_size = 2 ** int(math.ceil(math.log(max(input_shape[1], self.output_dim), 2)))
        self.hadamard = K.constant(
            value=hadamard(hadamard_size, dtype=np.int8)[:input_shape[1], :self.output_dim])

        init_scale = 1. / math.sqrt(self.output_dim)

        self.scale = self.add_weight(name='scale', 
                                      shape=(1,),
                                      initializer=Constant(init_scale),
                                      trainable=True)

        if self.use_bias:
            self.bias  = self.add_weight(name='bias', 
                                          shape=(self.output_dim,),
                                          initializer=RandomUniform(-init_scale, init_scale),
                                          trainable=True)

        super(HadamardClassifier, self).build(input_shape) 
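The snippet above depends on names imported elsewhere in hadamard.py; a plausible reconstruction of its imports (an assumption, not the file's actual header):

import math
import numpy as np
from scipy.linalg import hadamard  # Sylvester-construction Hadamard matrices
from keras import backend as K
from keras.initializers import Constant, RandomUniform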
Example #2
Source File: recurrent_highway_networks.py    From recurrentshop with MIT License
def RHN(input_dim, hidden_dim, depth):
    # Wrapped model
    inp = Input(batch_shape=(batch_size, input_dim))
    state = Input(batch_shape=(batch_size, hidden_dim))
    drop_mask = Input(batch_shape=(batch_size, hidden_dim))
    # To avoid all zero mask causing gradient to vanish
    inverted_drop_mask = Lambda(lambda x: 1.0 - x, output_shape=lambda s: s)(drop_mask)
    drop_mask_2 = Lambda(lambda x: x + 0., output_shape=lambda s: s)(inverted_drop_mask)
    dropped_state = multiply([state, inverted_drop_mask])
    y, new_state = RHNCell(units=hidden_dim, recurrence_depth=depth,
                           kernel_initializer=weight_init,
                           kernel_regularizer=l2(weight_decay),
                           kernel_constraint=max_norm(gradient_clip),
                           bias_initializer=Constant(transform_bias),
                           recurrent_initializer=weight_init,
                           recurrent_regularizer=l2(weight_decay),
                           recurrent_constraint=max_norm(gradient_clip))([inp, dropped_state])
    return RecurrentModel(input=inp, output=y,
                          initial_states=[state, drop_mask],
                          final_states=[new_state, drop_mask_2])
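RHNCell and RecurrentModel come from recurrentshop itself; the remaining free names are module-level hyperparameters defined elsewhere in the script. Illustrative stand-ins (the values are assumptions only):

batch_size = 128
weight_init = 'glorot_uniform'  # any Keras initializer spec
weight_decay = 1e-4             # strength of the l2() regularizers
gradient_clip = 2.0             # limit for the max_norm() constraints
transform_bias = -2.0           # negative bias keeps the transform gate mostly closed at first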


Example #3
Source File: query_reduction_network.py    From recurrentshop with MIT License
def QRNcell():
    xq = Input(batch_shape=(batch_size, embedding_dim * 2))
    # Split into context and query
    xt = Lambda(lambda x, dim: x[:, :dim], arguments={'dim': embedding_dim},
                output_shape=lambda s: (s[0], s[1] // 2))(xq)
    qt = Lambda(lambda x, dim: x[:, dim:], arguments={'dim': embedding_dim},
                output_shape=lambda s: (s[0], s[1] // 2))(xq)

    h_tm1 = Input(batch_shape=(batch_size, embedding_dim))

    zt = Dense(1, activation='sigmoid', bias_initializer=Constant(2.5))(multiply([xt, qt]))
    zt = Lambda(lambda x, dim: K.repeat_elements(x, dim, axis=1), arguments={'dim': embedding_dim})(zt)
    ch = Dense(embedding_dim, activation='tanh')(concatenate([xt, qt], axis=-1))
    rt = Dense(1, activation='sigmoid')(multiply([xt, qt]))
    rt = Lambda(lambda x, dim: K.repeat_elements(x, dim, axis=1), arguments={'dim': embedding_dim})(rt)
    ht = add([multiply([zt, ch, rt]), multiply([Lambda(lambda x: 1 - x, output_shape=lambda s: s)(zt), h_tm1])])
    return RecurrentModel(input=xq, output=ht, initial_states=[h_tm1], final_states=[ht], return_sequences=True)


Example #4
Source File: __init__.py    From deep_complex_networks with MIT License
def get_shallow_convnet(window_size=4096, channels=2, output_size=84):
    inputs = Input(shape=(window_size, channels))

    conv = ComplexConv1D(
        32, 512, strides=16,
        activation='relu')(inputs)
    pool = AveragePooling1D(pool_size=4, strides=2)(conv)

    pool = Permute([2, 1])(pool)
    flattened = Flatten()(pool)

    dense = ComplexDense(2048, activation='relu')(flattened)
    predictions = ComplexDense(
        output_size, 
        activation='sigmoid',
        bias_initializer=Constant(value=-5))(dense)
    predictions = GetReal(predictions)
    model = Model(inputs=inputs, outputs=predictions)

    model.compile(optimizer=Adam(lr=1e-4),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model 
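A hypothetical call to the builder above; ComplexConv1D, ComplexDense, and GetReal are supplied by the deep_complex_networks package:

model = get_shallow_convnet(window_size=4096, channels=2, output_size=84)
model.summary()  # inspect the complex-valued stack before training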
Example #5
Source File: motion_CNN3DmoreLayers.py    From CNNArt with Apache License 2.0
def fGetActivation(input_t,  iPReLU=0):
    init=0.25
    if iPReLU == 1:  # one alpha for each channel
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4])(input_t)
    elif iPReLU == 2:  # just one alpha for each layer
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4, 1])(input_t)
    else:
        output_t = Activation('relu')(input_t)
    return output_t 
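The Constant(value=0.25) here matches the PReLU paper (He et al., 2015), which initializes the slope to 0.25, whereas Keras's own default for alpha_initializer is 'zeros'. A standalone sketch of the same idea:

from keras.layers import PReLU
from keras.initializers import Constant

# One learnable slope per channel of a 5D (batch, x, y, z, channels) tensor:
prelu = PReLU(alpha_initializer=Constant(value=0.25), shared_axes=[2, 3, 4])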
Example #6
Source File: initializers_test.py    From DeepLearning_Wavelet-LSTM with MIT License
def test_constant(tensor_shape):
    _runner(initializers.Constant(2), tensor_shape,
            target_mean=2, target_max=2, target_min=2) 
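_runner is a shared helper in the Keras test suite; a dependency-free equivalent of what it verifies for Constant:

import numpy as np
from keras import backend as K
from keras import initializers

init = initializers.Constant(2)
weights = K.eval(init((3, 4)))  # materialize a 3x4 tensor filled with 2s
assert weights.min() == weights.max() == weights.mean() == 2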
Example #7
Source File: layers.py    From Keras-GAN-Animeface-Character with MIT License
def bilinear2x(x, nfilters):
    '''
    Bilinear 2x upsampling as a Conv2DTranspose whose kernel is
    initialized to bilinear interpolation weights.
    Credit: https://kivantium.net/keras-bilinear
    '''
    return Conv2DTranspose(nfilters, (4, 4),
                           strides=(2, 2),
                           padding='same',
                           kernel_initializer=Constant(bilinear_upsample_weights(2, nfilters)))(x)
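bilinear_upsample_weights is not shown in the snippet; a common FCN-style implementation (an assumed reconstruction, not the repository's actual helper) builds a bilinear interpolation kernel whose shape matches Conv2DTranspose's (height, width, filters, channels) weights:

import numpy as np

def bilinear_upsample_weights(factor, nfilters):
    """Hypothetical helper: (ksize, ksize, nfilters, nfilters) bilinear kernel."""
    ksize = 2 * factor - factor % 2                  # 4 when factor == 2
    center = factor - 1 if ksize % 2 == 1 else factor - 0.5
    og = np.ogrid[:ksize, :ksize]
    kernel = ((1 - abs(og[0] - center) / factor) *
              (1 - abs(og[1] - center) / factor))
    weights = np.zeros((ksize, ksize, nfilters, nfilters), dtype=np.float32)
    for i in range(nfilters):
        weights[:, :, i, i] = kernel                 # upsample each channel independently
    return weights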
Example #8
Source File: highway_layer.py    From bidaf-keras with GNU General Public License v3.0
def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        dim = input_shape[-1]
        transform_gate_bias_initializer = Constant(self.transform_gate_bias)
        self.dense_1 = Dense(units=dim, bias_initializer=transform_gate_bias_initializer)
        self.dense_1.build(input_shape)
        self.dense_2 = Dense(units=dim)
        self.dense_2.build(input_shape)
        self.trainable_weights = self.dense_1.trainable_weights + self.dense_2.trainable_weights

        super(Highway, self).build(input_shape)  # Be sure to call this at the end 
Example #9
Source File: keras_regression_deep_broken.py    From Deep-Learning-Quick-Reference with MIT License
def build_network(input_features=None):
    const_initializer = Constant(value=0)
    # first we specify an input layer, with a shape == features
    inputs = Input(shape=(input_features,), name="input")
    x = Dense(32, activation='relu', name="hidden1", kernel_initializer=const_initializer, bias_initializer='ones')(inputs)
    x = Dense(32, activation='relu', name="hidden2", kernel_initializer=const_initializer, bias_initializer='ones')(x)
    x = Dense(32, activation='relu', name="hidden3", kernel_initializer=const_initializer, bias_initializer='ones')(x)
    x = Dense(32, activation='relu', name="hidden4", kernel_initializer=const_initializer, bias_initializer='ones')(x)
    x = Dense(16, activation='relu', name="hidden5", kernel_initializer=const_initializer, bias_initializer='ones')(x)
    # for regression we will use a single neuron with linear (no) activation
    prediction = Dense(1, activation='linear', name="final", kernel_initializer=const_initializer, bias_initializer='ones')(x)

    model = Model(inputs=inputs, outputs=prediction)
    model.compile(optimizer='adam', loss='mean_absolute_error')
    return model 
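As the file name suggests, this network is deliberately broken: with every kernel initialized to the same constant (here 0), all units in a layer compute identical outputs and receive identical gradients, so training can never break the symmetry. A hypothetical call:

model = build_network(input_features=10)
model.summary()  # the architecture is fine; the constant kernel initialization is the flaw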
Example #10
Source File: motion_VNetArt.py    From CNNArt with Apache License 2.0
def fGetActivation(input_t,  iPReLU=0):
    init=0.25
    if iPReLU == 1:  # one alpha for each channel
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4])(input_t)
    elif iPReLU == 2:  # just one alpha for each layer
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4, 1])(input_t)
    else:
        output_t = Activation('relu')(input_t)
    return output_t 
Example #11
Source File: VNetArt.py    From CNNArt with Apache License 2.0
def fGetActivation(input_t, iPReLU=0):
    init = 0.25
    if iPReLU == 1:  # one alpha for each channel
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4])(input_t)
    elif iPReLU == 2:  # just one alpha for each layer
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4, 1])(input_t)
    else:
        output_t = Activation('relu')(input_t)
    return output_t 
Example #12
Source File: 3D_CNN.py    From CNNArt with Apache License 2.0
def fGetActivation(input_t, iPReLU=0):
    init = 0.25
    if iPReLU == 1:  # one alpha for each channel
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4])(input_t)
    elif iPReLU == 2:  # just one alpha for each layer
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4, 1])(input_t)
    else:
        output_t = Activation('relu')(input_t)
    return output_t 
Example #13
Source File: CNN3D.py    From CNNArt with Apache License 2.0
def fGetActivation(input_t, iPReLU=0):
    init = 0.25
    if iPReLU == 1:  # one alpha for each channel
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4])(input_t)
    elif iPReLU == 2:  # just one alpha for each layer
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4, 1])(input_t)
    else:
        output_t = Activation('relu')(input_t)
    return output_t 
Example #14
Source File: CNN3DmoreLayers.py    From CNNArt with Apache License 2.0
def fGetActivation(input_t, iPReLU=0):
    init = 0.25
    if iPReLU == 1:  # one alpha for each channel
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4])(input_t)
    elif iPReLU == 2:  # just one alpha for each layer
        output_t = PReLU(alpha_initializer=Constant(value=init), shared_axes=[2, 3, 4, 1])(input_t)
    else:
        output_t = Activation('relu')(input_t)
    return output_t 
Example #15
Source File: building_blocks.py    From Tacotron-2-keras with MIT License
def get_highway_output(highway_input, nb_layers, activation="tanh", bias=-3):
    dim = K.int_shape(highway_input)[-1]  # highway layers preserve dimensionality
    initial_bias = k_init.Constant(bias)  # negative bias -> gates start mostly closed
    for _ in range(nb_layers):
        transform_gate = Dense(units=dim, bias_initializer=initial_bias)(highway_input)
        transform_gate = Activation("sigmoid")(transform_gate)
        carry_gate = Lambda(lambda x: 1.0 - x,
                            output_shape=(dim,))(transform_gate)
        transformed = Dense(units=dim)(highway_input)
        transformed = Activation(activation)(transformed)
        transformed_gated = Multiply()([transform_gate, transformed])
        carried = Multiply()([carry_gate, highway_input])
        highway_output = Add()([transformed_gated, carried])
        highway_input = highway_output  # feed this layer's output into the next
    return highway_output
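A hypothetical call on a 2-D feature tensor; the negative bias keeps each transform gate mostly closed at the start of training, so the stack initially behaves like an identity mapping:

dense_features = Dense(128, activation='relu')(inputs)  # (batch, 128); `inputs` defined elsewhere
highway_out = get_highway_output(dense_features, nb_layers=4, activation='relu', bias=-3)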
Example #16
Source File: transformer.py    From keras-transformer with MIT License
def build(self, input_shape):
        assert len(input_shape) == 3
        _, sequence_length, d_model = input_shape
        self.halting_kernel = self.add_weight(
            name='halting_kernel',
            shape=(d_model, 1),
            initializer='glorot_uniform',
            trainable=True)
        self.halting_biases = self.add_weight(
            name='halting_biases',
            shape=(1,),
            initializer=initializers.Constant(0.1),
            trainable=True)
        self.time_penalty_t = K.constant(self.time_penalty, dtype=K.floatx())
        return super().build(input_shape) 
Example #17
Source File: graph_yoon_kim.py    From Keras-TextClassification with MIT License
def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        dim = input_shape[-1]
        self.dense_1 = Dense(units=dim, bias_initializer=Constant(self.transform_gate_bias))
        self.dense_1.build(input_shape)
        self.dense_2 = Dense(units=dim)
        self.dense_2.build(input_shape)
        self.trainable_weights = self.dense_1.trainable_weights + self.dense_2.trainable_weights
        super(Highway, self).build(input_shape)  # Be sure to call this at the end 
Example #18
Source File: graph_yoon_kim.py    From Keras-TextClassification with MIT License
def highway_keras(x):
    # Highway Network (http://arxiv.org/abs/1505.00387):
    #   1. s = sigmoid(W_T x + b_T)
    #   2. z = s * relu(W_H x + b_H) + (1 - s) * x
    # x shape: [N * time_depth, sum(filters)]
    # The paper initializes the transform gate bias to a negative value
    # (its experiments use -2 and -4) so each layer initially carries its
    # input through unchanged.

    gate_transform = Dense(units=K.int_shape(x)[1],
                           activation='sigmoid',
                           use_bias=True,
                           kernel_initializer='glorot_uniform',
                           bias_initializer=keras.initializers.Constant(value=-2))(x)
    gate_cross = 1 - gate_transform
    block_state = Dense(units=K.int_shape(x)[1],
                        activation='relu',
                        use_bias=True,
                        kernel_initializer='glorot_uniform',
                        bias_initializer='zero')(x)
    high_way = gate_transform * block_state + gate_cross * x

    return high_way 
Example #19
Source File: rbflayer.py    From rbf_keras with MIT License
def build(self, input_shape):

        self.centers = self.add_weight(name='centers',
                                       shape=(self.output_dim, input_shape[1]),
                                       initializer=self.initializer,
                                       trainable=True)
        self.betas = self.add_weight(name='betas',
                                     shape=(self.output_dim,),
                                     initializer=Constant(
                                         value=self.init_betas),
                                     # initializer='ones',
                                     trainable=True)

        super(RBFLayer, self).build(input_shape) 
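A hypothetical usage of the RBFLayer this build() belongs to; the constructor arguments mirror the attributes referenced above (output_dim, betas, initializer), but the exact signature is an assumption:

from keras.models import Sequential
from keras.layers import Dense
from rbflayer import RBFLayer

model = Sequential()
model.add(RBFLayer(10, betas=2.0, input_shape=(20,)))  # 10 RBF centers, beta initialized to 2.0
model.add(Dense(1))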
Example #20
Source File: keras_utils.py    From Benchmarks with MIT License
def build_initializer(type, kerasDefaults, seed=None, constant=0.):
    """ Set the initializer to the appropriate Keras initializer function
        based on the input string and learning rate. Other required values
        are set to the Keras default values

        Parameters
        ----------
        type : string
            String to choose the initializer

            Options recognized: 'constant', 'uniform', 'normal',
            'glorot_uniform', 'lecun_uniform', 'he_normal'

            See the Keras documentation for a full description of the options

        kerasDefaults : dict
            Dictionary of default parameter values to ensure consistency between frameworks

        seed : integer
            Random number seed

        constant : float
            Constant value (for the constant initializer only)

        Returns
        ----------
        The appropriate Keras initializer function
    """

    if type == 'constant':
        return initializers.Constant(value=constant)

    elif type == 'uniform':
        return initializers.RandomUniform(minval=kerasDefaults['minval_uniform'],
                                  maxval=kerasDefaults['maxval_uniform'],
                                  seed=seed)

    elif type == 'normal':
        return initializers.RandomNormal(mean=kerasDefaults['mean_normal'],
                                  stddev=kerasDefaults['stddev_normal'],
                                  seed=seed)

# Not generally available
#    elif type == 'glorot_normal':
#        return initializers.glorot_normal(seed=seed)

    elif type == 'glorot_uniform':
        return initializers.glorot_uniform(seed=seed)

    elif type == 'lecun_uniform':
        return initializers.lecun_uniform(seed=seed)

    elif type == 'he_normal':
        return initializers.he_normal(seed=seed) 
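An illustrative call; the kerasDefaults keys mirror the lookups in the function body, and the values shown are assumptions:

kerasDefaults = {'minval_uniform': -0.05, 'maxval_uniform': 0.05,
                 'mean_normal': 0.0, 'stddev_normal': 0.05}

init = build_initializer('constant', kerasDefaults, constant=0.1)  # -> initializers.Constant(0.1)
init = build_initializer('uniform', kerasDefaults, seed=42)        # -> RandomUniform in [-0.05, 0.05]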
Example #21
Source File: test_callbacks.py    From DeepLearning_Wavelet-LSTM with MIT License
def test_TerminateOnNaN():
    np.random.seed(1337)
    (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples,
                                                         num_test=test_samples,
                                                         input_shape=(input_dim,),
                                                         classification=True,
                                                         num_classes=num_classes)

    y_test = np_utils.to_categorical(y_test)
    y_train = np_utils.to_categorical(y_train)
    cbks = [callbacks.TerminateOnNaN()]
    model = Sequential()
    initializer = initializers.Constant(value=1e5)
    for _ in range(5):
        model.add(Dense(num_hidden, input_dim=input_dim, activation='relu',
                        kernel_initializer=initializer))
    model.add(Dense(num_classes, activation='linear'))
    model.compile(loss='mean_squared_error',
                  optimizer='rmsprop')

    # case 1 fit
    history = model.fit(X_train, y_train, batch_size=batch_size,
                        validation_data=(X_test, y_test), callbacks=cbks, epochs=20)
    loss = history.history['loss']
    assert len(loss) == 1
    assert loss[0] == np.inf

    # case 2 fit_generator
    def data_generator():
        max_batch_index = len(X_train) // batch_size
        i = 0
        while 1:
            yield (X_train[i * batch_size: (i + 1) * batch_size],
                   y_train[i * batch_size: (i + 1) * batch_size])
            i += 1
            i = i % max_batch_index
    history = model.fit_generator(data_generator(),
                                  len(X_train),
                                  validation_data=(X_test, y_test),
                                  callbacks=cbks,
                                  epochs=20)
    loss = history.history['loss']
    assert len(loss) == 1
    assert loss[0] == np.inf or np.isnan(loss[0]) 
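The Constant(value=1e5) kernels make activations explode through the five stacked ReLU layers, so the first batch already yields an inf (or NaN) loss; TerminateOnNaN then stops training after a single epoch, which is exactly what the len(loss) == 1 assertions check.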