"""Bagged Keras MLP with stratified K-fold cross-validation.

Reads ``train.csv``, ``test.csv`` and ``sampleSubmission.csv`` from ``path``,
trains ``n_bag`` networks per fold, and writes two files under ``path``:

* ``kerasNN3.csv``          -- test-set probabilities averaged over every
                               fold and bag.
* ``kerasNN3_retrain.csv``  -- out-of-fold probabilities for the training
                               rows (bag-averaged within each fold).
"""
from __future__ import absolute_import
from __future__ import print_function

import os

import numpy as np
import pandas as pd

# Seed NumPy's global RNG; kept ahead of the Keras imports, as in the
# original statement order.
np.random.seed(1337)  # for reproducibility

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils, generic_utils
from keras.optimizers import Adam, SGD, Optimizer
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.ensemble import BaggingClassifier
from sklearn.cross_validation import StratifiedKFold, KFold

path = '../Data/'


class LossHistory(Callback):
    """Keras callback that records the ``'loss'`` entry of every batch."""

    def on_train_begin(self, logs=None):
        # Start from an empty history each time training begins.
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        # ``logs`` defaults to None rather than a shared mutable dict.
        logs = {} if logs is None else logs
        self.losses.append(logs.get('loss'))


def load_data(path, train=True):
    """Load a CSV file and split it into a feature matrix and a side column.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    train : bool
        If True, the first column is skipped, the last column is returned
        as labels and everything in between is the feature matrix.
        If False, the first column is returned as string ids and all
        remaining columns are the feature matrix.

    Returns
    -------
    (X, labels) when ``train`` is True, otherwise (X, ids); ``X`` is float32.
    """
    df = pd.read_csv(path)
    X = df.values.copy()
    if train:
        X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]
        return X, labels
    X, ids = X[:, 1:].astype(np.float32), X[:, 0].astype(str)
    return X, ids


def preprocess_data(X, scaler=None):
    """Standardise ``X``; fit a new ``StandardScaler`` when none is supplied.

    Returns the transformed array together with the scaler that was used,
    so the same scaler can be passed back in for another data set.
    """
    # Identity check: an estimator object should never be judged by truthiness.
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler


def preprocess_labels(labels, encoder=None, categorical=True):
    """Encode labels as int32 class indices, optionally one-hot encoded.

    A new ``LabelEncoder`` is fitted on ``labels`` when ``encoder`` is None.
    Returns ``(y, encoder)``; ``y`` is passed through
    ``np_utils.to_categorical`` when ``categorical`` is True.
    """
    if encoder is None:
        encoder = LabelEncoder()
        encoder.fit(labels)
    y = encoder.transform(labels).astype(np.int32)
    if categorical:
        y = np_utils.to_categorical(y)
    return y, encoder


def make_submission(y_prob, ids, encoder, fname):
    """Write class probabilities to ``fname`` as CSV.

    The header is ``id`` followed by ``encoder.classes_``; each subsequent
    row is one id from ``ids`` followed by its row of ``y_prob``.
    ``ids`` must contain strings, since they are joined directly.
    """
    with open(fname, 'w') as f:
        f.write('id,')
        f.write(','.join([str(i) for i in encoder.classes_]))
        f.write('\n')
        for i, probs in zip(ids, y_prob):
            probas = ','.join([i] + [str(p) for p in probs.tolist()])
            f.write(probas)
            f.write('\n')
    print("Wrote submission to file {}.".format(fname))


def _build_model(input_dim, n_classes, n_hidden=3, units=512, dropout=0.5):
    """Build and compile the MLP: ``n_hidden`` Dense/PReLU/BatchNorm/Dropout
    stacks followed by a softmax output layer, trained with Adam."""
    model = Sequential()
    for layer_idx in range(n_hidden):
        if layer_idx == 0:
            # Only the first layer declares the input shape.
            model.add(Dense(units, input_shape=(input_dim,)))
        else:
            model.add(Dense(units))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(dropout))
    model.add(Dense(n_classes))
    model.add(Activation('softmax'))
    # The original built explicit Adam/SGD objects here but never passed
    # them to compile(); the string "adam" is what was actually used.
    model.compile(loss='categorical_crossentropy', optimizer="adam")
    return model


print("Loading data...")
X, labels = load_data(path+'train.csv', train=True)
#X=np.log(X+1)
#X=np.sqrt(X+(3/8))
X, scaler = preprocess_data(X)
y, encoder = preprocess_labels(labels)

X_test, ids = load_data(path+'test.csv', train=False)
#X_test=np.log(X_test+1)
#X_test=np.sqrt(X_test+(3/8))
# Reuse the scaler fitted on the training data.
X_test, _ = preprocess_data(X_test, scaler)

nb_classes = y.shape[1]
print(nb_classes, 'classes')

dims = X.shape[1]
print(dims, 'dims')

sample = pd.read_csv(path+'sampleSubmission.csv')

N = X.shape[0]
trainId = np.arange(N)
# Holds the out-of-fold predictions; columns are the sample file's columns
# after its first one.
submissionTr = pd.DataFrame(index=trainId, columns=sample.columns[1:])

nfold = 5
n_bag = 10
# NOTE: RND is not used below, but the call draws from the seeded global RNG;
# it is kept so that later random draws are unchanged.
RND = np.random.randint(0, 10000, nfold)
# Buffers are sized from the data instead of a hard-coded class count.
pred = np.zeros((X_test.shape[0], nb_classes))
score = np.zeros(nfold)

# Make sure the checkpoint directory exists before ModelCheckpoint writes.
weights_dir = path + "tmp/"
if not os.path.isdir(weights_dir):
    os.makedirs(weights_dir)
weights_file = weights_dir + "weights.hdf5"

skf = StratifiedKFold(labels, nfold, random_state=1337)
for i, (tr, te) in enumerate(skf):
    X_train, X_valid, y_train, y_valid = X[tr], X[te], y[tr], y[te]
    # Accumulates validation-fold predictions across the bags of this fold.
    predTr = np.zeros((X_valid.shape[0], nb_classes))
    for j in range(n_bag):
        print('nfold: ', i, '/', nfold, ' n_bag: ', j, ' /', n_bag)
        print("Building model...")
        model = _build_model(dims, nb_classes)

        print("Training model...")
        earlystopping = EarlyStopping(monitor='val_loss', patience=10,
                                      verbose=1)
        checkpointer = ModelCheckpoint(filepath=weights_file, verbose=0,
                                       save_best_only=True)
        model.fit(X_train, y_train, nb_epoch=1000, batch_size=128, verbose=2,
                  validation_data=(X_valid, y_valid),
                  callbacks=[earlystopping, checkpointer])
        # Reload the weights saved by the checkpoint callback
        # (save_best_only=True) before predicting.
        model.load_weights(weights_file)

        print("Generating submission...")
        pred += model.predict_proba(X_test)
        predTr += model.predict_proba(X_valid)

    predTr /= n_bag
    submissionTr.iloc[te] = predTr
    score[i] = log_loss(y_valid, predTr, eps=1e-15, normalize=True)
    print(score[i])

# Every fold/bag model contributed one test prediction.
pred /= (nfold*n_bag)
print("ave: " + str(np.average(score)) + " stddev: " + str(np.std(score)))
make_submission(pred, ids, encoder, fname=path+'kerasNN3.csv')

# Log loss of the out-of-fold predictions over all training rows.
print(log_loss(labels, submissionTr.values, eps=1e-15, normalize=True))
submissionTr.to_csv(path+"kerasNN3_retrain.csv", index_label='id')

# nfold 2: 0.520287 + 0.00190431
# nfold 3: 0.502752 + 0.004984166
# nfold 3, bagging 5: 0.48422 + 0.00637037
# nfold 3, bagging 10, 0.481189 + 0.0050939
# nfold 5, bagging 5: 0.4743325 + 0.01887
# nfold 5, bagging 10: 0.4723626 + 0.0120412