python source code of insurance_tf_nn_classification

#make the optimizer the gini_normalized function so that the nn
#feedback is based on the right thing! this will optimize for
#the gini, as opposed for the ability to make correct preditions
#it isn't correctness we are after, but rather the correct
#order of the claims relative to one another


import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def gini_tf(actual, pred):
	"""a tensorflow version of normalized gini coefficient, 
	uses in top k to calculate """
	assert (len(actual) == len(pred))
	n = int(actual.get_shape()[-1])
	indices = tf.reverse(tf.nn.top_k(pred, k=n)[1], axis=0)
	a_s = tf.gather(actual, indices)
	a_c = tf.cumsum(a_s)
	giniSum = tf.reduce_sum(a_c) / tf.reduce_sum(a_s)
	giniSum -= (n + 1) / 2.
	return giniSum / n

def gini(actual, pred, cmpcol = 0, sortcol = 1):
	assert( len(actual) == len(pred) )
	all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
	all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
	totalLosses = all[:,0].sum()
	giniSum = all[:,0].cumsum().sum() / totalLosses

	giniSum -= (len(actual) + 1) / 2.
	return giniSum / len(actual)
 
def gini_normalized(a, p):
	return gini(a, p) / gini(a, a)
 
# Load the data

test_dat = pd.read_csv('test.csv')
train_dat = pd.read_csv('train.csv')
submission = pd.read_csv('sample_submission.csv')


train_dat_0s = train_dat[train_dat['target'] == 0]


train_dat_1s = train_dat[train_dat['target'] == 1]
rep_1 =[train_dat_1s for x in range(train_dat_0s.shape[0]//train_dat_1s.shape[0] )]
keep_1s = pd.concat(rep_1, axis=0)

train_dat = pd.concat([keep_1s,train_dat_0s],axis=0)


train_y = train_dat['target'].as_matrix()
train_x = train_dat.drop(['target', 'id'], axis = 1)
test_dat = test_dat.drop(['id'], axis = 1)

merged_dat = pd.concat([train_x, test_dat],axis=0)

cat_features = [col for col in merged_dat.columns if col.endswith('cat')]
for column in cat_features:
	temp=pd.get_dummies(pd.Series(merged_dat[column]))
	merged_dat=pd.concat([merged_dat,temp],axis=1)
	merged_dat=merged_dat.drop([column],axis=1)

numeric_features = [col for col in merged_dat.columns if '_calc_' in  str(col)]
numeric_features = [col for col in numeric_features if '_bin' not in str(col)]

scaler = StandardScaler()
scaled_numerics = scaler.fit_transform(merged_dat[numeric_features])
scaled_num_df = pd.DataFrame(scaled_numerics, columns =numeric_features )


merged_dat = merged_dat.drop(numeric_features, axis=1)


merged_dat = np.concatenate((merged_dat.values,scaled_num_df), axis = 1)


train_x = merged_dat[:train_x.shape[0]]
test_dat = merged_dat[train_x.shape[0]:]


train_x = train_x.astype(np.float32)
test_dat = test_dat.astype(np.float32)

train_x.shape
#227 predictors


#for stability
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)


reset_graph()

num_inputs = train_x.shape[1]
learning_rate = 0.1
num_classes = 2
n_hidden1 = 100
n_hidden2 = 400
n_hidden3 = 200
n_hidden4 = 100
dropout = 0.75
n_epochs = 1500


X = tf.placeholder(tf.float32, shape=(None, num_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")


with tf.variable_scope('ClassNet'):

	he_init = tf.contrib.layers.variance_scaling_initializer()

	training = tf.placeholder_with_default(False, shape=(), name='training')

	hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,
							  kernel_initializer=he_init, name="hidden1")

	bn1 = tf.layers.batch_normalization(hidden1, training = training, momentum = 0.9)

	hidden2 = tf.layers.dense(bn1, n_hidden2, activation=tf.nn.relu,
							  kernel_initializer=he_init, name="hidden2")

	hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu,
							  kernel_initializer=he_init, name="hidden3")

	hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu,
							  kernel_initializer=he_init, name="hidden4")

	fc1 = tf.layers.dropout(hidden4, rate=dropout)

	logits = tf.layers.dense(fc1, num_classes, activation=tf.nn.sigmoid)
	
	#make the number of classes == 1 and see how this affects the scoring?
	#the learning rate could also be upped to .1
	#seems to be working at improving the GINI with the sigmoid final, but it is
	#improving at a very very slow rate

with tf.name_scope("loss"):
	xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
	loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
	optimizer = tf.train.GradientDescentOptimizer(learning_rate)
	training_op = optimizer.minimize(loss)

"""
with tf.name_scope("loss"):	
	gini = gini_tf(y, outputs)
	loss = tf.reduce_mean(gini, name="loss")

with tf.name_scope("train"):
	optimizer = tf.train.GradientDescentOptimizer(learning_rate)
	training_op = optimizer.minimize(loss)

"""


#this is the one to switch, want it based off of gini not the xentropy
#look at the docs to confirm this
#the training_op = optimizer.minimize(loss) below should then
#be changed to maximize(gini)


with tf.name_scope("eval"):
	correct = tf.nn.in_top_k(logits, y, 1)
	accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))


init = tf.global_variables_initializer()
saver = tf.train.Saver()


X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2)


#batch_size = 100
#X_batch, y_batch = tf.train.batch([X_train, y_train], batch_size=100)
#for iteration in range(X_train.shape[0] // batch_size):		


with tf.Session() as sess:
	init.run()
	for epoch in range(n_epochs):
		sess.run(training_op, feed_dict={X: X_train, y: y_train})	
		acc_train = accuracy.eval(feed_dict={X: X_train, y: y_train})
		acc_test = accuracy.eval(feed_dict={X: X_val,
											y: y_val})

		###below is the new GINI test.
		prob_test = logits.eval(feed_dict={X: X_val,
								y: y_val})
		#switched from outputs to logits
		gini_n = gini_normalized(y_val, prob_test[:,1])

		print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test, 
			"\nGINI NORM:", gini_n)
	
	save_path = saver.save(sess, "./cams_model_final.ckpt")




#make external predictions on the test_dat
with tf.Session() as sess:
    saver.restore(sess, "./cams_model_final.ckpt") # or better, use save_path
    Z = logits.eval(feed_dict={X: test_dat}) #switched from outputs to logits
    y_pred = Z[:,1]



dnn_output = submission
dnn_output['target'] = y_pred

dnn_output.to_csv('tf_dnn_predictions_upsample.csv', index=False, float_format='%.10f')