# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import csv

# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# We are going to create a feature from each name. There seems to be a correlation
# between the title of each person and whether or not they survived.  So let's
# turn the name column into just a number representing the person's title.
# No title = -1, "Master" = 0, "Mr" = 1, etc.
def parseName(name):
  out = -1
  names = ["Master.", "Mr.", "Dona.", "Miss.", "Mrs.", "Dr.", "Rev.", "Col.", "Ms.", "Capt.", "Mlle.", "Major.", "Mme."]
  for i, n in enumerate(names):
    if n in name:
      out = i
      break
  return out
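# Illustrative examples (assuming names are formatted like the Kaggle data):
#   parseName("Braund, Mr. Owen Harris")  -> 1   ("Mr." is at index 1)
#   parseName("Heikkinen, Miss. Laina")   -> 3   ("Miss." is at index 3)
#   parseName("Smith, John")              -> -1  (no recognized title)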

# The data from Kaggle needs some cleaning
def cleanData(data):
  # If a fare is missing (or recorded as 0), replace it with the mean fare for that
  # passenger's class
  data.Fare = data.Fare.map(lambda x: np.nan if x == 0 else x)
  classmeans = data.pivot_table(values='Fare', index='Pclass', aggfunc='mean')['Fare']
  data.Fare = data[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1)
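  # (classmeans is a Series indexed by Pclass, so classmeans[x['Pclass']] above looks up
  # the mean fare for that passenger's class)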

  # Turn names into a number representing titles
  data.Name = data.Name.map(lambda x: parseName(x))

  # Convert sex into a numeric value
  data.Sex = data.Sex.apply(lambda sex: 0 if sex == "male" else 1)

  return data


# Load training and test data sets, cleaning them in the process
train = cleanData(pd.read_csv("train.csv"))
test = cleanData(pd.read_csv("test.csv"))

# Pick out the four columns we care about and split the training set into features (X)
# and labels (y)
cols = ["Fare", "Pclass", "Sex", "Name"]
X = train[cols].values
y = train['Survived'].values

# An SVC trains best on standardized features (zero mean, unit variance).
# Standardizing won't negatively affect any of the other classifiers.
scaler = StandardScaler()
X = scaler.fit_transform(X)
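# Optional sanity check: after scaling, each column should have mean ~0 and std ~1.
# print(X.mean(axis=0), X.std(axis=0))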

# Create the classifiers
clf1 = ExtraTreesClassifier(n_estimators=200, max_depth=None, min_samples_split=2, random_state=0)
clf2 = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, random_state=0)
clf3 = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
clf4 = AdaBoostClassifier(n_estimators=500)
clf5 = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0, max_depth=1, random_state=0)
clf6 = SVC(C=100, gamma=0.2, kernel='rbf', probability=True)

clfs = [clf1, clf2, clf3, clf4, clf5, clf6]

# Fit each classifier based on the training data
for clf in clfs:
  clf.fit(X, y)
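# Optional: before trusting the ensemble, each model's accuracy could be estimated with
# cross-validation (a sketch, not part of the original pipeline):
# from sklearn.model_selection import cross_val_score
# for clf in clfs:
#   print(clf.__class__.__name__, cross_val_score(clf, X, y, cv=5).mean())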

# Now create features from the test set
X = test[cols].values
X = scaler.transform(X)

# For all 6 classifiers, predict outputs and save the probabilities of each prediction
predictions = []

for clf in clfs:
  predictions.append(clf.predict_proba(X))

# Now we have six arrays of class probabilities in a list. Average them element-wise
# to get a single ensemble prediction.
# Note: There are smarter ways to combine models, but this works fairly well.
p = np.mean(predictions, axis=0)
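# One alternative (a sketch, not used here): hard majority voting instead of averaging
# the predicted probabilities.
# votes = np.array([clf.predict(X) for clf in clfs])        # shape (n_clfs, n_samples)
# p_vote = (votes.sum(axis=0) > len(clfs) / 2).astype(int)  # 1 if most classifiers say "survived"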

# Each row of p is (prob_died, prob_survived), following clf.classes_ for labels 0 and 1.
# Turn each row into a 0 or 1 based on the survival probability.
p = [1 if probs[1] >= 0.5 else 0 for probs in p]
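# (Roughly equivalently, one could take np.argmax over each row of the averaged
# probabilities instead of thresholding at 0.5.)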

# Now we have a prediction for each item of 0 or 1. Just write the result to a
# csv file in the format that Kaggle wants
with open('predictions.csv', 'w', newline='') as csvfile:
  w = csv.writer(csvfile)
  w.writerow(["PassengerId", "Survived"])

  for i in range(len(p)):
    w.writerow([test.PassengerId[i], p[i]])
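# Alternatively (a sketch using pandas instead of the csv module):
# pd.DataFrame({"PassengerId": test.PassengerId, "Survived": p}).to_csv("predictions.csv", index=False)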