""" Code shared between all models for Otto competition. """ import gc import numpy as np import os import pandas as pd from sklearn.calibration import CalibratedClassifierCV from sklearn.cross_validation import StratifiedKFold, StratifiedShuffleSplit from sklearn.metrics import log_loss import consts def load_data(path_train=consts.DATA_TRAIN_PATH, path_test=consts.DATA_TEST_PATH): train = pd.read_csv(path_train) train_labels = [int(v[-1])-1 for v in train.target.values] train_ids = train.id.values train = train.drop('id', axis=1) train = train.drop('target', axis=1) test = pd.read_csv(path_test) test_ids = test.id.values test = test.drop('id', axis=1) return np.array(train, dtype=float), np.array(train_labels), np.array(test, dtype=float),\ np.array(train_ids), np.array(test_ids) def make_blender_cv(classifier, x, y, calibrate=False): skf = StratifiedKFold(y, n_folds=5, random_state=23) scores, predictions = [], None for train_index, test_index in skf: if calibrate: # Make training and calibration calibrated_classifier = CalibratedClassifierCV(classifier, method='isotonic', cv=get_cv(y[train_index])) fitted_classifier = calibrated_classifier.fit(x[train_index, :], y[train_index]) else: fitted_classifier = classifier.fit(x[train_index, :], y[train_index]) preds = fitted_classifier.predict_proba(x[test_index, :]) # Free memory calibrated_classifier, fitted_classifier = None, None gc.collect() scores.append(log_loss(y[test_index], preds)) predictions = np.append(predictions, preds, axis=0) if predictions is not None else preds return scores, predictions def write_blender_data(path, file_name, predictions): file_path = os.path.join(path, file_name) np.savetxt(file_path, predictions, delimiter=',', fmt='%.5f') def save_submission(path_sample_submission, output_file_path, predictions): sample = pd.read_csv(path_sample_submission) submission = pd.DataFrame(predictions, index=sample.id.values, columns=sample.columns[1:]) submission.to_csv(output_file_path, index_label='id') def stratified_split(x, y, test_size=0.2): strat_shuffled_split = StratifiedShuffleSplit(y, n_iter=1, test_size=test_size, random_state=23) train_index, valid_index = [s for s in strat_shuffled_split][0] x_train, y_train, x_valid, y_valid = x[train_index, :], y[train_index], x[valid_index, :], y[valid_index] return x_train, y_train, x_valid, y_valid def hold_out_evaluation(classifier, x, y, test_size=0.2, calibrate=False): x_train, y_train, x_valid, y_valid = stratified_split(x, y, test_size) # Train if calibrate: # Make training and calibration calibrated_classifier = CalibratedClassifierCV(classifier, method='isotonic', cv=get_cv(y_train)) fitted_classifier = calibrated_classifier.fit(x_train, y_train) else: fitted_classifier = classifier.fit(x_train, y_train) # Evaluate score = log_loss(y_valid, fitted_classifier.predict_proba(x_valid)) return score def get_prediction_files(): return ['model_%s.csv' % f for f in consts.PREDICTION_FILES] def get_cv(y, n_folds=5): return StratifiedKFold(y, n_folds=n_folds, random_state=23)