#!/usr/bin/python ''' sequential backward selection In order to reduce the complexity of the model, the number of features can be reduced by Sequential Backward Selection Th e13b dataset has 20 column sums, one for each column in the original images. Only a few of these would be needed to produce a good fit. The SBS algorithm removes features by repeatedly running a fit of the data, selecting the feature for removal that makes the least difference to the accuracy of the fit. Created on Jun 23, 2016 from Python Machine Learning by Sebastian Raschka under the following license The MIT License (MIT) Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. @author: richard lyman ''' import numpy as np import ocr_utils import matplotlib.pyplot as plt y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=range(0,20), nChars=1000, random_state=0) from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=0) from sklearn.preprocessing import StandardScaler stdsc = StandardScaler() X_train_std = stdsc.fit_transform(X_train) X_test_std = stdsc.transform(X_test) sc = StandardScaler() sc.fit(X_train) X_train_std = sc.transform(X_train) X_test_std = sc.transform(X_test) X_combined_std = np.vstack((X_train_std, X_test_std)) y_combined = np.hstack((y_train, y_test)) from sklearn.base import clone from itertools import combinations from sklearn.metrics import accuracy_score class SBS(): def __init__(self, estimator, k_features, scoring=accuracy_score, test_size=0.25, random_state=1): self.scoring = scoring self.estimator = clone(estimator) self.k_features = k_features self.test_size = test_size self.random_state = random_state def fit(self, X, y): X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=self.test_size, random_state=self.random_state) dim = X_train.shape[1] self.indices_ = tuple(range(dim)) self.subsets_ = [self.indices_] score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_) self.scores_ = [score] while dim > self.k_features: scores = [] subsets = [] for p in combinations(self.indices_, r=dim-1): score = self._calc_score(X_train, y_train, X_test, y_test, p) scores.append(score) subsets.append(p) best = np.argmax(scores) self.indices_ = subsets[best] self.subsets_.append(self.indices_) dim -= 1 self.scores_.append(scores[best]) self.k_score_ = self.scores_[-1] return self def transform(self, X): return X[:, self.indices_] def _calc_score(self, X_train, y_train, X_test, y_test, indices): self.estimator.fit(X_train[:, indices], y_train) y_pred = self.estimator.predict(X_test[:, indices]) score = self.scoring(y_test, y_pred) return score from sklearn.neighbors import KNeighborsClassifier knn = KNeighborsClassifier(n_neighbors=2) # selecting features sbs = SBS(knn, k_features=1) sbs.fit(X_train_std, y_train) # plotting performance of feature subsets k_feat = [len(k) for k in sbs.subsets_] title='Sequential Backward Selection' plt.plot(k_feat, sbs.scores_, marker='o') plt.ylim([0.7, 1.1]) plt.ylabel('Accuracy') plt.xlabel('Number of features') plt.grid() plt.title(title) plt.tight_layout() ocr_utils.show_figures(plt,title) best=10 k5 = list(sbs.subsets_[best]) print('The best {} column_sums'.format(best)) for s in k5: print(labels[s]) print() knn.fit(X_train_std, y_train) print('Training accuracy using all features:', knn.score(X_train_std, y_train)) print('Test accuracy using all features:', knn.score(X_test_std, y_test)) knn.fit(X_train_std[:, k5], y_train) print('Training accuracy using {} features:'.format(best), knn.score(X_train_std[:, k5], y_train)) print('Test accuracy using {} features:'.format(best), knn.score(X_test_std[:, k5], y_test)) print ('\n########################### No Errors ####################################')