""" All rights reserved. -- Yang Song. """ import os import pickle from copy import deepcopy import pandas as pd import numpy as np from abc import ABCMeta, abstractmethod from sklearn.svm import SVC from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.neural_network import MLPClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import LogisticRegression from FAE.DataContainer.DataContainer import DataContainer from Utility.EcLog import eclog from Utility.Constants import * from FAE.HyperParameterConfig.HyperParamManager import RANDOM_SEED class Classifier: """ This is the base class of the classifer. All the specific classifier need to be artributed from this base class. """ def __init__(self): self.__model = None self._x = np.array([]) self._y = np.array([]) self._data_container = DataContainer() self.logger = eclog(os.path.split(__file__)[-1]).GetLogger() def __deepcopy__(self): copy_classifier = type(self)() copy_classifier._data_container = deepcopy(self._data_container) copy_classifier._x, copy_classifier._y = deepcopy(self._x), deepcopy(self._y) copy_classifier.SetModel(deepcopy(self.__model)) return copy_classifier def SetDataContainer(self, data_container): data = data_container.GetArray() label = data_container.GetLabel() try: assert(data.shape[0] == label.shape[0]) if data.ndim == 1: data = data[..., np.newaxis] self._data_container = data_container self._x = data self._y = label except Exception as e: content = 'The case number is not same to the label number: ' self.logger.error('{}{}'.format(content, str(e))) print('{} \n{}'.format(content, e.__str__())) def SetData(self, data, label): try: assert(data.shape[0] == label.shape[0]) if data.ndim == 1: data = data[..., np.newaxis] self._x = data self._y = label except Exception as e: content = 'The case number is not same to the label number: ' self.logger.error('{}{}'.format(content, str(e))) print('{} \n{}'.format(content, e.__str__())) def SetModel(self, model): self.__model = model def GetModel(self): return self.__model def SetModelParameter(self, param): self.__model.set_params(**param) def Fit(self): self.__model.fit(self._x, self._y) def GetDescription(self): text = "We did not use any classifier. " return text def Predict(self, x): return self.__model.predict(x) def Save(self, store_path): if os.path.isdir(store_path): store_path = os.path.join(store_path, 'model.pickle') if store_path[-7:] != '.pickle': print('Check the store path. ') else: with open(store_path, 'wb') as f: pickle.dump(self.__model, f) def Load(self, store_path): if os.path.isdir(store_path): store_path = os.path.join(store_path, 'model.pickle') if store_path[-7:] != '.pickle': print('Check the store path. ') else: with open(store_path, 'rb') as f: self.__model = pickle.load(f) @abstractmethod def GetName(self): pass class SVM(Classifier): def __init__(self, **kwargs): super(SVM, self).__init__() if 'kernel' not in kwargs.keys(): kwargs['kernel'] = 'linear' if 'C' not in kwargs.keys(): kwargs['C'] = 1.0 if 'probability' not in kwargs.keys(): kwargs['probability'] = True super(SVM, self).SetModel(SVC(random_state=RANDOM_SEED[CLASSIFIER_SVM], **kwargs)) self.__name = 'SVM_' + kwargs['kernel'] + '_C_' + '{:.3f}'.format(kwargs['C']) def GetName(self): return CLASSIFIER_SVM def Predict(self, x, is_probability=True): if is_probability: return super(SVM, self).GetModel().predict_proba(x)[:, 1] else: return super(SVM, self).Predict(x) def GetDescription(self): text = "We used support vector machine (SVM) as the classifier. SVM was an effective and robust classifier " \ "to build the model. The kernel function has the ability to map the features into a higher dimension " \ "to search the hyper-plane for separating the cases with different labels. Here we used the linear " \ "kernel function because it was easier to explain the coefficients of the features for the final model. " return text def Save(self, store_folder): if not os.path.isdir(store_folder): print('The store function of SVM must be a folder path') return # Save the coefficients try: coef_path = os.path.join(store_folder, 'SVM_coef.csv') df = pd.DataFrame(data=np.transpose(self.GetModel().coef_), index=self._data_container.GetFeatureName(), columns=['Coef']) df.to_csv(coef_path) except Exception as e: content = 'SVM with specific kernel does not give coef: ' self.logger.error('{}{}'.format(content, str(e))) print('{} \n{}'.format(content, e.__str__())) # Save the intercept_ try: intercept_path = os.path.join(store_folder, 'SVM_intercept.csv') intercept_df = pd.DataFrame(data=self.GetModel().intercept_.reshape(1, 1), index=['intercept'], columns=['value']) intercept_df.to_csv(intercept_path) except Exception as e: content = 'SVM with specific kernel does not give intercept: ' self.logger.error('{}{}'.format(content, str(e))) print('{} \n{}'.format(content, e.__str__())) super(SVM, self).Save(store_folder) class LDA(Classifier): def __init__(self, **kwargs): super(LDA, self).__init__() super(LDA, self).SetModel(LinearDiscriminantAnalysis(**kwargs)) def GetName(self): return 'LDA' def Predict(self, x, is_probability=True): if is_probability: return super(LDA, self).GetModel().predict_proba(x)[:, 1] else: return super(LDA, self).Predict(x) def GetDescription(self): text = "We used linear discriminant analysis (LDA) as the classifier. LDA was an linear classifier by " \ "fitting class conditional densities to the data and using Bayes’rule. " return text def Save(self, store_path): if not os.path.isdir(store_path): print('The store function of LDA must be a folder path') return # Save the coefficients try: coef_path = os.path.join(store_path, 'LDA_coef.csv') df = pd.DataFrame(data=np.transpose(self.GetModel().coef_), index=self._data_container.GetFeatureName(), columns=['Coef']) df.to_csv(coef_path) except Exception as e: content = 'LDA with specific kernel does not give coef: ' self.logger.error('{}{}'.format(content, str(e))) print('{} \n{}'.format(content, e.__str__())) super(LDA, self).Save(store_path) class RandomForest(Classifier): def __init__(self, **kwargs): super(RandomForest, self).__init__() if 'n_estimators' not in kwargs.keys(): super(RandomForest, self).SetModel(RandomForestClassifier(random_state=RANDOM_SEED[CLASSIFIER_RF], n_estimators=200, **kwargs)) else: super(RandomForest, self).SetModel(RandomForestClassifier(random_state=RANDOM_SEED[CLASSIFIER_RF], **kwargs)) def GetName(self): return CLASSIFIER_RF def GetDescription(self): text = "We used random forest as the classifier. Random forest is an ensemble learning method which " \ "combining multiple decision trees at different subset of the training data set. Random forest " \ "is an effective method to avoid over-fitting. " return text def Predict(self, x, is_probability=True): if is_probability: return super(RandomForest, self).GetModel().predict_proba(x)[:, 1] else: return super(RandomForest, self).Predict(x) class AE(Classifier): def __init__(self, **kwargs): super(AE, self).__init__() if 'early_stopping' not in kwargs.keys(): kwargs['early_stopping'] = True super(AE, self).SetModel(MLPClassifier(random_state=RANDOM_SEED[CLASSIFIER_AE], **kwargs)) def GetName(self): return CLASSIFIER_AE def GetDescription(self): text = "We used multi-layer perceptron (MLP), sometimes called auto-encoder (AE), as the classifier. " \ "MLP is based neural network with multi-hidden layers to find the mapping from inputted features " \ "to the label. Here we used 1 hidden layers with 100 hidden units. The non-linear activate function " \ "was rectified linear unit function and the optimizer was Adam with step 0.001. " return text def Predict(self, x, is_probability=True): if is_probability: return super(AE, self).GetModel().predict_proba(x)[:, 1] else: return super(AE, self).Predict(x) class AdaBoost(Classifier): def __init__(self, **kwargs): super(AdaBoost, self).__init__() super(AdaBoost, self).SetModel(AdaBoostClassifier(random_state=RANDOM_SEED[CLASSIFIER_AB], **kwargs)) def GetName(self): return CLASSIFIER_AB def GetDescription(self): text = "We used AdaBoost as the classifier. AdaBoost is a meta-algorithm that conjunct other type of " \ "algorithms and combine them to get a final output of boosted classifier. AdaBoost is sensitive to " \ "the noise and the outlier. Over-fitting can also be avoided by AdaBoost. " \ "Here we used decision tree as the base classifier. " return text def Predict(self, x, is_probability=True): if is_probability: return super(AdaBoost, self).GetModel().predict_proba(x)[:, 1] else: return super(AdaBoost, self).Predict(x) class DecisionTree(Classifier): def __init__(self, **kwargs): super(DecisionTree, self).__init__() super(DecisionTree, self).SetModel(DecisionTreeClassifier(random_state=RANDOM_SEED[CLASSIFIER_DT], **kwargs)) def GetName(self): return CLASSIFIER_DT def GetDescription(self): text = "We used decision tree as the classifier. Decision tree is a non-parametric supervised learning " \ "method and can be used for classification with high interpretation. " return text def Predict(self, x, is_probability=True): if is_probability: return super(DecisionTree, self).GetModel().predict_proba(x)[:, 1] else: return super(DecisionTree, self).Predict(x) class GaussianProcess(Classifier): def __init__(self, **kwargs): super(GaussianProcess, self).__init__() super(GaussianProcess, self).SetModel(GaussianProcessClassifier( random_state=RANDOM_SEED[CLASSIFIER_GP], **kwargs)) def GetName(self): return CLASSIFIER_GP def GetDescription(self): text = "We used Gaussian process as the classifier. Gaussian process combines the features to build a joint " \ "distribution to estimate the probability of the classification. " return text def Predict(self, x, is_probability=True): if is_probability: return super(GaussianProcess, self).GetModel().predict_proba(x)[:, 1] else: return super(GaussianProcess, self).Predict(x) class NaiveBayes(Classifier): def __init__(self, **kwargs): super(NaiveBayes, self).__init__() super(NaiveBayes, self).SetModel(GaussianNB(**kwargs)) def GetName(self): return 'NB' def GetDescription(self): text = "We used naive Bayes as the classifier. Naive Bayes is a kind of probabilistic classifiers " \ "based on Bayes theorem. Naive Bayes requires number of parameters linear in the number of features. " return text def Predict(self, x, is_probability=True): if is_probability: return super(NaiveBayes, self).GetModel().predict_proba(x)[:, 1] else: return super(NaiveBayes, self).Predict(x) class LR(Classifier): def __init__(self, **kwargs): super(LR, self).__init__() if 'solver' in kwargs.keys(): super(LR, self).SetModel(LogisticRegression(penalty='none', **kwargs)) else: super(LR, self).SetModel(LogisticRegression(penalty='none', solver='saga', tol=0.01, random_state=RANDOM_SEED[CLASSIFIER_LR], **kwargs)) def GetName(self): return CLASSIFIER_LR def GetDescription(self): text = "We used logistic regression as the classifier. Logistic regression is a linear classifier that " \ "combines all the features. A hyper-plane was searched in the high dimension to separate the samples. " return text def Predict(self, x, is_probability=True): if is_probability: return super(LR, self).GetModel().predict_proba(x)[:, 1] else: return super(LR, self).Predict(x) def Save(self, store_path): if not os.path.isdir(store_path): print('The store function of SVM must be a folder path') return # Save the coefficients try: coef_path = os.path.join(store_path, 'LR_coef.csv') df = pd.DataFrame(data=np.transpose(self.GetModel().coef_), index=self._data_container.GetFeatureName(), columns=['Coef']) df.to_csv(coef_path) except Exception as e: content = 'LR can not load coef: ' self.logger.error('{}{}'.format(content, str(e))) print('{} \n{}'.format(content, e.__str__())) try: intercept_path = os.path.join(store_path, 'LR_intercept.csv') intercept_df = pd.DataFrame(data=self.GetModel().intercept_.reshape(1, 1), index=['intercept'], columns=['value']) intercept_df.to_csv(intercept_path) except Exception as e: content = 'LR can not load intercept: ' self.logger.error('{}{}'.format(content, str(e))) print('{} \n{}'.format(content, e.__str__())) super(LR, self).Save(store_path) class LRLasso(Classifier): def __init__(self, **kwargs): super(LRLasso, self).__init__() if 'solver' in kwargs.keys(): super(LRLasso, self).SetModel(LogisticRegression(penalty='l1', **kwargs)) else: super(LRLasso, self).SetModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_SEED[CLASSIFIER_LRLasso], **kwargs)) def GetName(self): return CLASSIFIER_LRLasso def GetDescription(self): text = "We used logistic regression with LASSO constrain as the classifier. Logistic regression with LASSO " \ "constrain is a linear classifier based on logistic regression. L1 norm is added in the final lost " \ "function and the weights was constrained, which make the features sparse. " return text def Predict(self, x, is_probability=True): if is_probability: return super(LRLasso, self).GetModel().predict_proba(x)[:, 1] else: return super(LRLasso, self).Predict(x) def Save(self, store_path): if not os.path.isdir(store_path): print('The store function of SVM must be a folder path') return # Save the coefficients try: coef_path = os.path.join(store_path, 'LRLasso_coef.csv') df = pd.DataFrame(data=np.transpose(self.GetModel().coef_), index=self._data_container.GetFeatureName(), columns=['Coef']) df.to_csv(coef_path) except Exception as e: content = 'LASSO can not load coef: ' self.logger.error('{}{}'.format(content, str(e))) print('{} \n{}'.format(content, e.__str__())) try: intercept_path = os.path.join(store_path, 'LRLasso_intercept.csv') intercept_df = pd.DataFrame(data=self.GetModel().intercept_.reshape(1, 1), index=['intercept'], columns=['value']) intercept_df.to_csv(intercept_path) except Exception as e: content = 'LASSO can not load intercept: ' self.logger.error('{}{}'.format(content, str(e))) print('{} \n{}'.format(content, e.__str__())) super(LRLasso, self).Save(store_path) if __name__ == '__main__': X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) y = np.array([1, 1, 0, 0]) clf = SVM() clf.SetData(X, y) clf.Fit() print(clf.GetName(), clf.Predict([[1, 1]])) clf = AE() clf.SetData(X, y) clf.Fit() print(clf.GetName(), clf.Predict([[1, 1]])) clf = RandomForest() clf.SetData(X, y) clf.Fit() print(clf.GetName(), clf.Predict([[1, 1]])) clf = LDA() clf.SetData(X, y) clf.Fit() print(clf.GetName(), clf.Predict([[1, 1]])) clf = AdaBoost() clf.SetData(X, y) clf.Fit() print(clf.GetName(), clf.Predict([[1, 1]])) clf = DecisionTree() clf.SetData(X, y) clf.Fit() print(clf.GetName(), clf.Predict([[1, 1]])) clf = GaussianProcess() clf.SetData(X, y) clf.Fit() print(clf.GetName(), clf.Predict([[1, 1]])) clf = NaiveBayes() clf.SetData(X, y) clf.Fit() print(clf.GetName(), clf.Predict([[1, 1]])) clf = LR() clf.SetData(X, y) clf.Fit() print(clf.GetName(), clf.Predict([[1, 1]])) clf = LRLasso() clf.SetData(X, y) clf.Fit() print(clf.GetName(), clf.Predict([[1, 1]]))