# -*- coding: utf-8 -*-

import os
import io
import inspect
import warnings
import argparse
import numpy as np
import multiprocessing
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from jinja2 import Environment, FileSystemLoader
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.metrics import classification_report, make_scorer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Ridge, SGDRegressor,\
    SGDClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)

from .algorithm import Algorithm
from .data import Data
from .metrics import f1_weighted
from .clustering import Clustering


def _scoring_name(scoring):
    """Return a printable name for a scoring specification.

    Strings are returned unchanged and plain functions yield their
    ``__name__``.  Objects without ``__name__`` (e.g. the result of
    ``make_scorer``) fall back to ``repr`` instead of raising
    ``AttributeError``.
    """
    if isinstance(scoring, str):
        return scoring
    name = getattr(scoring, '__name__', None)
    return repr(scoring) if name is None else name


class MALSS(object):
    """Algorithm selection, parameter tuning and reporting helper."""

    def __init__(self, task=None, shuffle=True, standardize=True, scoring=None,
                 cv=5, n_jobs=-1, random_state=0, lang='en', verbose=True,
                 interactive=False, min_clusters=2, max_clusters=10):
        """
        Initialize parameters.

        Parameters
        ----------
        task : string
            Specifies the task of the analysis. It must be one of
            'classification', 'regression', and 'clustering'.
        shuffle : boolean, optional (default=True)
            Whether to shuffle the data.
        standardize : boolean, optional (default=True)
            Whether to standardize the data.
        scoring : string, callable or None, optional, default: None
            A string (see scikit-learn's model evaluation documentation) or
            a scorer callable object / function with signature
            scorer(estimator, X, y).
            neg_mean_squared_error (for regression task) or
            f1_weighted (for classification task) is used by default.
        cv : integer or cross-validation generator.
            If an integer is passed, it is the number of folds (default 5).
            K-fold cv (for regression task) or Stratified k-fold cv
            (for classification task) is used by default.
            Specific cross-validation objects can be passed, see
            sklearn.model_selection module for the list of possible objects.
        min_clusters : integer (default=2).
            Minimum number of search conditions of the number of clusters.
            This number is used for only clustering task.
        max_clusters : integer (default=10).
            Maximum number of search conditions of the number of clusters.
            This number is used for only clustering task.
        n_jobs : integer, optional (default=-1)
            The number of jobs to run in parallel. If -1, then the number of
            jobs is set to the number of cores - 1.
        random_state : int seed, RandomState instance, or None (default=0)
            The seed of the pseudo random number generator
        lang : string (default='en')
            Specifies the language in the report. It must be one of
            'en' (English), 'jp' (Japanese).
        verbose : boolean, default: True
            Enable verbose output.
        interactive : boolean, default: False
            Run MALSS with interactive application mode.

        Raises
        ------
        ValueError
            If `task` is None or unsupported, or `lang` is unsupported.
        """
        if interactive:
            # Interactive mode launches the GUI and never returns: the
            # process exits together with the Qt event loop.
            import sys
            from .app import App
            try:
                from PyQt5.QtWidgets import QApplication
            except ImportError:
                print('PyQt5 is required.')
                sys.exit()

            parser = argparse.ArgumentParser()
            parser.add_argument('--lang', '-l', nargs=1, choices=['en', 'jp'])
            app = QApplication(sys.argv)
            args = parser.parse_args()
            if args.lang is not None:
                lang = args.lang[0]
            App(lang=lang)
            sys.exit(app.exec_())

        self.is_ready = False
        self.shuffle = shuffle
        self.standardize = standardize

        if task is None:
            raise ValueError("Set task ('classification' or 'regression').")
        elif task == 'classification':
            if scoring is None:
                self.scoring = make_scorer(f1_weighted)
                self.scoring_name = 'f1_weighted'
            else:
                self.scoring = scoring
                self.scoring_name = _scoring_name(scoring)
        elif task == 'regression':
            self.scoring =\
                'neg_mean_squared_error' if scoring is None else scoring
            # Derive the name from the effective scoring so the default
            # is reported as 'neg_mean_squared_error' rather than None.
            self.scoring_name = _scoring_name(self.scoring)
        elif task == 'clustering':
            pass
        else:
            raise ValueError('task:%s is not supported' % task)
        self.task = task

        self.cv = cv
        if n_jobs == -1:
            # Leave one core free, but always use at least one job.
            self.n_jobs = max(multiprocessing.cpu_count() - 1, 1)
        else:
            self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        if task == 'clustering':
            # Tolerate the two bounds being passed in the wrong order.
            self.min_clusters, self.max_clusters = sorted([min_clusters,
                                                           max_clusters])
        if lang not in ('en', 'jp'):
            raise ValueError('lang:%s is no supported' % lang)
        self.lang = lang
        self.data = None
        self.algorithms = []

    def __choose_algorithm(self):
        """Return the default algorithm list for the configured task."""
        if self.task == 'classification':
            return self.__choose_algorithm_for_classification()
        if self.task == 'regression':
            return self.__choose_algorithm_for_regression()
        if self.task == 'clustering':
            return self.__choose_algorithm_for_clustering()
        raise ValueError('task:%s is not supported' % self.task)

    def __choose_algorithm_for_classification(self):
        """Pick classifiers according to the size of the fitted data."""
        algorithms = []
        n_samples, n_features = self.data.X.shape[0], self.data.X.shape[1]
        if n_samples * n_features <= 1e+06:
            if n_samples ** 2 * n_features <= 1e+09:
                algorithms.append(
                    Algorithm(
                        SVC(random_state=self.random_state, kernel='rbf'),
                        [{'C': [1, 10, 100, 1000],
                          'gamma': [1e-3, 1e-2, 1e-1, 1.0]}],
                        'Support Vector Machine (RBF Kernel)',
                        ('http://scikit-learn.org/stable/modules/'
                         'generated/sklearn.svm.SVC.html')))
                algorithms.append(
                    Algorithm(
                        RandomForestClassifier(
                            random_state=self.random_state,
                            n_estimators=500,
                            n_jobs=1),
                        [{'max_features': [0.3, 0.6, 0.9],
                          'max_depth': [3, 7, 11]}],
                        'Random Forest',
                        ('http://scikit-learn.org/stable/modules/'
                         'generated/'
                         'sklearn.ensemble.RandomForestClassifier.html')))
            else:
                algorithms.append(
                    Algorithm(
                        LinearSVC(random_state=self.random_state),
                        [{'C': [0.1, 1, 10, 100]}],
                        'Support Vector Machine (Linear Kernel)',
                        ('http://scikit-learn.org/stable/modules/generated/'
                         'sklearn.svm.LinearSVC.html')))
            algorithms.append(
                Algorithm(
                    LogisticRegression(
                        random_state=self.random_state,
                        solver='lbfgs',
                        multi_class='auto'),
                    [{'C': [0.1, 0.3, 1, 3, 10]}],
                    'Logistic Regression',
                    ('http://scikit-learn.org/stable/modules/generated/'
                     'sklearn.linear_model.LogisticRegression.html')))
            algorithms.append(
                Algorithm(
                    DecisionTreeClassifier(random_state=self.random_state),
                    [{'max_depth': [3, 5, 7, 9, 11]}],
                    'Decision Tree',
                    ('http://scikit-learn.org/stable/modules/generated/'
                     'sklearn.tree.DecisionTreeClassifier.html')))

            # Too small data doesn't suit for kNN.
            if isinstance(self.cv, int):
                num_cv = self.cv
            else:
                num_cv = self.cv.get_n_splits()
            min_nn = int(0.1 * (num_cv - 1) * n_samples / num_cv)
            # where 0.1 means smallest data size ratio of learning_curve
            # function.
            # The value of min_nn isn't accurate when cv is stratified.
            if min_nn >= 11:
                algorithms.append(
                    Algorithm(
                        KNeighborsClassifier(),
                        [{'n_neighbors':
                          list(range(2, min(20, min_nn + 1), 4))}],
                        'k-Nearest Neighbors',
                        ('http://scikit-learn.org/stable/modules/'
                         'generated/sklearn.neighbors.KNeighborsClassifier'
                         '.html')))
        else:
            algorithms.append(
                Algorithm(
                    SGDClassifier(
                        random_state=self.random_state,
                        max_iter=1000,
                        tol=1e-3,
                        n_jobs=1),
                    [{'alpha': [1e-05, 3e-05, 1e-04, 3e-04, 1e-03]}],
                    'SGD Classifier',
                    ('http://scikit-learn.org/stable/modules/generated/'
                     'sklearn.linear_model.SGDClassifier.html')))
        return algorithms

    def __choose_algorithm_for_regression(self):
        """Pick regressors according to the size of the fitted data."""
        algorithms = []
        n_samples, n_features = self.data.X.shape[0], self.data.X.shape[1]
        if n_samples * n_features <= 1e+06:
            if n_samples ** 2 * n_features <= 1e+09:
                algorithms.append(
                    Algorithm(
                        SVR(kernel='rbf'),
                        [{'C': [1, 10, 100, 1000],
                          'gamma': [1e-3, 1e-2, 1e-1, 1.0]}],
                        'Support Vector Machine (RBF Kernel)',
                        ('http://scikit-learn.org/stable/modules/'
                         'generated/sklearn.svm.SVR.html')))
                algorithms.append(
                    Algorithm(
                        RandomForestRegressor(
                            random_state=self.random_state,
                            n_estimators=500,
                            n_jobs=1),
                        [{'max_features': [0.3, 0.6, 0.9],
                          'max_depth': [3, 7, 11]}],
                        'Random Forest',
                        ('http://scikit-learn.org/stable/modules/'
                         'generated/'
                         'sklearn.ensemble.RandomForestRegressor.html')))
            algorithms.append(
                Algorithm(
                    Ridge(),
                    [{'alpha': [0.01, 0.1, 1, 10, 100]}],
                    'Ridge Regression',
                    ('http://scikit-learn.org/stable/modules/generated/'
                     'sklearn.linear_model.Ridge.html')))
            algorithms.append(
                Algorithm(
                    DecisionTreeRegressor(random_state=self.random_state),
                    [{'max_depth': [3, 5, 7, 9, 11]}],
                    'Decision Tree',
                    ('http://scikit-learn.org/stable/modules/generated/'
                     'sklearn.tree.DecisionTreeRegressor.html')))
        else:
            algorithms.append(
                Algorithm(
                    SGDRegressor(
                        random_state=self.random_state,
                        max_iter=1000,
                        tol=1e-3),
                    [{'alpha': [1e-05, 3e-05, 1e-04, 3e-04, 1e-03]}],
                    'SGD Regressor',
                    ('http://scikit-learn.org/stable/modules/generated/'
                     'sklearn.linear_model.SGDRegressor.html')))
        return algorithms

    def __choose_algorithm_for_clustering(self):
        """Delegate the choice of clustering algorithms to `Clustering`."""
        return Clustering.choose_algorithm(self.min_clusters,
                                           self.max_clusters,
                                           self.random_state)

    def add_algorithm(self, estimator, param_grid, name, link=None):
        """
        Add arbitrary scikit-learn-compatible algorithm.

        Parameters
        ----------
        estimator : object type that implements the "fit" and "predict" methods
            A object of that type is instantiated for each grid point.
        param_grid : dict or list of dictionaries
            Dictionary with parameters names (string) as keys and lists of
            parameter settings to try as values, or a list of such
            dictionaries, in which case the grids spanned by each dictionary
            in the list are explored.
            This enables searching over any sequence of parameter settings.
        name : string
            Algorithm name (used for report)
        link : string
            URL to explain the algorithm (used for report)
        """
        if self.verbose:
            print('add %s' % name)
        self.algorithms.append(Algorithm(estimator, param_grid, name, link))

    def change_params(self, identifier, param_grid):
        """
        Change parameters of an algorithm.

        Parameters
        ----------
        identifier : integer or string.
            If an integer is passed, it is the index of the algorithm in
            the list of algorithms.
            If a string is passed, it is the name of the algorithm.
        param_grid : dict or list of dictionaries
            Dictionary with parameters names (string) as keys and lists of
            parameter settings to try as values, or a list of such
            dictionaries, in which case the grids spanned by each dictionary
            in the list are explored.
            This enables searching over any sequence of parameter settings.

        Raises
        ------
        Exception
            If `identifier` is neither an integer nor a string, or if no
            algorithm has the given name.
        """
        if isinstance(identifier, int):
            self.algorithms[identifier].parameters = param_grid
            return
        if isinstance(identifier, str):
            for algorithm in self.algorithms:
                if algorithm.name == identifier:
                    algorithm.parameters = param_grid
                    return
        # Reached for an unsupported identifier type and for a name that
        # matches no algorithm; both are reported instead of being ignored.
        raise Exception('Wrong identifier')

    def remove_algorithm(self, index=-1):
        """
        Remove algorithm

        Parameters
        ----------
        index : int (default=-1)
            Remove an algorithm from list by index.
            By default, last algorithm is removed.
        """
        if self.verbose:
            print('remove %s' % self.algorithms[index].name)
        del self.algorithms[index]

    def get_algorithms(self):
        """
        Get algorithm names and grid parameters.

        Returns
        -------
        algorithms : list
            List of tuples (name, grid_params).
        """
        return [(algorithm.name, algorithm.parameters)
                for algorithm in self.algorithms]

    def fit(self, X, y=None, dname=None, algorithm_selection_only=False):
        """
        Tune parameters and search best algorithm

        Parameters
        ----------
        X : {numpy.ndarray, pandas.DataFrame}, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
        y : {numpy.ndarray, pandas.Series}, shape = [n_samples]
            Target values (class labels in classification, real numbers in
            regression)
        dname : string (default=None)
            If not None, make a analysis report in this directory.
        algorithm_selection_only : boolean, optional (default=False)
            If True, only algorithm selection is executed.
            This option is needed for (get|add|remove)_algorithm(s) methods.

        Returns
        -------
        self : object
            Returns self.  For supervised tasks with
            `algorithm_selection_only=True`, the tuple
            (transformed X, transformed y) is returned instead.

        Raises
        ------
        ValueError
            If `y` is None for a classification or regression task.
        """
        supervised = self.task in ('classification', 'regression')
        if self.task == 'clustering' and y is not None:
            print('Warning: target values y is ignored for clustering.')
        elif supervised and y is None:
            raise ValueError(f'Target values y must be set in {self.task}.')

        if self.verbose:
            print('Set data.')
        self.data = Data(self.shuffle, self.standardize, self.random_state)
        self.data.fit_transform(X, y)

        if supervised:
            return self.__fit_supervised(dname, algorithm_selection_only)
        elif self.task == 'clustering':
            return self.__fit_clustering(dname)

    def __fit_supervised(self, dname, algorithm_selection_only):
        """Select, tune and report algorithms for a supervised task."""
        if not self.is_ready:
            if self.verbose:
                print('Choose algorithms.')
            self.algorithms = self.__choose_algorithm()
            if self.verbose:
                for algorithm in self.algorithms:
                    print('    %s' % algorithm.name)
            self.is_ready = True
        else:
            # Clear results of a previous fit so stale values cannot leak
            # into the new run.
            for algorithm in self.algorithms:
                algorithm.best_score = None
                algorithm.best_params = None
                algorithm.is_best_algorithm = False
                algorithm.grid_scores = None
                algorithm.classification_report = None

        if algorithm_selection_only:
            return (self.data.X, self.data.y)

        if isinstance(self.cv, int):
            # Recent scikit-learn rejects a random_state when shuffle is
            # False; older versions ignore it, so pass it only if shuffling.
            split_seed = self.random_state if self.shuffle else None
            if self.task == 'classification':
                self.cv = StratifiedKFold(n_splits=self.cv,
                                          shuffle=self.shuffle,
                                          random_state=split_seed)
            elif self.task == 'regression':
                self.cv = KFold(n_splits=self.cv,
                                shuffle=self.shuffle,
                                random_state=split_seed)

        if self.verbose:
            print('Analyze (This will take some time).')
        self.__tune_parameters()
        if self.task == 'classification':
            self.__report_classification_result()

        if dname is not None:
            if self.verbose:
                print('Make report.')
            self.__make_report(dname)

        self.results = {'algorithms': {}}
        for algorithm in self.algorithms:
            entry = {'grid_scores': algorithm.grid_scores}
            if dname is None:
                # Learning curves are only computed here when no report
                # (which plots them itself) was requested.
                entry['learning_curve'] =\
                    self.__calc_learning_curve(algorithm)
            self.results['algorithms'][algorithm.name] = entry
            if algorithm.is_best_algorithm:
                self.results['best_algorithm'] = {
                    'estimator': algorithm.estimator,
                    'score': algorithm.best_score,
                }

        if self.verbose:
            print('Done.')
        return self

    def __fit_clustering(self, dname):
        """Select and analyze clustering algorithms; returns self."""
        if self.verbose:
            print('Choose algorithms.')
        self.algorithms = self.__choose_algorithm()
        if self.verbose:
            for algorithm in self.algorithms:
                print('    %s' % algorithm.name)

        if self.verbose:
            print('Analyze (This will take some time).')
        Clustering.analyze(self.algorithms, self.data,
                           self.min_clusters, self.max_clusters,
                           self.random_state, self.verbose)

        if dname is not None:
            if self.verbose:
                print('Make report.')
            self.__make_report(dname)
        return self

    def predict(self, X, estimator=None):
        """
        Predict with the best (or a given) estimator on transformed `X`.

        Parameters
        ----------
        X : {numpy.ndarray, pandas.DataFrame}
            Samples; they are passed through the fitted data transformer.
        estimator : object or None (default=None)
            Estimator to use for supervised tasks. If None, the estimator
            of the best algorithm found by `fit` is used.
        """
        if self.task == 'classification' or self.task == 'regression':
            if estimator is None:
                estimator = self.algorithms[self.best_index].estimator
            return estimator.predict(self.data.transform(X))
        elif self.task == "clustering":
            return Clustering.predict(self.algorithms, self.data.transform(X))

    def __search_best_algorithm(self):
        """Record the index and score of the highest-scoring algorithm."""
        self.best_score = float('-Inf')
        self.best_index = -1
        for i, algorithm in enumerate(self.algorithms):
            if algorithm.best_score > self.best_score:
                self.best_score = algorithm.best_score
                self.best_index = i
        self.algorithms[self.best_index].is_best_algorithm = True

    def __tune_parameters(self):
        """Grid-search every algorithm and store its best results."""
        # 'iid' was removed from GridSearchCV in recent scikit-learn; pass
        # it only where it still exists so older versions keep iid=False.
        search_kwargs = {}
        if 'iid' in inspect.signature(GridSearchCV).parameters:
            search_kwargs['iid'] = False

        for algorithm in self.algorithms:
            if self.verbose:
                print('    %s' % algorithm.name)
            clf = GridSearchCV(
                algorithm.estimator, algorithm.parameters, cv=self.cv,
                scoring=self.scoring, n_jobs=self.n_jobs, **search_kwargs)
            clf.fit(self.data.X, self.data.y)
            cv_results = clf.cv_results_
            algorithm.estimator = clf.best_estimator_
            algorithm.best_score = clf.best_score_
            algorithm.best_params = clf.best_params_
            algorithm.grid_scores = list(zip(cv_results['params'],
                                             cv_results['mean_test_score'],
                                             cv_results['std_test_score']))

        self.__search_best_algorithm()

    def __report_classification_result(self):
        """Store a classification report (on the fitted data) per algorithm."""
        for algorithm in self.algorithms:
            algorithm.classification_report = classification_report(
                self.data.y, algorithm.estimator.predict(self.data.X))

    def __calc_learning_curve(self, algorithm):
        """Return mean train / cross-validation scores per training size."""
        train_sizes, train_scores, test_scores = learning_curve(
            algorithm.estimator,
            self.data.X,
            self.data.y,
            cv=self.cv,
            scoring=self.scoring,
            n_jobs=self.n_jobs)  # parallel run in cross validation

        return {'x': train_sizes,
                'y_train': np.mean(train_scores, axis=1),
                'y_cv': np.mean(test_scores, axis=1)}

    def __plot_learning_curve(self, dname=None):
        """Save one learning-curve PNG per algorithm (into `dname` if set)."""
        if dname is not None:
            os.makedirs(dname, exist_ok=True)

        for alg in self.algorithms:
            if self.verbose:
                print('    %s' % alg.name)

            estimator = alg.estimator
            train_sizes, train_scores, test_scores = learning_curve(
                estimator,
                self.data.X,
                self.data.y,
                cv=self.cv,
                scoring=self.scoring,
                n_jobs=self.n_jobs)  # parallel run in cross validation
            train_scores_mean = np.mean(train_scores, axis=1)
            train_scores_std = np.std(train_scores, axis=1)
            test_scores_mean = np.mean(test_scores, axis=1)
            test_scores_std = np.std(test_scores, axis=1)

            plt.figure()
            plt.title(estimator.__class__.__name__)
            plt.xlabel("Training examples")
            plt.ylabel("Score")
            plt.grid()

            plt.fill_between(train_sizes,
                             train_scores_mean - train_scores_std,
                             train_scores_mean + train_scores_std,
                             alpha=0.1, color="r")
            plt.fill_between(train_sizes,
                             test_scores_mean - test_scores_std,
                             test_scores_mean + test_scores_std,
                             alpha=0.1, color="g")
            plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                     label="Training score")
            plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                     label="Cross-validation score")
            plt.legend(loc="lower right")

            # The file name is derived from the estimator class name.
            if dname is not None:
                plt.savefig('%s/learning_curve_%s.png' %
                            (dname, estimator.__class__.__name__),
                            bbox_inches='tight', dpi=75)
            else:
                plt.savefig('learning_curve_%s.png' %
                            estimator.__class__.__name__,
                            bbox_inches='tight', dpi=75)
            plt.close()

    def __make_report(self, dname):
        """Dispatch report generation according to the task."""
        if self.task == 'classification' or self.task == 'regression':
            self.__make_report_supervised(dname)
        elif self.task == 'clustering':
            Clustering.make_report(self.algorithms, self.data, dname,
                                   self.lang)

    def __make_report_supervised(self, dname):
        """Render the HTML report and learning curves into `dname`."""
        os.makedirs(dname, exist_ok=True)

        self.__plot_learning_curve(dname)

        env = Environment(
            loader=FileSystemLoader(
                os.path.abspath(
                    os.path.dirname(__file__)) + '/template',
                encoding='utf8'))
        if self.lang == 'jp':
            tmpl = env.get_template('report_jp.html.tmp')
        else:
            tmpl = env.get_template('report.html.tmp')

        html = tmpl.render(algorithms=self.algorithms,
                           scoring=self.scoring_name,
                           task=self.task,
                           data=self.data)
        with io.open(dname + '/report.html', 'w', encoding='utf-8') as fo:
            fo.write(html)

    def generate_module_sample(self, fname='module_sample.py'):
        """
        Generate a module sample to be able to add in the model
        in your system for prediction.

        Parameters
        ----------
        fname : string (default="module_sample.py")
            A string containing a path to a output file.
        """
        env = Environment(
            loader=FileSystemLoader(
                os.path.abspath(
                    os.path.dirname(__file__)) + '/template',
                encoding='utf8'))
        tmpl = env.get_template('sample_code.py.tmp')
        encoded = len(self.data.del_columns) > 0
        code = tmpl.render(algorithm=self.algorithms[self.best_index],
                           encoded=encoded,
                           standardize=self.standardize)
        with io.open(fname, 'w', encoding='utf-8') as fo:
            fo.write(code)

    def select_features(self):
        """
        Drop columns via `Data.drop_col` using a random forest.

        Must be called after `fit`; otherwise a warning is issued and
        nothing happens. If any column was dropped, the algorithm list is
        chosen again for the reduced data.
        """
        if self.data is None:
            warnings.warn(
                "'select_features' must be used after 'fit' has been called.")
            return

        if self.task == 'regression':
            rf = RandomForestRegressor(random_state=0, oob_score=True,
                                       n_estimators=50, n_jobs=self.n_jobs)
        else:
            rf = RandomForestClassifier(random_state=0, oob_score=True,
                                        n_estimators=50, n_jobs=self.n_jobs)

        num_col = len(self.data.X.columns)
        self.data.drop_col(rf)
        if len(self.data.X.columns) < num_col:
            # The data shape changed, so the size-based choice is redone.
            self.algorithms = self.__choose_algorithm()
            self.is_ready = True


if __name__ == "__main__":
    MALSS(interactive=True)