# Python source code of train

from __future__ import unicode_literals
import logging
import pandas as pd
from sklearn.externals import joblib
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.metrics.classification import (
    recall_score, confusion_matrix,
    precision_recall_fscore_support,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from settings import DATASET_FILENAME, MODEL_FILENAME

# Named logger for this script; run_grid_search emits its evaluation
# metrics on it at DEBUG level.
logger = logging.getLogger('training')

def main():
    """ Search for the best fitting model and persist it to a file """
    # build the (still unfitted) search object
    search = new_grid_search()
    # fit it on the dataset
    run_grid_search(search)
    # write the winning estimator to the file system
    save_search_results(search)


def split_dataset():
    """ Read and split dataset into train and test subsets

    The CSV file at ``DATASET_FILENAME`` is read with its first row as the
    header. Every column except the last one is used as a feature and the
    last column is used as the target.

    Returns:
        The ``[X_train, X_test, y_train, y_test]`` result of
        ``train_test_split``, with 20% of the rows held out for testing and
        a fixed ``random_state`` so that the split is reproducible.
    """
    df = pd.read_csv(DATASET_FILENAME, header=0)
    # ``as_matrix()`` was deprecated in pandas 0.23 and removed in 1.0;
    # ``.values`` returns the same ndarray on both old and new versions.
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    return train_test_split(X, y, test_size=0.2, random_state=42)

def new_grid_search():
    """ Build an unfitted GridSearchCV wrapping the classifier pipeline """
    # values tried for the C parameter of the "clf" pipeline step
    c_candidates = (1e-4, 1e-2, 1e0, 1e2, 1e4)

    classifier = LogisticRegression(class_weight="balanced")
    # TODO some smart preproc can be added here
    steps = [(u"clf", classifier)]

    search = GridSearchCV(
        estimator=Pipeline(steps),
        param_grid={"clf__C": c_candidates},
        scoring="recall_macro",
        cv=10,
        n_jobs=-1,
        verbose=3,
    )
    return search

def run_grid_search(grid_search, show_evaluation=True):
    """ Fit the search on the train subset and optionally log test metrics

    When ``show_evaluation`` is true, the macro recall, the
    precision/recall/f-score/support tuple and the confusion matrix of the
    test-subset predictions are logged at DEBUG level.
    """
    train_X, test_X, train_y, test_y = split_dataset()

    grid_search.fit(train_X, train_y)
    # per-candidate results can be inspected via grid_search.cv_results_

    predicted = grid_search.predict(test_X)

    if not show_evaluation:
        return

    macro_recall = recall_score(test_y, predicted, average="macro")
    logger.debug("macro_recall: %s", macro_recall)

    prf_support = precision_recall_fscore_support(test_y, predicted)
    logger.debug(prf_support)

    matrix = confusion_matrix(test_y, predicted)
    logger.debug(matrix)

def save_search_results(grid_search):
    """ Dump the best estimator found by the search into MODEL_FILENAME

    The dumped model can be restored later with
    ``joblib.load(MODEL_FILENAME)``.
    """
    best_model = grid_search.best_estimator_
    joblib.dump(best_model, MODEL_FILENAME)

# Run the training only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()