########################################################################
# Creates a Machine Learning Problem Dataset
# Suitable for assigning to a student for a homework assignment
# Mostly convenience code for sklearn's make_classification routines
########################################################################

import argparse
import json
import os
import re
import sys

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split

from snape.utils import assert_is_type, get_random_state, assert_valid_percent


def parse_args(args):
    """
    Returns arguments passed at the command line as a dict
    """
    parser = argparse.ArgumentParser(description='Generates a Machine Learning Dataset.')
    parser.add_argument('-c', help="Config File Location", required=True, dest='config')
    return vars(parser.parse_args(args))


def load_config(config_name):
    """
    Loads a json config file and returns a config dictionary.

    :param config_name: the path to the config json
    """
    with open(config_name) as config_file:
        config = json.load(config_file)
        return config


def rename_columns(df, prefix='x'):
    """
    Renames the columns of a dataframe to have the given prefix in front of them.

    :param df: data frame we're operating on
    :param prefix: the prefix string
    """
    # the prefix needs to be a string
    assert_is_type(prefix, str)

    df = df.copy()
    df.columns = [prefix + str(i) for i in df.columns]
    return df


def insert_missing_values(df, percent_rows, random_state=None):
    """
    Inserts missing values into a data frame.

    :param df: data frame we're operating on
    :param percent_rows: the percentage of rows that should have a missing value
    :param random_state: the numpy RandomState
    :return: a df with missing values
    """
    # get the initialized random_state (if not already initialized)
    random_state = get_random_state(random_state)
    df = df.copy()

    def _insert_random_null(x):
        """
        Chooses a random column in a df row to null. This operates in-place,
        but it's on the copy, so it should be OK.

        :param x: the data frame row
        """
        # -1 because the last col will always be y
        x[random_state.randint(0, len(x) - 1)] = np.nan
        return x

    # this is a "truthy" check: zero or None means no missing values are inserted
    if not percent_rows:
        return df
    else:
        # otherwise validate that it's a valid percentage
        percent_rows = assert_valid_percent(percent_rows, eq_upper=True)  # eq_lower not necessary because != 0
        # random sample of rows to null
        sample_index = df.sample(frac=percent_rows, random_state=random_state).index
        df.loc[sample_index] = df.loc[sample_index].apply(_insert_random_null, axis=1)
        return df
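
# ----------------------------------------------------------------------
# A minimal sketch (not part of the original module) showing how
# insert_missing_values behaves on a toy frame. The seed of 42 and the
# column names are arbitrary choices for illustration.
def _demo_insert_missing_values():
    demo = pd.DataFrame({'x0': np.arange(10.), 'x1': np.arange(10.), 'y': [0, 1] * 5})
    # null one random non-y cell in roughly 30% of the rows
    with_nulls = insert_missing_values(demo, 0.3, random_state=get_random_state(42))
    print(with_nulls.isnull().sum())  # counts of inserted NaNs per column
# ----------------------------------------------------------------------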

def insert_special_char(character, df, random_state=None):
    """
    Chooses a column to reformat as currency or percentage, including a $ or %
    string, to make cleaning harder.

    :param character: either '$' or '%'
    :param df: the dataframe we're operating on
    :param random_state: the numpy RandomState
    :return: a dataframe with a single column, chosen at random, converted to a % or $ format
    """
    # get the initialized random_state (if not already initialized)
    random_state = get_random_state(random_state)
    df = df.copy()

    # choose a column at random that isn't y.
    # Only choose from numeric columns (no other eviled up columns)
    chosen_col = random_state.choice([col for col in df.select_dtypes(include=['number']).columns
                                      if col != 'y'])

    # assert that character is a string and that it's in ('$', '%')
    assert_is_type(character, str)
    if character not in ('$', '%'):
        raise ValueError('expected `character` to be in ("$", "%"), but got {0}'.format(character))

    # do scaling first:
    df[chosen_col] = (df[chosen_col] - df[chosen_col].mean()) / df[chosen_col].std()

    # do the specific div/mul operations
    if character == "$":
        # multiply by 1000, finally prepend a $
        df[chosen_col] = (df[chosen_col] * 1000).round(decimals=2).map(lambda x: "$" + str(x))
    else:  # character == "%"
        # divide by 100, finally append a %
        df[chosen_col] = (df[chosen_col] / 100).round(decimals=2).map(lambda x: str(x) + "%")

    return df


def create_categorical_features(df, label_list, random_state=None, label_name='y'):
    """
    Creates random categorical variables.

    :param df: data frame we're operating on
    :param label_list: a list of lists; each inner list holds the labels for one categorical variable
    :param random_state: the numpy RandomState
    :param label_name: the column name of the label, if any. Default is 'y'
    :return: a modified dataframe

    Example:

    create_categorical_features(df, [['a', 'b'], ['red', 'blue']])
    """
    random_state = get_random_state(random_state)

    df = df.copy()
    n_categorical = len(label_list)

    # get the numeric columns ONCE so we don't have to do it every time we loop
    numer_cols = [col for col in df.select_dtypes(include=['number']).columns
                  if col != label_name]

    for i in range(0, n_categorical):
        # we might be out of numeric columns!
        if not numer_cols:
            break

        # choose a random numeric column that isn't y
        chosen_col = random_state.choice(numer_cols)
        # pop the chosen_col out of the numer_cols
        numer_cols.pop(numer_cols.index(chosen_col))

        # use cut to convert that column to categorical
        df[chosen_col] = pd.cut(df[chosen_col], bins=len(label_list[i]), labels=label_list[i])

    return df
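
# ----------------------------------------------------------------------
# A minimal sketch (not part of the original module) of the two "entropy"
# helpers above on a toy frame; the seed and the labels are arbitrary.
def _demo_entropy_helpers():
    rs = get_random_state(0)
    demo = pd.DataFrame({'x0': np.arange(20.), 'x1': np.arange(20.), 'y': [0, 1] * 10})
    # bin one of x0/x1 (chosen at random) into the labels 'low'/'high'
    demo = create_categorical_features(demo, [['low', 'high']], random_state=rs)
    # reformat the remaining numeric column as a dollar string, e.g. "$-1662.45"
    demo = insert_special_char('$', demo, random_state=rs)
    print(demo.dtypes)  # one category column, one object ($) column, plus y
# ----------------------------------------------------------------------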

def create_classification_dataset(n_samples, n_features, n_informative, n_redundant, n_repeated,
                                  n_clusters_per_class, weights, n_classes, random_state=None, shuffle=True):
    """
    Creates a classification dataset.

    :param n_samples: number of observations
    :param n_features: number of features
    :param n_informative: number of informative features
    :param n_redundant: number of multicollinear features
    :param n_repeated: number of perfectly collinear (duplicated) features
    :param n_clusters_per_class: gaussian clusters per class
    :param weights: list of class balances, e.g. [.5, .5]
    :param n_classes: the number of class levels
    :param random_state: the numpy RandomState
    :param shuffle: whether to shuffle the samples and the features
    :return: the requested dataframe
    """
    random_state = get_random_state(random_state)

    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=n_informative,
                               n_redundant=n_redundant, n_repeated=n_repeated,
                               n_clusters_per_class=n_clusters_per_class, weights=weights,
                               scale=(random_state.rand(n_features) * 10),
                               n_classes=n_classes, random_state=random_state, shuffle=shuffle)

    # cast to a data frame
    df = pd.DataFrame(X)
    # rename X columns
    df = rename_columns(df)
    # and add the y
    df['y'] = y
    return df


def create_regression_dataset(n_samples, n_features, n_informative, effective_rank,
                              tail_strength, noise, random_state=None, shuffle=True):
    """
    Creates a regression dataset. The output (y) is always a scalar;
    ``n_targets=1`` is passed to sklearn's make_regression.

    :param n_samples: number of observations
    :param n_features: number of features
    :param n_informative: number of informative features
    :param effective_rank: approximate number of singular vectors required to explain the data
    :param tail_strength: relative importance of the fat noisy tail of the singular values profile
    :param noise: standard deviation of the gaussian noise applied to the output
    :param random_state: the numpy RandomState
    :param shuffle: whether to shuffle the samples and the features
    :return: the requested dataframe
    """
    random_state = get_random_state(random_state)

    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative,
                           n_targets=1, effective_rank=effective_rank, tail_strength=tail_strength,
                           noise=noise, random_state=random_state, shuffle=shuffle)

    # cast to a data frame
    df = pd.DataFrame(X)
    # rename X columns
    df = rename_columns(df)
    # and add the y
    df['y'] = y
    return df
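
# ----------------------------------------------------------------------
# A minimal sketch (not part of the original module) of the two generators
# above; all argument values here are arbitrary illustrations.
def _demo_generators():
    rs = get_random_state(42)
    clf_df = create_classification_dataset(n_samples=100, n_features=5, n_informative=3,
                                           n_redundant=1, n_repeated=0, n_clusters_per_class=2,
                                           weights=[.5, .5], n_classes=2, random_state=rs)
    reg_df = create_regression_dataset(n_samples=100, n_features=5, n_informative=3,
                                       effective_rank=None, tail_strength=0.5, noise=0.1,
                                       random_state=rs)
    print(clf_df.shape, reg_df.shape)  # both (100, 6): columns x0..x4 plus y
# ----------------------------------------------------------------------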

def make_star_schema(df, out_path="." + os.path.sep):
    """
    Converts a dataset to star-schema fact and dimension tables. Dimension tables
    are written out to CSV files, and the dataframe passed to the function is
    converted into a 'fact' table and returned as a dataframe. (The fact table is
    NOT written out at this point, because it is subject to the test/train split
    functions while the dimension tables are not.)

    :param df: source dataframe
    :param out_path: path to write the dimension files to
    :return: the fact-table dataframe
    """
    def _get_categorical_columns(x):  # don't shadow df from the outer scope
        return x.select_dtypes(include=['category', 'object']).columns

    def _find_dollars(text):
        return 1 if re.match(r'^\$-?\d+\.?\d+', str(text)) else 0

    def _find_percentages(text):
        return 1 if re.search(r'^-?\d+\.?\d+[%]$', str(text)) else 0

    def _is_special_char(list_object):
        if list_object.dtype != 'O':
            return False
        else:
            percent_sum = sum(list_object.apply(_find_percentages))
            dollars_sum = sum(list_object.apply(_find_dollars))
            return (percent_sum / list_object.count() == 1) or (dollars_sum / list_object.count() == 1)

    # Get the categorical columns
    cols = _get_categorical_columns(df)
    assert len(cols) > 0, "No categorical variables exist in this dataset; star schema cannot be developed."

    # Iterate through the categorical columns
    for cat_column in cols:
        # Determine whether the column holds requested entropy ($ or %). (NOTE: dimension tables
        # are not made before this check, so dimension keys CAN'T be selected for entropy.)
        if not _is_special_char(df[cat_column]):  # previously "is not True", but not very pythonic
            # Turn the value counts into a dataframe
            vals = pd.DataFrame(df[cat_column].value_counts())
            # todo: Sara, the following seems hacky... is there a better way to do this?
            vals.reset_index(inplace=True)  # puts the category values into a column
            vals.reset_index(inplace=True)  # puts the row numbers in as integers
            # Rename: row number -> 'primary_key', category value -> 'item', count -> 'value_count'
            vals.rename(index=str, columns={'level_0': 'primary_key',
                                            'index': 'item',
                                            cat_column: 'value_count'}, inplace=True)
            # Make a df out of just the key and the value
            val_df = vals[['primary_key', 'item']]

            # todo: Sara, this is hacky (but really cool!) Could you please write a comment block
            # todo: ... explaining exactly what you're achieving here?
            # Make a dimension df by appending a NaN placeholder
            val_df.item.cat.add_categories('Not specified', inplace=True)
            val_df = val_df.append({'primary_key': -1, 'item': 'Not specified'}, ignore_index=True)

            # todo: Sara, should we take another param in this function that can either
            # todo: ... permit or prevent accidentally overwriting an existing file?
            # Write the new dimension table out to CSV
            dim_file_name = cat_column + '_dim.csv'
            val_df.to_csv(out_path + dim_file_name, index=False)

            # Set the index up for mapping
            val_df.set_index('item', inplace=True)
            # Convert to dict for mapping
            mapper = val_df.to_dict().get('primary_key')

            # Fill the NaNs in the dataframe's categorical column with 'Not specified'
            df[cat_column].cat.add_categories('Not specified', inplace=True)
            df[cat_column].fillna('Not specified', inplace=True)

            # Insert the new key column into the dataframe
            df.insert(df.shape[1], cat_column + '_key', df[cat_column].map(mapper))
            # Drop the categorical column from the dataframe
            df.drop(cat_column, axis=1, inplace=True)

    # Now, reset the dataframe's index and rename the index column as 'primary_key'
    df.reset_index(inplace=True)
    df_cols = df.columns
    df_cols = df_cols.delete(0)
    df_cols = df_cols.insert(0, 'primary_key')
    df.columns = df_cols

    # Return the main dataframe as a 'fact' table, which will then be split into test/train
    # splits; dimension tables are immune to this
    return df.copy()


def write_dataset(df, file_name, out_path="." + os.path.sep):
    """
    Writes the generated dataset to file.

    :param df: dataframe to write
    :param file_name: beginning of the filename
    :param out_path: the path to write the dataset to
    :return: None
    """
    # todo: Mike, do we want to take a param for overwriting existing files?
    df_train, df_testkey = train_test_split(df, test_size=.2)
    df_train.to_csv(out_path + file_name + "_train.csv", index=False)

    df_test = df_testkey.drop(['y'], axis=1)
    df_test.to_csv(out_path + file_name + "_test.csv", index=False)
    df_testkey.to_csv(out_path + file_name + "_testkey.csv", index=False)
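
# ----------------------------------------------------------------------
# A minimal sketch (not part of the original module) of write_dataset; the
# file prefix 'demo' is an arbitrary choice. It produces demo_train.csv
# (80% of rows, with y), demo_test.csv (the other 20%, y removed), and
# demo_testkey.csv (the same 20% with y retained, for grading).
def _demo_write_dataset():
    demo = pd.DataFrame({'x0': np.arange(10.), 'x1': np.arange(10.), 'y': [0, 1] * 5})
    write_dataset(demo, 'demo')
# ----------------------------------------------------------------------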

def make_dataset(config=None):
    """
    Creates a machine learning dataset based on the configuration passed.

    :param config: a configuration dictionary, or None if called from the command line
    :return: None
    """
    if config is None:
        # called from the command line, so parse the configuration
        args = parse_args(sys.argv[1:])
        config = load_config(args['config'])

    print('-' * 80)

    c_type = config['type']  # avoid multiple lookups - fails with a KeyError if not present
    if c_type not in ('regression', 'classification'):
        raise ValueError('type must be in ("regression", "classification"), but got %s' % c_type)
    reg = c_type == 'regression'

    # get defaults - these are the defaults from sklearn
    def _safe_get_with_default(cfg, key, default):
        if key not in cfg:
            print("Warning: %s not in configuration, defaulting to %r" % (key, default))
            return default
        return cfg[key]

    n_samples = _safe_get_with_default(config, 'n_samples', 100)
    n_features = _safe_get_with_default(config, 'n_features', 20 if not reg else 100)  # diff defaults in sklearn
    n_informative = _safe_get_with_default(config, 'n_informative', 2 if not reg else 10)  # diff defaults in sklearn
    n_redundant = _safe_get_with_default(config, 'n_redundant', 2)
    n_repeated = _safe_get_with_default(config, 'n_repeated', 0)
    n_clusters_per_class = _safe_get_with_default(config, 'n_clusters_per_class', 2)
    weights = _safe_get_with_default(config, 'weights', None)
    n_classes = _safe_get_with_default(config, 'n_classes', 2)
    effective_rank = _safe_get_with_default(config, 'effective_rank', None)
    tail_strength = _safe_get_with_default(config, 'tail_strength', 0.5)
    noise = _safe_get_with_default(config, 'noise', 0.)
    seed = _safe_get_with_default(config, 'random_seed', 42)
    shuffle = _safe_get_with_default(config, 'shuffle', True)

    # get the random state
    random_state = get_random_state(seed)

    # create the base dataset
    if not reg:
        print('Creating Classification Dataset...')
        df = create_classification_dataset(n_samples=n_samples, n_features=n_features,
                                           n_informative=n_informative, n_redundant=n_redundant,
                                           n_repeated=n_repeated, n_clusters_per_class=n_clusters_per_class,
                                           weights=weights, n_classes=n_classes,
                                           random_state=random_state, shuffle=shuffle)
    else:  # c_type == 'regression'
        print('Creating Regression Dataset...')
        df = create_regression_dataset(n_samples=n_samples, n_features=n_features,
                                       n_informative=n_informative, effective_rank=effective_rank,
                                       tail_strength=tail_strength, noise=noise,
                                       random_state=random_state, shuffle=shuffle)

    # make sure to use safe lookups to avoid KeyErrors!!!
    label_list = _safe_get_with_default(config, 'label_list', None)
    do_categorical = label_list is not None and len(label_list) > 0
    if do_categorical:
        print("Creating Categorical Features...")
        df = create_categorical_features(df, label_list, random_state=random_state)

    # insert entropy, if requested
    insert_dollar = _safe_get_with_default(config, 'insert_dollar', "No")
    insert_percent = _safe_get_with_default(config, 'insert_percent', "No")
    if any(entropy == "Yes" for entropy in (insert_dollar, insert_percent)):
        print("Inserting Requested Entropy...")

        # add a $ or % column if requested
        if insert_dollar == "Yes":
            df = insert_special_char('$', df, random_state=random_state)
        if insert_percent == "Yes":
            df = insert_special_char('%', df, random_state=random_state)

    # insert missing values
    pct_missing = _safe_get_with_default(config, 'pct_missing', None)
    df = insert_missing_values(df, pct_missing, random_state=random_state)

    # convert the dataset to a star schema if requested
    star_schema = _safe_get_with_default(config, 'star_schema', "No")
    outpath = _safe_get_with_default(config, 'out_path', "." + os.path.sep)
    if star_schema == "Yes":
        # check that categorical variables were added
        if do_categorical:
            df = make_star_schema(df, outpath)
        else:
            print("No categorical variables added. Dataset cannot be transformed into a star schema. "
                  "Dataset will be generated as a single-table dataset...")

    print("Writing Train/Test Datasets")
    write_dataset(df, _safe_get_with_default(config, 'output', 'my_dataset'), outpath)


if __name__ == "__main__":
    make_dataset()
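
# ----------------------------------------------------------------------
# A minimal sketch (not part of the original module) of calling make_dataset
# programmatically. All config values here are arbitrary illustrations; the
# same keys can instead be placed in a JSON file and passed on the command
# line via `python make_dataset.py -c <config>.json`.
def _demo_make_dataset():
    config = {
        'type': 'classification',
        'n_samples': 500,
        'n_features': 10,
        'n_informative': 4,
        'n_classes': 2,
        'label_list': [['red', 'blue']],   # one categorical feature
        'insert_dollar': "Yes",            # reformat one numeric column as $
        'pct_missing': 0.1,                # null a cell in ~10% of rows
        'output': 'homework_1',
    }
    make_dataset(config=config)  # writes homework_1_train/_test/_testkey.csv
# ----------------------------------------------------------------------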