python source code of data_preparation

Project: healthcareai-py (GitHub Link)

healthcareai-py-master
- example_classification_1.py
- how_to_release_a_version.md
- conda-build-pipeline.sh
- environment.yml
- conda-recipe
  - meta.yaml
  - build.sh
  - bld.bat
- AUTHORS
- example_regression_2.py
- example_advanced.py
- healthcareai
  - datasets
    - data
      - diagnostic_breast_cancer.csv
      - fertility.csv
      - thoracic_surgery.csv
      - heart_disease.csv
      - prognostic_breast_cancer.csv
      - pima_indians_diabetes.csv
      - acute_inflammations.csv
      - cervical_cancer.csv
      - mammographic_masses.csv
      - diabetes.csv
    - __init__.py
    - base.py
  - trained_models
    - trained_supervised_model.py
    - __init__.py
  - supervised_model_trainer.py
  - common
    - database_validators.py
    - csv_loader.py
    - database_library_validators.py
    - transformers.py
    - top_factors.py
    - impact_coding.py
    - model_eval.py
    - table_archiver.py
    - randomized_search.py
    - cardinality_checks.py
    - healthcareai_error.py
    - azure_blob_storage_helper.py
    - predict.py
    - trainer_output.py
    - get_categorical_levels.py
    - filters.py
    - file_io_utilities.py
    - __init__.py
    - feature_availability_profiler.py
    - database_writers.py
    - catalyst_sqlite_db_fixtures.py
    - helpers.py
    - database_connections.py
  - advanced_supvervised_model_trainer.py
  - pipelines
    - data_preparation.py
    - __init__.py
  - __init__.py
  - tests
    - test_top_factors.py
    - test_csv_loader.py
    - test_file_utilities.py
    - test_trainer_decorator.py
    - fixtures
      - iris_classification.csv
      - .gitkeep
    - test_dataframe_transformers_Dataframe_Imputer.py
    - test_trainer.py
    - test_advanced_trainer.py
    - test_trained_supervised_model.py
    - test_database_validation.py
    - test_cardinality_checks.py
    - test_impact_coding.py
    - __init__.py
    - test_model_eval_helpers.py
    - test_dataframe_transformers.py
    - test_categorical_levels.py
    - test_datasets.py
    - test_feature_availability_profiler.py
    - test_dataframe_filters.py
    - test_model_eval.py
    - test_predict.py
    - helpers.py
- example_regression_1.py
- mkdocs.yml
- LICENSE
- example_classification_2.py
- dev-requirements.txt
- appveyor.yml
- CONTRIBUTING.md
- appveyor
  - run_with_env.cmd
  - install.ps1
- tasks.py
- CHANGELOG.md
- setup.py
- dox
  - Makefile
  - make.bat
  - healthcareai.trained_models.rst
  - healthcareai.rst
  - healthcareai.pipelines.rst
  - modules.rst
  - index.rst
  - conf.py
  - healthcareai.tests.rst
  - healthcareai.common.rst
- setup.cfg
- .travis.yml
- README.md
- .landscape.yml
- Dockerfile
- .gitignore
- docs
  - deploy.md
  - compile.md
  - img
  - catalyst_edw_instructions.md
  - profiling_your_data.md
  - training.md
  - hints.md
  - architecture_overview_for_developers.md
  - index.md
  - favicon.ico
  - prediction_types.md
  - FAQ.md
  - getting_started.md
  - conf.py
  - mermaid.js
  - databases.md
  - utilities.md
- .dockerignore
- MANIFEST.in

from sklearn.pipeline import Pipeline

import healthcareai.common.transformers as hcai_transformers
import healthcareai.common.filters as hcai_filters


def full_pipeline(
        model_type,
        predicted_column,
        grain_column,
        impute=True,
        verbose=True,
        imputeStrategy='MeanMode',
        tunedRandomForest=False,
        numeric_columns_as_categorical=None):
    """
    Assemble the default data preparation pipeline.

    The returned scikit-learn ``Pipeline`` chains the following healthcareai filters and transformers, in order:

    1. ``remove_DTS_columns``: ``DataframeColumnSuffixFilter()``
    2. ``remove_grain_column``: ``DataframeColumnRemover(grain_column)``
    3. ``imputation``: ``DataFrameImputer(...)`` configured from the imputation arguments below
    4. ``null_row_filter``: ``DataframeNullValueFilter(excluded_columns=None)``
    5. ``convert_target_to_binary``: ``DataFrameConvertTargetToBinary(model_type, predicted_column)``
    6. ``prediction_to_numeric``: ``DataFrameConvertColumnToNumeric(predicted_column)``
    7. ``create_dummy_variables``: ``DataFrameCreateDummyVariables`` with ``predicted_column`` excluded

    Note advanced users may wish to use their own custom pipeline.

    Args:
        model_type: Forwarded to ``DataFrameConvertTargetToBinary``.
        predicted_column: Target column; forwarded to the target conversion steps and excluded from dummy
            variable creation.
        grain_column: Forwarded to ``DataframeColumnRemover``.
        impute: Forwarded to ``DataFrameImputer``.
        verbose: Forwarded to ``DataFrameImputer``.
        imputeStrategy: Forwarded to ``DataFrameImputer``.
        tunedRandomForest: Forwarded to ``DataFrameImputer``.
        numeric_columns_as_categorical: Forwarded to ``DataFrameImputer``.

    Returns:
        sklearn.pipeline.Pipeline: The assembled (not yet fitted) pipeline.
    """

    # Note: FeatureUnions would be a more elegant way to express this, but only _if_ the later pipelines do not
    #   need pandas dataframes as inputs, because FeatureUnion intrinsically converts outputs to numpy arrays.

    # Stages are instantiated in the same order in which they run.
    dts_column_filter = hcai_filters.DataframeColumnSuffixFilter()
    grain_column_remover = hcai_filters.DataframeColumnRemover(grain_column)

    # Perform one of two basic imputation methods
    # TODO we need to think about making this optional to solve the problem of rare and very predictive values
    imputer = hcai_transformers.DataFrameImputer(
        impute=impute,
        verbose=verbose,
        imputeStrategy=imputeStrategy,
        tunedRandomForest=tunedRandomForest,
        numeric_columns_as_categorical=numeric_columns_as_categorical,
    )

    null_row_filter = hcai_filters.DataframeNullValueFilter(excluded_columns=None)
    target_to_binary = hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)
    target_to_numeric = hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)
    dummy_variable_creator = hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])

    named_steps = [
        ('remove_DTS_columns', dts_column_filter),
        ('remove_grain_column', grain_column_remover),
        ('imputation', imputer),
        ('null_row_filter', null_row_filter),
        ('convert_target_to_binary', target_to_binary),
        ('prediction_to_numeric', target_to_numeric),
        ('create_dummy_variables', dummy_variable_creator),
    ]
    return Pipeline(named_steps)