Python source code of src/fairtest/examples/discovery.py

Project: fairtest (GitHub Link)

fairtest-master
- src
  - test.ipynb
  - recommender.ipynb
  - medical.ipynb
  - fairtest
    - testing.py
    - utils
      - log.py
      - prepare_data.py
      - __init__.py
    - examples
      - testing.py
      - error_profiling.py
      - __init__.py
      - discovery.py
    - service
      - launch_server.py
      - helpers
        db.py
        config.py
        experiments.py
        __init__.py
        worker.py
      - templates
        upload.html
      - config
        config.yaml
      - activate_fairtest_service.sh
      - __init__.py
      - static
        img
        js
        bootstrap.min.js
        bootstrap.js
        css
        bootstrap-responsive.min.css
        bootstrap.min.css
        bootstrap.css
        bootstrap-responsive.css
      - launch_workers.py
    - error_profiling.py
    - benchmarks
      - make_scheduling.py
      - make_compas.py
      - Makefile
      - make_benchmark_effectiveness.py
      - make_medical.py
      - make_csv.sh
      - __init__.py
      - make_adult.py
      - make_staples.py
      - make_berkeley.py
      - make_images.py
      - make_benchmark_performance.py
      - make_sf.py
      - make_recommender.py
      - make_benchmark.py
    - __init__.py
    - tests
      - test_discovery.py
      - test_investigation.py
      - test_error_profiling.py
      - test_testing.py
      - data
        tiny_berkeley.csv
      - __init__.py
      - modules
        context_discovery
        __init__.py
        statistics
        __init__.py
        bug_report
        __init__.py
        __init__.py
        metrics
        test_correlation.py
        test_regression.py
        __init__.py
        test_binary.py
        test_mutual_info.py
    - holdout.py
    - investigation.py
    - modules
      - context_discovery
        tree_parser.py
        guided_tree.py
        __init__.py
      - statistics
        confidence_interval.py
        multiple_testing.py
        hypothesis_test.py
        __init__.py
      - bug_report
        report.py
        __init__.py
        filter_rank.py
      - __init__.py
      - metrics
        binary_metrics.py
        metric.py
        correlation.py
        regression.py
        __init__.py
        mutual_info.py
    - discovery.py
  - apps
    - demo
      - demo.py
    - overfeat
      - results_black.txt
      - README
      - find_labels.py
      - results_white.txt
      - parse_results.py
    - compas
      - truth_tables.py
    - propdiff
      - propdiff.cc
      - Makefile
      - binomial.cc
      - binomial.h
    - staples
      - wsgi.py
      - api
        serializers.py
        models.py
        views.py
        tests.py
        __init__.py
        admin.py
        management
        commands
        post_users.py
        get_users.py
        read_users.py
        populate_zipcodes.py
        populate_competitors.py
        __init__.py
        populate_users.py
        put_users.py
        populate_stores.py
        __init__.py
      - simulation
      - bugreport
        helpers
        __init__.py
        distance.py
        views.py
        __init__.py
      - templates
        bugreport
      - manage.py
      - urls.py
      - data
        staples_coordinates.csv
        office_depot_max_coordinates.csv
        office_depot_max_addresses.csv
      - README.md
      - scripts
        avg_income_per_race.py
        parse_synthetic_users.py
        address_match.py
        temp_avg.py
        zipcode_match.py
        mkstats_on_deltas.py
        mkstats_on_proportional.py
        mkstats_on_location_dependency.py
      - settings.py
      - requirements.txt
    - recommender
      - output
      - README
      - average_scores.py
      - preprocess_ratings.py
      - MovieLensALS.py
      - postprocess_recommendations.py
      - input
        README
        users.dat
        movies.dat
    - scheduling
      - Makefile
      - scheduling.py
      - helpers
        columns.py
        utils.py
      - data
        dallas_police_reports.json
    - medical
      - Regression_old.ipynb
      - convert_header.py
      - Regression.ipynb
    - caffe
      - results_black.txt
      - find_labels.py
      - results_white.txt
      - parse_results.py
      - imagenet.py
    - sfpolice
      - predictions_conf.txt
      - acs5demographics.csv
      - augment_train.py
      - coord2zipcodes.py
      - README.md
      - logit.py
      - acs5demographics.py
      - .gitignore
- results
  - pretty_graphs.py
  - report_staples.txt
  - report_caffe.txt
  - report_medical_reg.txt
  - report_overfeat.txt
  - report_berkeley_initial.txt
  - report_berkeley.txt
  - report_adult.txt
  - medical_reg_plots
  - report_scheduling.txt
- LICENSE
- setup.py
- data
  - inmates
  - compas
  - adult
  - images
    - caffe.txt
    - overfeat_raw.txt
  - staples
  - recommender
  - scheduling
  - medical
  - prudential
  - berkeley
    - berkeley.csv
  - benchmark
  - sfpolice
- README.md
- .gitignore

"""
Run FairTest Discovery Investigations on Movie Recommender Dataset

Usage: python discovery.py
"""

import fairtest.utils.prepare_data as prepare
from fairtest import Discovery, train, test, report, DataSource
import ast
import pandas as pd
from sklearn import preprocessing

import sys


def main(argv=sys.argv):
    """Run a FairTest Discovery investigation on the recommender data.

    Loads the recommendations file, expands the 'Types' column into one
    binary indicator column per distinct label, then trains, tests and
    reports a Discovery experiment with 'Gender' as the sensitive
    attribute and the indicator columns as targets.

    Args:
        argv: command-line argument list; any length other than 1
            triggers the usage message.
    """
    if len(argv) != 1:
        usage(argv)

    input_path = "../../../data/recommender/recommendations.txt"
    report_dir = "."
    unused_columns = ['RMSE', 'Avg Movie Age', 'Avg Recommended Rating',
                      'Avg Seen Rating', 'Occupation']
    data = prepare.data_from_csv(input_path, sep='\\t',
                                 to_drop=unused_columns)

    label_column = 'Types'
    sensitive = ['Gender']
    explanatory = []

    # Each cell of the label column holds the text of a Python literal;
    # parse it and insist on exactly five labels per row.
    parsed_rows = [ast.literal_eval(cell) for cell in data[label_column]]
    assert all(len(row) == 5 for row in parsed_rows)

    # Turn the per-row label collections into a 0/1 indicator matrix with
    # one column per distinct label.
    binarizer = preprocessing.MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(parsed_rows)
    label_names = binarizer.classes_
    indicators = pd.DataFrame(indicator_matrix, columns=label_names)

    # Swap the raw label column for the indicator columns.
    remaining = data.drop(label_column, axis=1)
    data = pd.concat([remaining, indicators], axis=1)
    targets = label_names.tolist()

    data_source = DataSource(data)

    # Set up the experiment over every indicator column at once.
    experiment = Discovery(data_source, sensitive, targets, explanatory,
                           topk=10, random_state=0)
    experiments = [experiment]

    # Fit on the training set, evaluate on the testing set, write the report.
    train(experiments)
    test(experiments)
    report(experiments, "discovery", report_dir)


def usage(argv):
    """Print the usage message and terminate the process.

    Args:
        argv: argument list; ``argv[0]`` is shown as the program name.

    Raises:
        SystemExit: always, with exit code -1.
    """
    # A parenthesised single-argument print is valid both as a Python 2
    # print statement and as a Python 3 function call.
    print("Usage:%s" % argv[0])
    # sys.exit is always available, unlike the exit() helper that the
    # site module injects only in some run modes.
    sys.exit(-1)

# Run the example only when this file is executed as a script. main() has
# no return statement, so sys.exit() receives None and the process exits
# with status 0 on normal completion.
if __name__ == '__main__':
    sys.exit(main())