talkingdata-adtracking-fraud-detection

flowlight's part of the 1st place solution for the TalkingData AdTracking Fraud Detection Challenge (https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/)

Please see https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/56475 or https://www.slideshare.net/TakanoriHayashi3/talkingdata-adtracking-fraud-detection-challenge-1st-place-solution (in Japanese) for an overview of our solution.

Disclaimer: I did not pay attention to the quality or readability of the code in this repository, and the following documentation may contain errors.

Setup

This repository assumes that all experiments are run in a Docker container on an AWS Docker host created with docker-machine, so you need to create a Docker host and build a Docker image before running any experiments.

{
  "features": [
    "app",
    "hour",
    "count",
    "duplicated_row_index_diff",
    "future_click_count_10",
    "future_click_count_80",
    "next_click_time_delta",
    "prev_click_time_delta",
    "all_click_count",
    "average_attributed_ratio",
    "komaki_lda_10_ip",
    "komaki_lda_20_no_device_ip",
    "komaki_lda_20_no_device_os",
    "komaki_lda_20_no_device_channel",
    "komaki_lda_20_no_device_app",
    "komaki_lda_5_no_device",
    "komaki_nmf_5_no_device",
    "komaki_pca_5_no_device"
  ],
  "model": {
    "name": "lightgbm",
    "model_params": {
      "boosting_type": "gbdt",
      "objective": "binary",
      "metric": "auc",
      "learning_rate": 0.01,
      "num_leaves": 255,
      "max_depth": 8,
      "min_child_samples": 200,
      "subsample": 0.9,
      "subsample_freq": 1,
      "colsample_bytree": 0.5,
      "min_child_weight": 0,
      "subsample_for_bin": 1000000,
      "min_split_gain": 0,
      "reg_lambda": 0,
      "verbose": 0
    },
    "train_params": {
      "num_boost_round": 5000,
      "early_stopping_rounds": 30
    }
  },
  "dataset": {
    "input_directory": "data/input/",
    "cache_directory": "data/working/",
    "files": {
      "train": "train.feather",
      "test": "old_test.feather"
    },
    "negative_down_sampling": {
      "enabled": true,
      "bagging_size": 5
    }
  },
  "rank_average": false,
  "test_hours": {
    "filter_validation": true,
    "higher_train_weight": false
  },
  "note": "100 with min_child_samples = 200"
}