
flowlight's side of the 1st place solution for the TalkingData AdTracking Fraud Detection Challenge

Please see our solution write-up (also available in Japanese) if you want an overview of our solution.

Disclaimer: I did not pay much attention to the quality or readability of the code in this repository, and the following documentation may contain errors.


Since this repository assumes that all experiments are conducted in a Docker container on an AWS Docker host created by docker-machine, you need to create a Docker host and image before running any experiments.

  "features": [
  "model": {
    "name": "lightgbm",
    "model_params": {
      "boosting_type": "gbdt",
      "objective": "binary",
      "metric": "auc",
      "learning_rate": 0.01,
      "num_leaves": 255,
      "max_depth": 8,
      "min_child_samples": 200,
      "subsample": 0.9,
      "subsample_freq": 1,
      "colsample_bytree": 0.5,
      "min_child_weight": 0,
      "subsample_for_bin": 1000000,
      "min_split_gain": 0,
      "reg_lambda": 0,
      "verbose": 0
    "train_params": {
      "num_boost_round": 5000,
      "early_stopping_rounds": 30
  "dataset": {
    "input_directory": "data/input/",
    "cache_directory": "data/working/",
    "files": {
      "train": "train.feather",
      "test": "old_test.feather"
    "negative_down_sampling": {
      "enabled": true,
      "bagging_size": 5
  "rank_average": false,
  "test_hours": {
    "filter_validation": true,
    "higher_train_weight": false
  "note": "100 with min_child_samples = 200"