"""Feature engineering + train/predict pipeline.

Builds lag ("prev") product-ownership features from the monthly snapshots,
trains LightGBM and XGBoost models through the ``engines`` module, and writes
gzipped Kaggle submission files.  Expects ``Timer``, ``products``, ``dtypes``,
``date_to_float``, ``date_to_int`` and ``mapk`` from ``utils``.
"""

import gzip
import io
import math
import pickle
import zlib  # NOTE(review): unused in this file; kept to avoid breaking external tooling

import numpy as np
import pandas as pd
# import scipy.stats
from sklearn.preprocessing import LabelEncoder

import engines
from utils import *

np.random.seed(2016)

# Fitted LabelEncoders keyed by column name.  Populated on the first
# (training) pass and reused afterwards so integer codes stay consistent.
transformers = {}


def assert_uniq(series, name):
    """Print the unique values and their counts of *series* for a sanity check."""
    uniq = np.unique(series, return_counts=True)
    print("assert_uniq", name, uniq)


def custom_one_hot(df, features, name, names, dtype=np.int8, check=False):
    """One-hot encode selected values of column *name* of *df* in-place.

    *names* maps an output suffix to the raw value it stands for; each pair
    creates a 0/1 column ``"<name>_<suffix>"`` appended to *features*.
    With ``check=True`` the new column's value counts are printed.
    """
    for suffix, val in names.items():
        new_name = "%s_%s" % (name, suffix)
        print(name, new_name)
        df[new_name] = df[name].map(lambda x: 1 if x == val else 0).astype(dtype)
        if check:
            assert_uniq(df[new_name], new_name)
        features.append(new_name)


def label_encode(df, features, name):
    """Label-encode column *name* in-place and record it in *features*.

    The first call for a column fits a LabelEncoder (train pass); later
    calls reuse the stored encoder (test pass) so codes match across passes.
    """
    df[name] = df[name].astype('str')
    if name in transformers:  # test pass: reuse the fitted encoder
        df[name] = transformers[name].transform(df[name])
    else:  # train pass: fit and remember the encoder
        transformers[name] = LabelEncoder()
        df[name] = transformers[name].fit_transform(df[name])
    features.append(name)


def encode_top(s, count=100, dtype=np.int8):
    """Rank-encode *s*: the *count* most frequent values map to 1..count
    (1 = most frequent); every other value maps to 0."""
    uniqs, freqs = np.unique(s, return_counts=True)
    top = sorted(zip(uniqs, freqs), key=lambda vk: vk[1], reverse=True)[:count]
    top_map = {uf[0]: rank + 1 for rank, uf in enumerate(top)}
    return s.map(lambda x: top_map.get(x, 0)).astype(dtype)


def apply_transforms(train_df):
    """Clean and encode the raw columns of *train_df* in-place.

    Returns ``(train_df, features)`` where *features* is a tuple of the
    engineered feature column names.
    """
    features = []
    with Timer("apply transforms"):
        label_encode(train_df, features, "canal_entrada")
        # label_encode(train_df, features, "nomprov")  # use cod_prov only
        label_encode(train_df, features, "pais_residencia")

        train_df["age"] = train_df["age"].fillna(0.0).astype(np.int16)
        features.append("age")

        # Fill renta before ranking/log; 1.0 keeps math.log defined.
        # Plain assignment instead of inplace= avoids chained-assignment issues.
        train_df["renta"] = train_df["renta"].fillna(1.0)
        train_df["renta_top"] = encode_top(train_df["renta"])
        assert_uniq(train_df["renta_top"], "renta_top")
        features.append("renta_top")
        train_df["renta"] = train_df["renta"].map(math.log)
        features.append("renta")

        # Shift by +1 so 0 can mean "missing or negative seniority".
        train_df["antiguedad"] = train_df["antiguedad"].map(
            lambda x: 0.0 if x < 0 or math.isnan(x) else x + 1.0).astype(np.int16)
        features.append("antiguedad")

        train_df["tipodom"] = train_df["tipodom"].fillna(0.0).astype(np.int8)
        features.append("tipodom")
        train_df["cod_prov"] = train_df["cod_prov"].fillna(0.0).astype(np.int8)
        features.append("cod_prov")

        # Date columns are "YYYY-MM-DD" strings; a missing fecha_alta shows
        # up as a float NaN, hence the x.__class__ is float guard below.
        train_df["fecha_dato_month"] = train_df["fecha_dato"].map(
            lambda x: int(x.split("-")[1])).astype(np.int8)
        features.append("fecha_dato_month")
        train_df["fecha_dato_year"] = train_df["fecha_dato"].map(
            lambda x: float(x.split("-")[0])).astype(np.int16)
        features.append("fecha_dato_year")
        train_df["fecha_alta_month"] = train_df["fecha_alta"].map(
            lambda x: 0.0 if x.__class__ is float else float(x.split("-")[1])).astype(np.int8)
        features.append("fecha_alta_month")
        train_df["fecha_alta_year"] = train_df["fecha_alta"].map(
            lambda x: 0.0 if x.__class__ is float else float(x.split("-")[0])).astype(np.int16)
        features.append("fecha_alta_year")

        train_df["fecha_dato_float"] = train_df["fecha_dato"].map(date_to_float)
        train_df["fecha_alta_float"] = train_df["fecha_alta"].map(date_to_float)
        train_df["dato_minus_alta"] = train_df["fecha_dato_float"] - train_df["fecha_alta_float"]
        features.append("dato_minus_alta")

        # Integer month index used as the join key for the lag features.
        train_df["int_date"] = train_df["fecha_dato"].map(date_to_int).astype(np.int8)

        custom_one_hot(train_df, features, "indresi", {"n": "N"})
        custom_one_hot(train_df, features, "indext", {"s": "S"})
        custom_one_hot(train_df, features, "conyuemp", {"n": "N"})
        custom_one_hot(train_df, features, "sexo", {"h": "H", "v": "V"})
        custom_one_hot(train_df, features, "ind_empleado",
                       {"a": "A", "b": "B", "f": "F", "n": "N"})
        custom_one_hot(train_df, features, "ind_nuevo", {"new": 1})
        custom_one_hot(train_df, features, "segmento",
                       {"top": "01 - TOP",
                        "particulares": "02 - PARTICULARES",
                        "universitario": "03 - UNIVERSITARIO"})
        custom_one_hot(train_df, features, "indfall", {"s": "S"})

        # Shift by +1 so 0 can mean "missing".
        train_df["ind_actividad_cliente"] = train_df["ind_actividad_cliente"].map(
            lambda x: 0.0 if math.isnan(x) else x + 1.0).astype(np.int8)
        features.append("ind_actividad_cliente")

        custom_one_hot(train_df, features, "indrel", {"1": 1, "99": 99})

        # "P" is recoded as 5; NaN becomes 0.
        train_df["indrel_1mes"] = train_df["indrel_1mes"].map(
            lambda x: 5.0 if x == "P" else x).astype(float).fillna(0.0).astype(np.int8)
        assert_uniq(train_df["indrel_1mes"], "indrel_1mes")
        features.append("indrel_1mes")

        custom_one_hot(train_df, features, "tiprel_1mes",
                       {"a": "A", "i": "I", "p": "P", "r": "R"}, check=True)
    return train_df, tuple(features)


def make_prev_df(train_df, step):
    """Build a lag frame: product ownership shifted *step* months forward.

    Returns ``(prev_df, prod_features)`` where *prev_df* has columns
    ``ncodpers``, ``int_date`` (shifted by +step) and one
    ``"<prod>_prev<step>"`` column per product.
    """
    with Timer("make prev%s DF" % step):
        prev_df = pd.DataFrame()
        prev_df["ncodpers"] = train_df["ncodpers"]
        # Shifting int_date by +step makes this row join onto the row
        # *step* months later, turning it into that row's lag feature.
        prev_df["int_date"] = train_df["int_date"].map(lambda x: x + step).astype(np.int8)
        prod_features = ["%s_prev%s" % (prod, step) for prod in products]
        for prod, prev in zip(products, prod_features):
            prev_df[prev] = train_df[prod]
    return prev_df, tuple(prod_features)


def load_data(fname="../input/all_clean.csv"):
    """Load the cleaned CSV, apply transforms and build lag frames for steps 1..5.

    Returns ``(train_df, prev_dfs, features, prod_features)``; only lag steps
    1 and 2 are added to *features*, and *prod_features* is the step-1 tuple.
    """
    with Timer("load train csv"):
        train_df = pd.read_csv(fname, dtype=dtypes)
    with Timer("fill products NA"):
        for prod in products:
            train_df[prod] = train_df[prod].fillna(0.0).astype(np.int8)
    train_df, features = apply_transforms(train_df)
    prev_dfs = []
    prod_features = None
    use_features = frozenset([1, 2])  # only these lag steps become model features
    for step in range(1, 6):
        prev1_train_df, prod1_features = make_prev_df(train_df, step)
        prev_dfs.append(prev1_train_df)
        if step in use_features:
            features += prod1_features
        if step == 1:
            prod_features = prod1_features
    return train_df, prev_dfs, features, prod_features


def join_with_prev(df, prev_df, how):
    """Merge a lag frame into *df* on (ncodpers, int_date) and downcast the
    lag columns to float16 (left joins introduce NaNs, so int8 won't do)."""
    with Timer("join %s" % how):
        # The frames must share exactly the two join keys, nothing else.
        assert set(df.columns.values.tolist()) & set(prev_df.columns.values.tolist()) == set(["ncodpers", "int_date"])
        print("before join", len(df))
        df = df.merge(prev_df, on=["ncodpers", "int_date"], how=how)
        for f in set(prev_df.columns.values.tolist()) - set(["ncodpers", "int_date"]):
            df[f] = df[f].astype(np.float16)
        print("after join", len(df))
    return df


def make_data():
    """Assemble the full modelling frame.

    Joins the five lag frames (inner for prev1, left for the rest), derives
    rolling std/min/max aggregates over the lag windows, and drops every
    column that is not a key, a product, or a selected feature.
    Returns ``(train_df, features, prod_features)``.
    """
    train_df, prev_dfs, features, prod_features = load_data()
    for i, prev_df in enumerate(prev_dfs):
        with Timer("join train with prev%s" % (i + 1)):
            how = "inner" if i == 0 else "left"
            train_df = join_with_prev(train_df, prev_df, how=how)

    # NOTE(review): a larger experimental set of aggregates (ch/sum/skew,
    # other windows, prev1 masking) was tried here and removed; only the
    # std and min/max windows below survived.

    # Rolling std of product ownership over three lag windows.
    for prod in products:
        print()
        print(prod)
        for begin, end in [(1, 3), (1, 5), (2, 5)]:
            prods = ["%s_prev%s" % (prod, i) for i in range(begin, end + 1)]
            # .values replaces the removed DataFrame.as_matrix API.
            mp_df = train_df[prods].values
            print(prods)
            stdf = "%s_std_%s_%s" % (prod, begin, end)
            train_df[stdf] = np.nanstd(mp_df, axis=1)  # * prev1_bin
            features += (stdf,)

    # Rolling min/max of product ownership over two lag windows.
    for prod in products:
        print()
        print(prod)
        for begin, end in [(2, 3), (2, 5)]:
            prods = ["%s_prev%s" % (prod, i) for i in range(begin, end + 1)]
            mp_df = train_df[prods].values
            print(prods)
            minf = "%s_min_%s_%s" % (prod, begin, end)
            train_df[minf] = np.nanmin(mp_df, axis=1).astype(np.int8)
            maxf = "%s_max_%s_%s" % (prod, begin, end)
            train_df[maxf] = np.nanmax(mp_df, axis=1).astype(np.int8)
            features += (minf, maxf,)

    with Timer("Remove unused columns"):
        leave_columns = ["ncodpers", "int_date", "fecha_dato"] + list(products) + list(features)
        assert len(leave_columns) == len(set(leave_columns))
        train_df = train_df[leave_columns]
    return train_df, features, prod_features


def make_submission(f, Y_test, C):
    """Write a submission file: top-7 products per customer by score.

    *f* is a binary file-like object, *Y_test* the per-product scores,
    *C* the customer codes.  Returns the list of top-7 product indices per
    row (used for MAP@7 evaluation).
    """
    Y_ret = []
    with Timer("make submission"):
        f.write("ncodpers,added_products\n".encode('utf-8'))
        for c, y_test in zip(C, Y_test):
            y_prods = [(y, p, ip) for y, p, ip in zip(y_test, products, range(len(products)))]
            y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
            Y_ret.append([ip for y, p, ip in y_prods])
            y_prods = [p for y, p, ip in y_prods]
            f.write(("%s,%s\n" % (int(c), " ".join(y_prods))).encode('utf-8'))
    return Y_ret


def train_predict(all_df, features, prod_features, str_date, cv):
    """Train on months before *str_date*, predict additions for *str_date*.

    Builds one training row per newly-added product (multi-class target),
    weights rows by 1/duplicates, trains LightGBM and XGBoost, writes
    submissions for each model and their geometric mean, and (when *cv*)
    prints MAP@7 scores against the actual additions.
    """
    test_date = date_to_int(str_date)
    train_df = all_df[all_df.int_date < test_date]
    test_df = pd.DataFrame(all_df[all_df.int_date == test_date])
    print(sorted(set(train_df.columns.values.tolist())))
    print(len(train_df.columns.values.tolist()), len(set(train_df.columns.values.tolist())))
    print(len(features), len(set(features)))

    # One training example per (row, newly-added product); the class label
    # is the product index.
    X = []
    Y = []
    for i, prod in enumerate(products):
        prev = prod + "_prev1"
        prX = train_df[(train_df[prod] == 1) & (train_df[prev] == 0)]
        prY = np.zeros(prX.shape[0], dtype=np.int8) + i
        X.append(prX)
        Y.append(prY)
        print(prod, prX.shape)
    XY = pd.concat(X)
    Y = np.hstack(Y)
    XY["y"] = Y
    XY["url"] = np.zeros(len(XY), dtype=np.int8)
    del train_df
    del all_df

    # Rows duplicated across several added products get down-weighted:
    # weight = exp(1/count - 1) (1.0 for unique rows, <1 otherwise).
    XY["ncodepers_fecha_dato"] = XY["ncodpers"].astype(str) + XY["fecha_dato"]
    uniqs, counts = np.unique(XY["ncodepers_fecha_dato"], return_counts=True)
    weights = np.exp(1 / counts - 1)
    print(np.unique(counts, return_counts=True))
    print(np.unique(weights, return_counts=True))
    wdf = pd.DataFrame()
    wdf["ncodepers_fecha_dato"] = uniqs
    wdf["counts"] = counts
    wdf["weight"] = weights
    print("before merge", len(XY))
    XY = XY.merge(wdf, on="ncodepers_fecha_dato")
    print("after merge", len(XY))
    print(XY.shape)

    # Random 80/20 train/validate split.
    mask = np.random.rand(len(XY)) < 0.8
    XY_train = XY[mask]
    XY_validate = XY[~mask]

    with Timer("prepare test data"):
        test_df["y"] = test_df["ncodpers"]
        test_df["url"] = np.zeros(len(test_df), dtype=np.int8)
        test_df["weight"] = np.ones(len(test_df), dtype=np.int8)
        # .values replaces the removed DataFrame.as_matrix API.
        Y_prev = test_df[list(prod_features)].values
        C = test_df[["ncodpers"]].values
        for prod in products:
            prev = prod + "_prev1"
            padd = prod + "_add"
            test_df[padd] = test_df[prod] - test_df[prev]
        test_add_mat = test_df[[prod + "_add" for prod in products]].values
        test_add_list = [list() for i in range(len(C))]
        assert test_add_mat.shape == (len(C), len(products))
        count = 0
        for c in range(len(C)):
            for p in range(len(products)):
                if test_add_mat[c, p] > 0:
                    test_add_list[c].append(p)
                    count += 1

    if cv:
        # Upper bound on MAP@7, and a rescaling coefficient for rows
        # that actually added at least one product.
        max_map7 = mapk(test_add_list, test_add_list, 7, 0.0)
        map7coef = float(len(test_add_list)) / float(sum([int(bool(a)) for a in test_add_list]))
        print("Max MAP@7", str_date, max_map7, max_map7 * map7coef)

    with Timer("LightGBM"):
        Y_test_lgbm = engines.lightgbm(XY_train, XY_validate, test_df, features,
                                       XY_all=XY, restore=(str_date == "2016-06-28"))
        # `with` guarantees the gzip stream is flushed and closed.
        out = io.BytesIO() if cv else gzip.open("tmp/%s.lightgbm.csv.gz" % str_date, "wb")
        with out:
            test_add_list_lightgbm = make_submission(out, Y_test_lgbm - Y_prev, C)
        if cv:
            map7lightgbm = mapk(test_add_list, test_add_list_lightgbm, 7, 0.0)
            print("LightGBMlib MAP@7", str_date, map7lightgbm, map7lightgbm * map7coef)

    with Timer("XGBoost"):
        Y_test_xgb = engines.xgboost(XY_train, XY_validate, test_df, features,
                                     XY_all=XY, restore=(str_date == "2016-06-28"))
        out = io.BytesIO() if cv else gzip.open("tmp/%s.xgboost.csv.gz" % str_date, "wb")
        with out:
            test_add_list_xgboost = make_submission(out, Y_test_xgb - Y_prev, C)
        if cv:
            map7xgboost = mapk(test_add_list, test_add_list_xgboost, 7, 0.0)
            print("XGBoost MAP@7", str_date, map7xgboost, map7xgboost * map7coef)

    # Blend the two models with a geometric mean of their scores.
    Y_test = np.sqrt(np.multiply(Y_test_xgb, Y_test_lgbm))
    out = io.BytesIO() if cv else gzip.open("tmp/%s.xgboost-lightgbm.csv.gz" % str_date, "wb")
    with out:
        test_add_list_xl = make_submission(out, Y_test - Y_prev, C)
    if cv:
        map7xl = mapk(test_add_list, test_add_list_xl, 7, 0.0)
        print("XGBoost+LightGBM MAP@7", str_date, map7xl, map7xl * map7coef)


if __name__ == "__main__":
    if True:  # flip to False to reuse the cached frames from tmp/
        all_df, features, prod_features = make_data()
        with Timer("save data"):
            all_df.to_pickle("tmp/cv_data.pickle")
            with open("tmp/cv_meta.pickle", "wb") as meta_f:
                pickle.dump((features, prod_features), meta_f)
    else:
        with Timer("restore data"):
            all_df = pd.read_pickle("tmp/cv_data.pickle")
            with open("tmp/cv_meta.pickle", "rb") as meta_f:
                (features, prod_features) = pickle.load(meta_f)
    train_predict(all_df, features, prod_features, "2016-05-28", cv=True)
    train_predict(all_df, features, prod_features, "2016-06-28", cv=False)