""" This trains an ensemble model for TAAR based on a set of constituent recommenders (collaborative, locale and similarity). We take firefox client_info data from the clients_daily table and and obtain the most recent data. For each client with N addons, We mask the most recently installed addon to use as the best suggestion. Using the N-1 addon list - we generate recommendations for each of the 3 base models outputting GUID and weight for each recommendation. We compute CLLR values substituting in 0 in the edge case where CLLR computes a NaN value for the recommendation set from each recommender. We then compute a Vector with (has_match, weight=1.0, features=[cllr_1, cllr_2, cllr_3]) and then train a LogisticRegression model to compute coefficients for each of the recommenders. """ import click import boto3 import json import numpy as np import os import sys import tempfile import contextlib import shutil from datetime import date, timedelta from importlib import reload from pyspark.ml.classification import LogisticRegression from pyspark.ml.linalg import Vectors from pyspark.sql import Row from pyspark.sql import SparkSession from pyspark.sql.functions import udf, size, rand from pyspark.sql.types import ArrayType from pyspark.sql.types import StringType from pyspark import SparkConf from taar.context import default_context # Define the set of feature names to be used in the donor computations. CATEGORICAL_FEATURES = ["geo_city", "locale", "os"] CONTINUOUS_FEATURES = [ "subsession_length", "bookmark_count", "tab_open_count", "total_uri", "unique_tlds", ] def get_df(spark, date_from, sample_rate): gs_url = "gs://moz-fx-data-derived-datasets-parquet/clients_daily/v6" parquetFile = spark.read.parquet(gs_url) # Use the parquet files to create a temporary view and then used in SQL statements. parquetFile.createOrReplaceTempView("clients_daily") df = ( spark.sql("SELECT * FROM clients_daily") .where("active_addons IS NOT null") .where("size(active_addons) > 2") .where("size(active_addons) < 100") .where("channel = 'release'") .where("app_name = 'Firefox'") .where("submission_date_s3 >= {}".format(date_from)) .selectExpr( "client_id as client_id", "active_addons as active_addons", "city as geo_city", "subsession_hours_sum as subsession_length", "locale as locale", "os as os", "row_number() OVER (PARTITION BY client_id ORDER BY submission_date_s3 desc) as rn", "places_bookmarks_count_mean AS bookmark_count", "scalar_parent_browser_engagement_tab_open_event_count_sum AS tab_open_count", "scalar_parent_browser_engagement_total_uri_count_sum AS total_uri", "scalar_parent_browser_engagement_unique_domains_count_max AS unique_tlds", ) .where("rn = 1") .drop("rn") ).sample(False, sample_rate) return df def get_addons_per_client(users_df, minimum_addons_count): """ Extracts a DataFrame that contains one row for each client along with the list of active add-on GUIDs. 
""" def is_valid_addon(addon): return not ( addon.is_system or addon.app_disabled or addon.type != "extension" or addon.user_disabled or addon.foreign_install or addon.install_day is None ) # may need additional whitelisting to remove shield addons def get_valid_addon_ids(addons): sorted_addons = sorted( [(a.addon_id, a.install_day) for a in addons if is_valid_addon(a)], key=lambda addon_tuple: addon_tuple[1], ) return [addon_id for (addon_id, install_day) in sorted_addons] get_valid_addon_ids_udf = udf(get_valid_addon_ids, ArrayType(StringType())) # Create an add-ons dataset un-nesting the add-on map from each # user to a list of add-on GUIDs. Also filter undesired add-ons. return users_df.select( "client_id", get_valid_addon_ids_udf("active_addons").alias("addon_ids") ).filter(size("addon_ids") > minimum_addons_count) def safe_get_int(row, fieldname, default, factor=None): tmp = getattr(row, fieldname, default) if tmp is None: return 0 try: if factor is not None: tmp *= factor tmp = int(tmp) except Exception: return 0 return tmp def safe_get_str(row, fieldname): tmp = getattr(row, fieldname, "") if tmp is None: return "" return str(tmp) def row_to_json(row): jdata = {} # This is not entirely obvious. All of our row data from raw telemetry uses *real* # client_ids. The production TAAR system only uses hashed telemetry client IDs. # That said - we don't need to hash because we are only concerned # with GUID recommendations here for the purposes of training jdata["client_id"] = row.client_id # Note the inconsistent naming of the addon ID field jdata["installed_addons"] = row.addon_ids jdata["bookmark_count"] = safe_get_int(row, "bookmark_count", 0) jdata["tab_open_count"] = safe_get_int(row, "tab_open_count", 0) jdata["total_uri"] = safe_get_int(row, "total_uri", 0) jdata["subsession_length"] = safe_get_int(row, "subsession_length", 0, 3600) jdata["unique_tlds"] = safe_get_int(row, "unique_tlds", 0) jdata["geo_city"] = safe_get_str(row, "geo_city") jdata["locale"] = safe_get_str(row, "locale") jdata["os"] = safe_get_str(row, "os") return jdata def reload_configuration(): """ Configuration needs to be reloaded on a per worker node basis. This is an unfortunate a side effect of re-using the TAAR library which expects to be using python-decouple to load the configuration from enviroment variables. """ from taar.recommenders import s3config # Locale Recommender Overrides # This *Must* be called just prior to instantiating the individual recommenders in the # ETL enviroment. 
    s3config.TAAR_LOCALE_BUCKET = os.environ[
        "TAAR_LOCALE_BUCKET"
    ] = "telemetry-parquet"
    s3config.TAAR_LOCALE_KEY = os.environ[
        "TAAR_LOCALE_KEY"
    ] = "taar/locale/top10_dict.json"

    # Similarity Recommender configuration overrides
    s3config.TAAR_SIMILARITY_BUCKET = os.environ[
        "TAAR_SIMILARITY_BUCKET"
    ] = "telemetry-parquet"
    s3config.TAAR_SIMILARITY_DONOR_KEY = os.environ[
        "TAAR_SIMILARITY_DONOR_KEY"
    ] = "taar/similarity/donors.json"
    s3config.TAAR_SIMILARITY_LRCURVES_KEY = os.environ[
        "TAAR_SIMILARITY_LRCURVES_KEY"
    ] = "taar/similarity/lr_curves.json"

    # Collaborative Recommender Overrides
    s3config.TAAR_ITEM_MATRIX_BUCKET = os.environ[
        "TAAR_ITEM_MATRIX_BUCKET"
    ] = "telemetry-public-analysis-2"
    s3config.TAAR_ITEM_MATRIX_KEY = os.environ[
        "TAAR_ITEM_MATRIX_KEY"
    ] = "telemetry-ml/addon_recommender/item_matrix.json"
    s3config.TAAR_ADDON_MAPPING_BUCKET = os.environ[
        "TAAR_ADDON_MAPPING_BUCKET"
    ] = "telemetry-public-analysis-2"
    s3config.TAAR_ADDON_MAPPING_KEY = os.environ[
        "TAAR_ADDON_MAPPING_KEY"
    ] = "telemetry-ml/addon_recommender/addon_mapping.json"

    from taar.recommenders import LocaleRecommender
    from taar.recommenders import SimilarityRecommender
    from taar.recommenders import CollaborativeRecommender

    reload(sys.modules["taar.recommenders"])

    # Force reload of recommender modules
    [
        reload(sys.modules[rec_cls.__module__])
        for rec_cls in [
            LocaleRecommender,
            SimilarityRecommender,
            CollaborativeRecommender,
        ]
    ]


COLLABORATIVE, SIMILARITY, LOCALE = "collaborative", "similarity", "locale"
PREDICTOR_ORDER = [COLLABORATIVE, SIMILARITY, LOCALE]


def load_recommenders(ctx):
    from taar.recommenders import LocaleRecommender
    from taar.recommenders import SimilarityRecommender
    from taar.recommenders import CollaborativeRecommender

    ctx = default_context()
    reload_configuration()
    lr = LocaleRecommender(ctx)
    sr = SimilarityRecommender(ctx)
    cr = CollaborativeRecommender(ctx)
    return {LOCALE: lr, COLLABORATIVE: cr, SIMILARITY: sr}


# Make predictions with sub-models and construct a new stacked row
def to_stacked_row(recommender_list, client_row):
    # Build a Row object with a label indicating 1 or 0 for a match
    # within at least one recommender. Weight is set to 1.0 as the
    # features will use a cllr result indicating 'matchiness' with the
    # known truth.
    try:
        training_client_info = row_to_json(client_row)

        # Pop off a single addon as the expected set.
        # I've tried a couple variations on this (pop 1 item, pop 2 items)
        # but there isn't much effect.
        expected = [training_client_info["installed_addons"].pop()]

        stacked_row = []

        cLLR = CostLLR()

        for recommend in recommender_list:
            guid_weight_list = recommend(training_client_info, limit=4)
            cllr_val = cLLR.evalcllr(guid_weight_list, expected)
            stacked_row.append(cllr_val)

        return Row(
            label=int(cLLR.total > 0.0),
            weight=1.0,
            features=Vectors.dense(*stacked_row),
        )
    except Exception:
        # This shouldn't happen. Log relevant data so that we can
        # patch this up on the next run.
        return None


# Stack the prediction results for each recommender into a stacked_row
# for each client_info blob in the training set.
def build_stacked_datasets(ctx, dataset, folds):
    # For each of k_folds, we apply the stacking function to the
    # training fold.
    # Where k_folds = 3, this will yield a list consisting of 3 RDDs.
    # Each RDD is defined by the output of the `stacking` function.
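    # For example (illustrative): with folds [A, B, C, D] the training sets
    # are [B, C, D], [A, C, D], [A, B, D] and [A, B, C]; each fold in a
    # training set is mapped through `to_stacked_row` to produce one RDD of
    # labelled rows.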
    def stacked_row_closure():
        rec_map = load_recommenders(ctx)

        recommender_list = [
            rec_map[COLLABORATIVE].recommend,  # Collaborative
            rec_map[SIMILARITY].recommend,  # Similarity
            rec_map[LOCALE].recommend,  # Locale
        ]

        def inner(client_row):
            return to_stacked_row(recommender_list, client_row)

        return inner

    wrapped_to_stacked_row = stacked_row_closure()

    print("Number of folds: {}".format(len(folds)))
    stacked_datasets = []
    for fold in folds:
        train_set = [f for f in folds if f != fold]
        stacking_result = [
            df.rdd.map(wrapped_to_stacked_row).filter(lambda x: x is not None)
            for df in train_set
        ]
        stacked_datasets.append(stacking_result)

    return stacked_datasets


def dump_training_info(blorModel):
    """
    This function is useful for debugging when we do not converge to a
    solution during LogisticRegression.
    """
    trainingSummary = blorModel.summary
    print("Total iterations: %d" % trainingSummary.totalIterations)
    print("Intercepts: " + str(blorModel.intercept))
    print("Coefficients: " + str(blorModel.coefficients))

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)


def today_minus_7_days():
    return (date.today() + timedelta(days=-7)).strftime("%Y%m%d")


def verify_valid_coefs(coefs):
    """ verify that the model has proper floating point values (> 0) """
    assert "ensemble_weights" in coefs
    weights = coefs["ensemble_weights"]

    assert len(weights) == 3

    for key in weights.keys():
        assert key in coefs["ensemble_weights"]
        assert not np.isnan(coefs["ensemble_weights"][key])
        assert coefs["ensemble_weights"][key] > 0.0

    # This ordering must be strict
    msg = """
    FINAL WEIGHTS
    =============
    Collab     : {:0.8f}
    Locale     : {:0.8f}
    Similarity : {:0.8f}
    """.format(
        weights["collaborative"], weights["locale"], weights["similarity"]
    )
    print("Weight output")
    print("================================")
    print(msg)
    print("================================")

    assert weights["collaborative"] > 0.0
    assert weights["locale"] > 0.0
    assert weights["similarity"] > 0.0


class CostLLR:
    """ based on Niko Brummer's original implementation:
    Niko Brummer and Johan du Preez, "Application-Independent Evaluation
    of Speaker Detection", Computer Speech and Language, 2005
    """

    def __init__(self):
        self._total = 0

    # evalcllr expects two lists
    # recommendations_list should be a list of (guid, weight) 2-tuples
    # unmasked_addons should be a list of guid strings
    def evalcllr(self, recommendations_list, unmasked_addons):
        # Organizer function to extract weights from the recommendation
        # list for passing to cllr.
        lrs_on_target_helper = np.array(
            [
                item[1]
                for item in recommendations_list
                if item[0] in unmasked_addons
            ]
        )
        lrs_off_target_helper = np.array(
            [
                item[1]
                for item in recommendations_list
                if item[0] not in unmasked_addons
            ]
        )
        try:
            tmp = self._cllr(lrs_on_target_helper, lrs_off_target_helper)
        except Exception:
            tmp = np.NaN

        if np.isnan(tmp):
            # This may happen if recommendations come back with a
            # weight of 0
            tmp = 0
        self._total += tmp
        return tmp

    @property
    def total(self):
        return self._total

    # Private methods below

    # Helper function to do some math for cllr.
    def _neg_log_sig(self, log_odds):
        neg_log_odds = [-1.0 * x for x in log_odds]
        e = np.exp(neg_log_odds)
        return [np.log(1 + f) for f in e if f < (f + 1)]

    # Compute the log likelihood ratio cost which should be minimized.
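    # Roughly (in log base 2, matching the code below):
    #   cllr = 0.5 * (mean(log2(1 + 1/LR_target)) + mean(log2(1 + LR_nontarget)))
    # where LR_target are the weights the recommender assigned to the masked
    # (correct) addon and LR_nontarget are the weights assigned to everything
    # else; lower values are better.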
    def _cllr(self, lrs_on_target, lrs_off_target):
        lrs_on_target = np.log(lrs_on_target[~np.isnan(lrs_on_target)])
        lrs_off_target = np.log(lrs_off_target[~np.isnan(lrs_off_target)])

        c1 = np.mean(self._neg_log_sig(lrs_on_target)) / np.log(2)
        c2 = np.mean(self._neg_log_sig(-1.0 * lrs_off_target)) / np.log(2)
        return (c1 + c2) / 2


def cross_validation_split(dataset, k_folds):
    """
    Splits the dataframe into k_folds, returning an array of dataframes.
    """
    dataset_split = []
    h = 1.0 / k_folds
    df = dataset.select("*", rand().alias("rand"))

    for i in range(k_folds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df["rand"] >= validateLB) & (df["rand"] < validateUB)
        fold = df.filter(condition).cache()
        dataset_split.append(fold)

    return dataset_split


def verify_counts(taar_training, addons_info_df, client_samples_df):
    # This verification is only run to debug the job
    taar_training_count = taar_training.count()
    addons_info_count = addons_info_df.count()
    client_samples_count = client_samples_df.count()

    assert taar_training_count != client_samples_count
    assert taar_training_count == addons_info_count
    assert taar_training_count != client_samples_count

    # taar_training should contain exactly the same number of rows as
    # addons_info_df; clients from client_samples_df without enough
    # add-ons should have been filtered out.
    print(
        "All counts verified. taar_training_count == %d" % taar_training_count
    )


def extract(spark, date_from, minInstalledAddons, sample_rate):
    client_samples_df = get_df(spark, date_from, sample_rate)
    addons_info_df = get_addons_per_client(
        client_samples_df, minInstalledAddons
    )
    taar_training = addons_info_df.join(client_samples_df, "client_id", "inner")
    # verify_counts(taar_training, addons_info_df, client_samples_df)
    return taar_training


def compute_regression(spark, rdd_list, regParam, elasticNetParam):
    df0 = spark.sparkContext.union(rdd_list).toDF()

    blor = LogisticRegression(
        maxIter=50,
        regParam=regParam,
        weightCol="weight",
        elasticNetParam=elasticNetParam,
    )

    blorModel = blor.fit(df0)
    return blorModel


def transform(ctx, spark, taar_training, regParam, elasticNetParam):
    k_folds = 4

    df_folds = cross_validation_split(taar_training, k_folds)

    stacked_datasets_rdd_list = build_stacked_datasets(
        ctx, taar_training, df_folds
    )

    # Merge a list of RDD lists into a single RDD and then cast it into
    # a DataFrame.
    rdd_list = [
        spark.sparkContext.union(rdd_list)
        for rdd_list in stacked_datasets_rdd_list
    ]

    blorModel = compute_regression(spark, rdd_list, regParam, elasticNetParam)

    coefs = {
        "ensemble_weights": dict(
            [(k, v) for k, v in zip(PREDICTOR_ORDER, blorModel.coefficients)]
        )
    }
    verify_valid_coefs(coefs)

    return coefs


@contextlib.contextmanager
def selfdestructing_path(dirname):
    yield dirname
    shutil.rmtree(dirname)


def store_json_to_s3(json_data, base_filename, date, prefix, bucket):
    """Saves the JSON data to a local file and then uploads it to S3.

    Two copies of the file will get uploaded: one as
    "<base_filename>.json" and the other as
    "<base_filename><YYYYMMDD>.json" for backup purposes.

    :param json_data: A string with the JSON content to write.
    :param base_filename: A string with the base name of the file to use
        for saving locally and uploading to S3.
    :param date: A date string in the "YYYYMMDD" format.
    :param prefix: The S3 prefix.
    :param bucket: The S3 bucket name.
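
    For example (illustrative date value), with base_filename
    "ensemble_weight" and date "20190601" this uploads both
    "ensemble_weight.json" and "ensemble_weight20190601.json" under the
    given prefix.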
""" tempdir = tempfile.mkdtemp() with selfdestructing_path(tempdir): JSON_FILENAME = "{}.json".format(base_filename) FULL_FILENAME = os.path.join(tempdir, JSON_FILENAME) with open(FULL_FILENAME, "w+") as json_file: json_file.write(json_data) archived_file_copy = "{}{}.json".format(base_filename, date) # Store a copy of the current JSON with datestamp. write_to_s3(FULL_FILENAME, archived_file_copy, prefix, bucket) write_to_s3(FULL_FILENAME, JSON_FILENAME, prefix, bucket) def write_to_s3(source_file_name, s3_dest_file_name, s3_prefix, bucket): """Store the new json file containing current top addons per locale to S3. :param source_file_name: The name of the local source file. :param s3_dest_file_name: The name of the destination file on S3. :param s3_prefix: The S3 prefix in the bucket. :param bucket: The S3 bucket. """ client = boto3.client("s3", "us-west-2") transfer = boto3.s3.transfer.S3Transfer(client) # Update the state in the analysis bucket. key_path = s3_prefix + s3_dest_file_name transfer.upload_file(source_file_name, bucket, key_path) def load(coefs, date, prefix, bucket): store_json_to_s3( json.dumps(coefs, indent=2), "ensemble_weight", date, prefix, bucket, ) @click.command() @click.option("--date", required=True) @click.option("--aws_access_key_id", required=True) @click.option("--aws_secret_access_key", required=True) @click.option("--bucket", default="telemetry-parquet") @click.option("--prefix", default="taar/ensemble/") @click.option("--elastic_net_param", default=0.01) @click.option("--reg_param", default=0.1) @click.option("--min_installed_addons", default=4) @click.option("--client_sample_date_from", default=today_minus_7_days()) @click.option("--sample_rate", default=0.005) def main( date, aws_access_key_id, aws_secret_access_key, bucket, prefix, elastic_net_param, reg_param, min_installed_addons, client_sample_date_from, sample_rate, ): print("Sampling clients since {}".format(client_sample_date_from)) # Clobber the AWS access credentials os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key ctx = default_context() APP_NAME = "TaarEnsemble" conf = SparkConf().setAppName(APP_NAME) spark = SparkSession.builder.config(conf=conf).getOrCreate() taar_training = extract( spark, client_sample_date_from, min_installed_addons, sample_rate ) coefs = transform(ctx, spark, taar_training, reg_param, elastic_net_param) load(coefs, date, prefix, bucket) if __name__ == "__main__": main()