# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from collections import defaultdict
from datetime import datetime, timezone

DATES_KEY = "dates"
COMMITS_KEY = "git-commits"
HISTORY_KEY = "history"
NAME_KEY = "name"
TYPE_KEY = "type"
REFLOG_KEY = "reflog-index"


def is_test_probe(probe_type, name):
    """
    Tell whether a probe is a Telemetry test-only probe.

    :param probe_type: the probe type (e.g. 'histogram', 'scalar', 'event').
    :param name: the probe name.
    :return: True if the name carries the test prefix for its probe type,
             False otherwise (including for unknown probe types).
    """
    if probe_type == 'histogram':
        # These are test-only probes and never sent out.
        return name.startswith("TELEMETRY_TEST_")
    elif probe_type in ['scalar', 'event']:
        return name.startswith("telemetry.test.")

    return False


def get_from_nested_dict(dictionary, path, default=None):
    """
    Look up a value in nested dictionaries using a '/'-separated path.

    :param dictionary: the outermost dictionary.
    :param path: the keys to follow, joined by '/' (e.g. "details/kind").
    :param default: the value returned when the path cannot be resolved.
    :return: the value found at `path`, or `default` if any key along the
             path is missing.
    """
    keys = path.split('/')
    for k in keys[:-1]:
        try:
            dictionary = dictionary[k]
        except KeyError:
            # A missing intermediate level means the leaf is missing too.
            return default
    return dictionary.get(keys[-1], default)


def get_probe_id(probe_type, name):
    """Build the probe identifier, of the form "<probe_type>/<name>"."""
    return probe_type + "/" + name


def probes_equal(probe1, probe2):
    """
    Compare two probe definitions on the properties listed below.

    Any other key (e.g. the "revisions" and "versions" bookkeeping added by
    `extract_node_data`) is ignored.

    :return: True if all the compared properties are equal, False otherwise.
    """
    props = [
        # Common.
        "cpp_guard",
        "optout",
        "notification_emails",
        # Histograms & scalars.
        "details/keyed",
        "details/kind",
        # Histograms.
        "details/n_buckets",
        "details/n_values",
        "details/low",
        "details/high",
        "details/record_in_processes",
        "details/labels",
        # Events.
        "details/methods",
        "details/objects",
        "details/extra_keys",
    ]

    return all(
        get_from_nested_dict(probe1, prop) == get_from_nested_dict(probe2, prop)
        for prop in props
    )


def extract_node_data(node_id, channel, probe_type, probe_data, result_data,
                      version, break_by_channel):
    """
    Extract the probe data for one revision and merge it into result_data.

    :param node_id: the revision the probe data comes from.
    :param channel: the channel the probe was found in.
    :param probe_type: the probe type (e.g. 'histogram').
    :param probe_data: the probes of type `probe_type` found in this
           revision, as a dictionary of the form:
            {
              name: { ...probe definition... },
              ...
            }
           Probe definitions that start a new history entry are annotated in
           place with "revisions" and "versions" keys.
    :param result_data: the dictionary to which the processed probe data is
           appended. Extracted probe data is added to it in the form:
            {
              channel: {
                probe_id: {
                  type: 'histogram',
                  name: 'some-name',
                  history: {
                    channel: [
                      {
                        optout: True,
                        ...
                        revisions: {first: ..., last: ...},
                        versions: {first: ..., last: ...}
                      },
                      ...
                    ]
                  }
                }
              }
            }
           The outermost `channel` level is only present if
           break_by_channel is True.
    :param version: a human readable version string.
    :param break_by_channel: True if probe data for different channels needs
           to be stored separately, False otherwise. If True, probe data will
           be saved to result_data[channel] instead of just result_data.
    """
    for name, probe in probe_data.items():
        # Telemetry's test probes are never submitted to the servers.
        if is_test_probe(probe_type, name):
            continue

        storage = result_data
        if break_by_channel:
            storage = result_data.setdefault(channel, {})

        probe_id = get_probe_id(probe_type, name)
        if probe_id in storage and channel in storage[probe_id][HISTORY_KEY]:
            # If the probe's state didn't change compared to the most recently
            # recorded entry, just move that entry's "first" markers to this
            # revision (transform() feeds revisions in descending order) and
            # continue.
            previous = storage[probe_id][HISTORY_KEY][channel][-1]
            if probes_equal(previous, probe):
                previous["revisions"]["first"] = node_id
                previous["versions"]["first"] = version
                continue

        entry = storage.setdefault(probe_id, {
            TYPE_KEY: probe_type,
            NAME_KEY: name,
            HISTORY_KEY: {},
        })
        channel_history = entry[HISTORY_KEY].setdefault(channel, [])

        # The definition changed (or is new): start a new history entry that,
        # for now, begins and ends at this revision.
        probe["revisions"] = {
            "first": node_id,
            "last": node_id,
        }
        probe["versions"] = {
            "first": version,
            "last": version,
        }
        channel_history.append(probe)


def _node_lists_by_channel(node_data):
    """Group the nodes of each channel into lists of {node_id, version}."""
    channels = defaultdict(list)
    for channel, nodes in node_data.items():
        for node_id, data in nodes.items():
            channels[channel].append({
                'node_id': node_id,
                'version': data['version'],
            })
    return channels


def sorted_node_lists_by_channel(node_data):
    """
    List the nodes of each channel, sorted by descending integer version.

    :param node_data: a dictionary of the form
           { channel: { node_id: { "version": ..., ... }, ... }, ... }.
    :return: a dictionary mapping each channel that has nodes to a list of
             { "node_id": ..., "version": ... } entries.
    """
    channels = _node_lists_by_channel(node_data)
    for channel, data in channels.items():
        channels[channel] = sorted(data, key=lambda n: int(n["version"]), reverse=True)
    return channels


def sorted_node_lists_by_date(node_data, revision_dates):
    """
    List the nodes of each channel, sorted by descending revision date.

    :param node_data: a dictionary of the form
           { channel: { node_id: { "version": ..., ... }, ... }, ... }.
    :param revision_dates: a dictionary of the form
           { channel: { node_id: { "date": ..., ... }, ... }, ... }.
    :return: a dictionary mapping each channel that has nodes to a list of
             { "node_id": ..., "version": ... } entries.
    """
    channels = _node_lists_by_channel(node_data)
    for channel, data in channels.items():
        # Bind the per-channel date table explicitly instead of relying on the
        # loop variable being captured by a closure.
        channel_dates = revision_dates[channel]
        channels[channel] = sorted(
            data,
            key=lambda n: channel_dates[n["node_id"]]["date"],
            reverse=True,
        )
    return channels


def transform(probe_data, node_data, break_by_channel, revision_dates=None):
    """ Transform the probe data into the final format.

    :param probe_data: the preprocessed probe data, indexed as
           probe_data[channel][node_id][probe_type].
    :param node_data: the raw probe data.
    :param break_by_channel: True if we want the probe output grouped by
           release channel.
    :param revision_dates: (optional) A dictionary of channel-revisions and
           their publish date, used to sort the revisions. If omitted, the
           revisions are sorted by version instead.
    :return: the dictionary filled in by `extract_node_data`.
    """
    if revision_dates is None:
        channels = sorted_node_lists_by_channel(node_data)
    else:
        channels = sorted_node_lists_by_date(node_data, revision_dates)

    result_data = {}
    for channel, channel_data in channels.items():
        print("\n" + channel + " - transforming probe data:")
        for entry in channel_data:
            node_id = entry['node_id']
            readable_version = str(entry["version"])
            print(" from: " + str({"node": node_id, "version": readable_version}))
            for probe_type, probes in probe_data[channel][node_id].items():
                # Group the probes by the release channel, if requested
                extract_node_data(node_id, channel, probe_type, probes, result_data,
                                  readable_version, break_by_channel)

    return result_data


def get_minimum_date(probe_data, revision_data, revision_dates):
    """
    Find, per probe and channel, the minimum date among the "first"
    revisions of the probe's history entries.

    :param probe_data: the preprocessed probe data (see `transform`).
    :param revision_data: the node data handed to `transform`.
    :param revision_dates: a dictionary of the form
           { channel: { node_id: { "date": ..., ... }, ... }, ... }.
    :return: a nested defaultdict of the form { probe_id: { channel: date } }.
    """
    probe_histories = transform(probe_data, revision_data,
                                break_by_channel=True,
                                revision_dates=revision_dates)
    min_dates = defaultdict(lambda: defaultdict(str))

    for channel, probes in probe_histories.items():
        for probe_id, entry in probes.items():
            dates = [
                revision_dates[channel][history['revisions']['first']]["date"]
                for history in entry[HISTORY_KEY][channel]
            ]
            min_dates[probe_id][channel] = min(dates)

    return min_dates


def pretty_ts(ts):
    """
    Format a POSIX timestamp as a naive UTC string, "YYYY-MM-DD HH:MM:SS".

    :param ts: the timestamp, in seconds since the epoch.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Convert
    # through an aware datetime and drop the tzinfo to keep the same output.
    utc_time = datetime.fromtimestamp(ts, tz=timezone.utc)
    return utc_time.replace(tzinfo=None).isoformat(' ')


def make_item_defn(definition, commit, commit_timestamps):
    """
    Stamp a definition, in place, with the commit it was seen in.

    :param definition: the item definition to annotate.
    :param commit: the hash of the commit the definition was seen in.
    :param commit_timestamps: a dictionary of the form
           { <commit-hash>: (<commit-timestamp>, <commit-index>), ... }.
    :return: the same `definition` object, with its commit, date and reflog
             ranges created (first sighting) or their "last" end updated.
    """
    commit_info = commit_timestamps[commit]
    commit_date = pretty_ts(commit_info[0])
    reflog_index = commit_info[1]

    if COMMITS_KEY not in definition:
        # This is the first time we've seen this definition
        definition[COMMITS_KEY] = {
            "first": commit,
            "last": commit
        }
        definition[DATES_KEY] = {
            "first": commit_date,
            "last": commit_date
        }
        definition[REFLOG_KEY] = {
            "first": reflog_index,
            "last": reflog_index
        }
    else:
        # we've seen this definition, update the `last` commit
        definition[COMMITS_KEY]["last"] = commit
        definition[DATES_KEY]["last"] = commit_date
        definition[REFLOG_KEY]["last"] = reflog_index

    return definition


def metrics_equal(def1, def2):
    """Compare two metric definitions on the fixed set of keys below."""
    compared_keys = (
        'bugs',
        'data_reviews',
        'description',
        'disabled',
        'labeled',
        'labels',
        'lifetime',
        'notification_emails',
        'send_in_pings',
        'time_unit',
        'type',
        'version',
    )
    return all(def1.get(key) == def2.get(key) for key in compared_keys)


def ping_equal(def1, def2):
    """Compare two ping definitions on every key except the bookkeeping ones."""
    # Test all keys except the ones the probe-scraper adds
    ignored_keys = {DATES_KEY, COMMITS_KEY, HISTORY_KEY, REFLOG_KEY}
    all_keys = set(def1.keys()).union(def2.keys()).difference(ignored_keys)

    return all(def1.get(key) == def2.get(key) for key in all_keys)


def metric_constructor(defn, metric):
    """Build the output entry for a metric from its first definition."""
    return {
        TYPE_KEY: defn[TYPE_KEY],
        NAME_KEY: metric,
        HISTORY_KEY: [defn]
    }


def ping_constructor(defn, metric):
    """Build the output entry for a ping from its first definition."""
    return {
        NAME_KEY: metric,
        HISTORY_KEY: [defn]
    }


def update_or_add_item(repo_items, commit_hash, item, definition, commit_timestamps,
                       equal_fn, type_ctor):
    """
    Record the definition of `item` seen at `commit_hash` into `repo_items`.

    :param repo_items: the items collected so far for one repository, keyed
           by item name. Updated in place.
    :param commit_hash: the hash of the commit the definition comes from.
    :param item: the item name.
    :param definition: the item definition found in that commit.
    :param commit_timestamps: a dictionary of the form
           { <commit-hash>: (<commit-timestamp>, <commit-index>), ... }.
    :param equal_fn: a function comparing two definitions for equality.
    :param type_ctor: a function building a new entry from
           (definition, item name).
    :return: `repo_items`.
    """
    # We haven't seen this item before, add it
    if item not in repo_items:
        defn = make_item_defn(definition, commit_hash, commit_timestamps)
        repo_items[item] = type_ctor(defn, item)
        return repo_items

    # We've seen this item before: pick the definition with the latest
    # `last` date (the first such one on ties) to compare against.
    prev_defns = repo_items[item][HISTORY_KEY]
    max_defn_i = max(range(len(prev_defns)),
                     key=lambda i: datetime.fromisoformat(prev_defns[i][DATES_KEY]["last"]))
    max_defn = prev_defns[max_defn_i]

    if equal_fn(definition, max_defn):
        # If equal to previous commit, update date and commit on existing definition
        new_defn = make_item_defn(max_defn, commit_hash, commit_timestamps)
        repo_items[item][HISTORY_KEY][max_defn_i] = new_defn
    else:
        # Otherwise, append the changed definition to the existing item's history
        new_defn = make_item_defn(definition, commit_hash, commit_timestamps)
        repo_items[item][HISTORY_KEY] = prev_defns + [new_defn]

    return repo_items


def transform_by_hash(commit_timestamps, data, equal_fn, type_ctor):
    """
    :param commit_timestamps - of the form
      <repo_name>: {
        <commit-hash>: (<commit-timestamp>, <commit-index>),
        ...
      }

    :param data - of the form
      <repo_name>: {
        <commit-hash>: {
          <item-name>: {
            ...
          },
        },
        ...
      }

    :param equal_fn - a function comparing two item definitions
    :param type_ctor - a function building a new output entry from
      (definition, item name)

    Outputs deduplicated data of the form
      <repo_name>: {
        <name>: {
          "type": <type>,
          "name": <name>,
          "history": [
            {
              "bugs": [<bug#>, ...],
              ...other info (from metrics.yaml or pings.yaml)...,
              "git-commits": {
                "first": <hash>,
                "last": <hash>
              },
              "dates": {
                "first": <datetime>,
                "last": <datetime>
              },
              "reflog-index": {
                "first": <commit-index>,
                "last": <commit-index>
              }
            },
          ]
        }
      }
    """

    # We need to sort by timestamp in ascending order, but reflog index in
    # descending order.
    def timestamp_sorter(entry):
        return (entry[0], -entry[1])

    all_items = {}
    for repo_name, commits in data.items():
        repo_items = {}
        repo_timestamps = commit_timestamps[repo_name]

        # iterate through commits, sorted by timestamp of the commit
        sorted_commits = sorted(
            commits.items(),
            key=lambda x_y: timestamp_sorter(repo_timestamps[x_y[0]])
        )

        for commit_hash, items in sorted_commits:
            for item, definition in items.items():
                repo_items = update_or_add_item(repo_items, commit_hash, item, definition,
                                                repo_timestamps, equal_fn, type_ctor)

        all_items[repo_name] = repo_items

    return all_items


def transform_metrics_by_hash(commit_timestamps, metric_data):
    """Run `transform_by_hash` with the metric comparison and constructor."""
    return transform_by_hash(commit_timestamps, metric_data, metrics_equal, metric_constructor)


def transform_pings_by_hash(commit_timestamps, ping_data):
    """Run `transform_by_hash` with the ping comparison and constructor."""
    return transform_by_hash(commit_timestamps, ping_data, ping_equal, ping_constructor)