python source code of repository

# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import argparse
import concurrent.futures
import copy
import itertools
import json
import logging
import math
import os
import pickle
import shelve
import subprocess
import sys
import threading
from datetime import datetime
from functools import lru_cache
from typing import Generator, Iterable, List, NewType, Optional, Tuple

import hglib
import lmdb
import rs_parsepatch
from tqdm import tqdm

from bugbug import db, rust_code_analysis_server, utils
from bugbug.utils import LMDBDict

logger = logging.getLogger(__name__)

CommitDict = NewType("CommitDict", dict)

code_analysis_server: Optional[rust_code_analysis_server.RustCodeAnalysisServer] = None

hg_servers = list()
hg_servers_lock = threading.Lock()
thread_local = threading.local()

COMMITS_DB = "data/commits.json"
COMMIT_EXPERIENCES_DB = "commit_experiences.lmdb.tar.zst"
db.register(
    COMMITS_DB,
    "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst",
    13,
    [COMMIT_EXPERIENCES_DB],
)

path_to_component = None

EXPERIENCE_TIMESPAN = 90
EXPERIENCE_TIMESPAN_TEXT = f"{EXPERIENCE_TIMESPAN}_days"

SOURCE_CODE_TYPES_TO_EXT = {
    "Assembly": [".asm", ".S"],
    "Javascript": [".js", ".jsm", ".sjs"],
    "C/C++": [".c", ".cpp", ".cc", ".cxx", ".h", ".hh", ".hpp", ".hxx"],
    "Objective-C/C++": [".mm", ".m"],
    "Java": [".java"],
    "Python": [".py"],
    "Rust": [".rs"],
    "Kotlin": [".kt"],
    "HTML/XHTML/XUL": [".html", ".htm", ".xhtml", ".xht", ".xul"],
    "IDL/IPDL/WebIDL": [".idl", ".ipdl", ".webidl"],
}

OTHER_TYPES_TO_EXT = {
    "YAML": [".yaml", ".yml"],
    "Image": [
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".ico",
        ".icns",
        ".psd",
        ".tiff",
        ".ttf",
        ".bcmap",
        ".webp",
    ],
    "Archive": [".zip", ".gz", ".bz2", ".tar", ".xpi", ".jar"],
    "Video": [".mp4", ".webm", ".ogv", ".avi", ".mov", ".m4s", ".mgif"],
    "Audio": [".mp3", ".ogg", ".wav", ".flac", ".opus"],
    "Executable": [".exe", ".dll", ".so", ".class"],
    "Document": [".pdf", ".doc", ".otf"],
    "Documentation": [".rst", ".md"],
    "Build System File": [".build", ".mk", ".in"],
}

HARDCODED_TYPES = {".eslintrc.js": ".eslintrc.js"}

TYPES_TO_EXT = {**SOURCE_CODE_TYPES_TO_EXT, **OTHER_TYPES_TO_EXT}

EXT_TO_TYPES = {ext: typ for typ, exts in TYPES_TO_EXT.items() for ext in exts}


def get_type(path: str) -> str:
    file_name = os.path.basename(path)
    if file_name in HARDCODED_TYPES:
        return HARDCODED_TYPES[file_name]

    ext = os.path.splitext(path)[1].lower()
    return EXT_TO_TYPES.get(ext, ext)


class Commit:
    def __init__(
        self,
        node,
        author,
        desc,
        date,
        pushdate,
        bug_id,
        backsout,
        backedoutby,
        author_email,
        reviewers,
        ignored=False,
    ):
        self.node = node
        self.author = author
        self.bug_id = bug_id
        self.desc = desc
        self.date = date
        self.pushdate = pushdate
        self.backsout = backsout
        self.backedoutby = backedoutby
        self.author_email = author_email
        self.reviewers = reviewers
        self.ignored = ignored
        self.source_code_added = 0
        self.other_added = 0
        self.test_added = 0
        self.source_code_deleted = 0
        self.other_deleted = 0
        self.test_deleted = 0
        self.types = set()
        self.functions = {}
        self.seniority_author = 0.0
        self.total_source_code_file_size = 0
        self.average_source_code_file_size = 0
        self.maximum_source_code_file_size = 0
        self.minimum_source_code_file_size = 0
        self.source_code_files_modified_num = 0
        self.total_other_file_size = 0
        self.average_other_file_size = 0
        self.maximum_other_file_size = 0
        self.minimum_other_file_size = 0
        self.other_files_modified_num = 0
        self.total_test_file_size = 0
        self.average_test_file_size = 0
        self.maximum_test_file_size = 0
        self.minimum_test_file_size = 0
        self.test_files_modified_num = 0
        self.average_cyclomatic = 0.0
        self.average_halstead_N1 = 0.0
        self.average_halstead_n1 = 0.0
        self.average_halstead_N2 = 0.0
        self.average_halstead_n2 = 0.0
        self.average_source_loc = 0.0
        self.average_logical_loc = 0.0
        self.maximum_cyclomatic = 0
        self.maximum_halstead_N2 = 0
        self.maximum_halstead_n2 = 0
        self.maximum_halstead_N1 = 0
        self.maximum_halstead_n1 = 0
        self.maximum_source_loc = 0
        self.maximum_logical_loc = 0
        self.minimum_cyclomatic = sys.maxsize
        self.minimum_halstead_N1 = sys.maxsize
        self.minimum_halstead_n1 = sys.maxsize
        self.minimum_halstead_N2 = sys.maxsize
        self.minimum_halstead_n2 = sys.maxsize
        self.minimum_source_loc = sys.maxsize
        self.minimum_logical_loc = sys.maxsize
        self.total_cyclomatic = 0
        self.total_halstead_N1 = 0
        self.total_halstead_n1 = 0
        self.total_halstead_N2 = 0
        self.total_halstead_n2 = 0
        self.total_source_loc = 0
        self.total_logical_loc = 0

    def __eq__(self, other):
        assert isinstance(other, Commit)
        return self.node == other.node

    def __hash__(self):
        return hash(self.node)

    def set_files(self, files, file_copies):
        self.files = files
        self.file_copies = file_copies
        self.components = list(
            set(
                path_to_component[path.encode("utf-8")].tobytes().decode("utf-8")
                for path in files
                if path.encode("utf-8") in path_to_component
            )
        )
        self.directories = get_directories(files)
        return self

    def set_experience(
        self, exp_type, commit_type, timespan, exp_sum, exp_max, exp_min
    ):
        exp_str = f"touched_prev_{timespan}_{exp_type}_"
        if commit_type:
            exp_str += f"{commit_type}_"
        setattr(self, f"{exp_str}sum", exp_sum)
        if exp_type != "author":
            setattr(self, f"{exp_str}max", exp_max)
            setattr(self, f"{exp_str}min", exp_min)

    def to_dict(self) -> CommitDict:
        d = self.__dict__
        for f in ["file_copies"]:
            del d[f]
        d["types"] = list(d["types"])
        d["pushdate"] = str(d["pushdate"])
        d["date"] = str(d["date"])
        return CommitDict(d)


def get_directories(files):
    if isinstance(files, str):
        files = [files]

    directories = set()
    for path in files:
        path_dirs = (
            os.path.dirname(path).split("/", 2)[:2] if os.path.dirname(path) else []
        )
        if path_dirs:
            directories.update([path_dirs[0], "/".join(path_dirs)])
    return list(directories)


def is_wptsync(commit: dict) -> bool:
    return commit["author_email"] == "wptsync@mozilla.com" or any(
        s in commit["desc"] for s in ("wpt-pr:", "wpt-head:", "wpt-type:")
    )


def filter_commits(
    commits: Iterable[CommitDict],
    include_no_bug: bool = False,
    include_backouts: bool = False,
    include_ignored: bool = False,
) -> Generator[CommitDict, None, None]:
    for commit in commits:
        if not include_ignored and commit["ignored"]:
            continue

        if not include_no_bug and not commit["bug_id"]:
            continue

        if not include_backouts and len(commit["backsout"]) > 0:
            continue

        yield commit


def get_commits(
    include_no_bug: bool = False,
    include_backouts: bool = False,
    include_ignored: bool = False,
) -> Generator[CommitDict, None, None]:
    return filter_commits(
        db.read(COMMITS_DB),
        include_no_bug=include_no_bug,
        include_backouts=include_backouts,
        include_ignored=include_ignored,
    )


def _init_process(repo_dir):
    global HG, REPO_DIR
    REPO_DIR = repo_dir
    HG = hglib.open(REPO_DIR)
    get_component_mapping()


def _init_thread(repo_dir):
    hg_server = hglib.open(repo_dir)
    thread_local.hg = hg_server
    with hg_servers_lock:
        hg_servers.append(hg_server)


# This code was adapted from https://github.com/mozsearch/mozsearch/blob/2e24a308bf66b4c149683bfeb4ceeea3b250009a/router/router.py#L127
def is_test(path):
    return (
        "/test/" in path
        or "/tests/" in path
        or "/mochitest/" in path
        or "/unit/" in path
        or "/gtest/" in path
        or "testing/" in path
        or "/jsapi-tests/" in path
        or "/reftests/" in path
        or "/reftest/" in path
        or "/crashtests/" in path
        or "/crashtest/" in path
        or "/gtests/" in path
        or "/googletest/" in path
    )


def hg_modified_files(hg, commit):
    template = '{join(files,"|")}\\0{join(file_copies,"|")}\\0'
    args = hglib.util.cmdbuilder(
        b"log",
        template=template,
        no_merges=True,
        rev=commit.node.encode("ascii"),
        branch="tip",
    )
    x = hg.rawcommand(args)
    files_str, file_copies_str = x.split(b"\x00")[:-1]

    file_copies = {}
    for file_copy in file_copies_str.decode("utf-8").split("|"):
        if not file_copy:
            continue

        parts = file_copy.split(" (")
        copied = parts[0]
        orig = parts[1][:-1]
        file_copies[sys.intern(orig)] = sys.intern(copied)

    commit.set_files(
        [sys.intern(f) for f in files_str.decode("utf-8").split("|") if f], file_copies
    )


def get_functions_from_metrics(metrics_space):
    functions = []

    if (
        metrics_space["kind"] == "function"
        and metrics_space["name"]
        and metrics_space["name"] != "<anonymous>"
    ):
        functions.append(
            {
                "end_line": metrics_space["end_line"],
                "name": metrics_space["name"],
                "start_line": metrics_space["start_line"],
            }
        )

    for space in metrics_space["spaces"]:
        functions += get_functions_from_metrics(space)

    return functions


def get_touched_functions(metrics_space, deleted_lines, added_lines):
    touched_functions = set()
    touched_function_names = set()

    functions = get_functions_from_metrics(metrics_space)

    def get_touched(functions, lines):
        last_f = 0
        for line in lines:
            for function in functions[last_f:]:
                # Skip functions which we already passed.
                if function["end_line"] < line:
                    last_f += 1

                # If the line belongs to this function, add the function to the set of touched functions.
                elif function["start_line"] <= line:
                    touched_function_names.add(function["name"])
                    last_f += 1

    # Get functions touched by added lines.
    get_touched(functions, added_lines)

    # Map functions to their positions before the patch.
    prev_functions = copy.deepcopy(functions)

    for line in added_lines:
        for func in prev_functions:
            if line < func["start_line"]:
                func["start_line"] -= 1

            if line < func["end_line"]:
                func["end_line"] -= 1

    for line in deleted_lines:
        for func in prev_functions:
            if line < func["start_line"]:
                func["start_line"] += 1

            if line < func["end_line"]:
                func["end_line"] += 1

    # Get functions touched by removed lines.
    get_touched(prev_functions, deleted_lines)

    # Return touched functions, with their new positions.
    for function in functions:
        if function["name"] in touched_function_names:
            touched_functions.add(
                (function["name"], function["start_line"], function["end_line"])
            )

    return touched_functions


def get_metrics(commit, metrics_space):
    name = metrics_space["name"]
    error = metrics_space["kind"] in {"unit", "function"} and name == ""

    if metrics_space["kind"] == "function" and not error:
        metrics = metrics_space["metrics"]
        commit.total_cyclomatic += metrics["cyclomatic"]
        commit.total_halstead_n2 += metrics["halstead"]["n2"]
        commit.total_halstead_N2 += metrics["halstead"]["N2"]
        commit.total_halstead_n1 += metrics["halstead"]["n1"]
        commit.total_halstead_N1 += metrics["halstead"]["N1"]
        commit.total_source_loc += metrics["loc"]["sloc"]
        commit.total_logical_loc += metrics["loc"]["lloc"]

        commit.maximum_cyclomatic = max(
            commit.maximum_cyclomatic, metrics["cyclomatic"]
        )
        commit.maximum_halstead_n2 = max(
            commit.maximum_halstead_n2, metrics["halstead"]["n2"],
        )
        commit.maximum_halstead_N2 = max(
            metrics["halstead"]["N2"], commit.maximum_halstead_N2
        )
        commit.maximum_halstead_n1 = max(
            metrics["halstead"]["n1"], commit.maximum_halstead_n1,
        )
        commit.maximum_halstead_N1 = max(
            metrics["halstead"]["N1"], commit.maximum_halstead_N1
        )
        commit.maximum_source_loc = max(
            metrics["loc"]["sloc"], commit.maximum_source_loc
        )
        commit.maximum_logical_loc = max(
            metrics["loc"]["lloc"], commit.maximum_logical_loc
        )

        commit.minimum_cyclomatic = min(
            commit.minimum_cyclomatic, metrics["cyclomatic"]
        )
        commit.minimum_halstead_n2 = min(
            commit.minimum_halstead_n2, metrics["halstead"]["n2"],
        )
        commit.minimum_halstead_N2 = min(
            metrics["halstead"]["N2"], commit.minimum_halstead_N2
        )
        commit.minimum_halstead_n1 = min(
            metrics["halstead"]["n1"], commit.minimum_halstead_n1,
        )
        commit.minimum_halstead_N1 = min(
            metrics["halstead"]["N1"], commit.minimum_halstead_N1
        )
        commit.minimum_source_loc = min(
            metrics["loc"]["sloc"], commit.minimum_source_loc
        )
        commit.minimum_logical_loc = min(
            metrics["loc"]["lloc"], commit.minimum_logical_loc
        )

    for space in metrics_space["spaces"]:
        error |= get_metrics(commit, space)

    return error


def transform(hg: hglib.client, repo_dir: str, commit: Commit):
    hg_modified_files(hg, commit)

    if commit.ignored or len(commit.backsout) > 0 or commit.bug_id is None:
        return commit

    assert code_analysis_server is not None

    source_code_sizes = []
    other_sizes = []
    test_sizes = []
    metrics_file_count = 0

    patch = hg.export(revs=[commit.node.encode("ascii")], git=True)
    try:
        patch_data = rs_parsepatch.get_lines(patch)
    except Exception:
        logger.error(f"Exception while analyzing {commit.node}")
        raise

    for stats in patch_data:
        path = stats["filename"]

        if stats["binary"]:
            if not is_test(path):
                commit.types.add("binary")
            continue

        size = None
        after = None
        if not stats["deleted"]:
            try:
                after = hg.cat(
                    [os.path.join(repo_dir, path).encode("utf-8")],
                    rev=commit.node.encode("ascii"),
                )
                size = after.count(b"\n")
            except hglib.error.CommandError as e:
                if b"no such file in rev" not in e.err:
                    raise

        type_ = get_type(path)

        if is_test(path):
            commit.test_files_modified_num += 1

            commit.test_added += len(stats["added_lines"])
            commit.test_deleted += len(stats["deleted_lines"])

            if size is not None:
                test_sizes.append(size)

            # We don't have a 'test' equivalent of types, as most tests are JS,
            # so this wouldn't add useful information.
        elif type_ in SOURCE_CODE_TYPES_TO_EXT:
            commit.source_code_files_modified_num += 1

            commit.source_code_added += len(stats["added_lines"])
            commit.source_code_deleted += len(stats["deleted_lines"])

            if size is not None:
                source_code_sizes.append(size)

                if type_ != "IDL/IPDL/WebIDL":
                    metrics = code_analysis_server.metrics(path, after, unit=False)
                    if metrics.get("spaces"):
                        metrics_file_count += 1
                        error = get_metrics(commit, metrics["spaces"])
                        if error:
                            logger.debug(
                                f"rust-code-analysis error on commit {commit.node}, path {path}"
                            )

                        touched_functions = get_touched_functions(
                            metrics["spaces"],
                            stats["deleted_lines"],
                            stats["added_lines"],
                        )
                        if len(touched_functions) > 0:
                            commit.functions[path] = list(touched_functions)

                    # Replace type with "Objective-C/C++" if rust-code-analysis detected this is an Objective-C/C++ file.
                    if type_ == "C/C++" and metrics.get("language") == "obj-c/c++":
                        type_ = "Objective-C/C++"

            commit.types.add(type_)
        else:
            commit.other_files_modified_num += 1

            commit.other_added += len(stats["added_lines"])
            commit.other_deleted += len(stats["deleted_lines"])

            if size is not None:
                other_sizes.append(size)

            if type_:
                commit.types.add(type_)

    commit.total_source_code_file_size = sum(source_code_sizes)
    commit.average_source_code_file_size = (
        commit.total_source_code_file_size / len(source_code_sizes)
        if len(source_code_sizes) > 0
        else 0
    )
    commit.maximum_source_code_file_size = max(source_code_sizes, default=0)
    commit.minimum_source_code_file_size = min(source_code_sizes, default=0)

    commit.total_other_file_size = sum(other_sizes)
    commit.average_other_file_size = (
        commit.total_other_file_size / len(other_sizes) if len(other_sizes) > 0 else 0
    )
    commit.maximum_other_file_size = max(other_sizes, default=0)
    commit.minimum_other_file_size = min(other_sizes, default=0)

    commit.total_test_file_size = sum(test_sizes)
    commit.average_test_file_size = (
        commit.total_test_file_size / len(test_sizes) if len(test_sizes) > 0 else 0
    )
    commit.maximum_test_file_size = max(test_sizes, default=0)
    commit.minimum_test_file_size = min(test_sizes, default=0)

    if metrics_file_count:
        commit.average_cyclomatic = commit.total_cyclomatic / metrics_file_count
        commit.average_halstead_n2 = commit.total_halstead_n2 / metrics_file_count
        commit.average_halstead_N2 = commit.total_halstead_N2 / metrics_file_count
        commit.average_halstead_n1 = commit.total_halstead_n1 / metrics_file_count
        commit.average_halstead_N1 = commit.total_halstead_N1 / metrics_file_count
        commit.average_source_loc = commit.total_source_loc / metrics_file_count
        commit.average_logical_loc = commit.total_logical_loc / metrics_file_count
    else:
        # these values are initialized with sys.maxsize (because we take the min)
        # if no files, then reset them to 0 (it'd be stupid to have min > max)
        commit.minimum_cyclomatic = 0
        commit.minimum_halstead_N2 = 0
        commit.minimum_halstead_n2 = 0
        commit.minimum_halstead_N1 = 0
        commit.minimum_halstead_n1 = 0
        commit.minimum_source_loc = 0
        commit.minimum_logical_loc = 0

    return commit


def _transform(commit):
    return transform(HG, REPO_DIR, commit)


def hg_log(hg: hglib.client, revs: List[bytes]) -> List[Commit]:
    template = "{node}\\0{author}\\0{desc}\\0{date|hgdate}\\0{bug}\\0{backedoutby}\\0{author|email}\\0{pushdate|hgdate}\\0{reviewers}\\0{backsoutnodes}\\0"

    args = hglib.util.cmdbuilder(
        b"log", template=template, no_merges=True, rev=revs, branch="tip",
    )
    x = hg.rawcommand(args)
    out = x.split(b"\x00")[:-1]

    commits = []
    for rev in hglib.util.grouper(template.count("\\0"), out):
        assert b" " in rev[3]
        date = datetime.utcfromtimestamp(float(rev[3].split(b" ", 1)[0]))

        assert b" " in rev[7]
        pushdate_timestamp = rev[7].split(b" ", 1)[0]
        if pushdate_timestamp != b"0":
            pushdate = datetime.utcfromtimestamp(float(pushdate_timestamp))
        else:
            pushdate = datetime.utcnow()

        bug_id = int(rev[4].decode("ascii")) if rev[4] else None

        reviewers = (
            list(set(sys.intern(r) for r in rev[8].decode("utf-8").split(" ")))
            if rev[8] != b""
            else []
        )

        backsout = (
            list(set(sys.intern(r) for r in rev[9].decode("utf-8").split(" ")))
            if rev[9] != b""
            else []
        )

        commits.append(
            Commit(
                node=sys.intern(rev[0].decode("ascii")),
                author=sys.intern(rev[1].decode("utf-8")),
                desc=rev[2].decode("utf-8"),
                date=date,
                pushdate=pushdate,
                bug_id=bug_id,
                backsout=backsout,
                backedoutby=rev[5].decode("ascii"),
                author_email=rev[6].decode("utf-8"),
                reviewers=reviewers,
            )
        )

    return commits


def _hg_log(revs: List[bytes]) -> List[Commit]:
    return hg_log(thread_local.hg, revs)


def get_revs(hg, rev_start=0, rev_end="tip"):
    logger.info(f"Getting revs from {rev_start} to {rev_end}...")

    args = hglib.util.cmdbuilder(
        b"log",
        template="{node}\n",
        no_merges=True,
        branch="tip",
        rev=f"{rev_start}:{rev_end}",
    )
    x = hg.rawcommand(args)
    return x.splitlines()


class Experiences:
    def __init__(self, save):
        self.save = save

        try:
            self.db_experiences = shelve.Shelf(
                LMDBDict("data/commit_experiences.lmdb", readonly=not save),
                protocol=pickle.DEFAULT_PROTOCOL,
                writeback=save,
            )
        except lmdb.Error as e:
            if not save and "No such file or directory" in str(e):
                self.db_experiences = {}
            else:
                raise

        if not save:
            self.mem_experiences = {}

    def __contains__(self, key):
        if self.save:
            return key in self.db_experiences
        else:
            return key in self.mem_experiences or key in self.db_experiences

    def __getitem__(self, key):
        if self.save:
            return self.db_experiences[key]
        else:
            return (
                self.mem_experiences[key]
                if key in self.mem_experiences
                else self.db_experiences[key]
            )

    def __setitem__(self, key, value):
        if self.save:
            self.db_experiences[key] = value
        else:
            self.mem_experiences[key] = value


def calculate_experiences(commits, first_pushdate, save=True):
    logger.info(f"Analyzing seniorities from {len(commits)} commits...")

    experiences = Experiences(save)

    for commit in tqdm(commits):
        key = f"first_commit_time${commit.author}"
        if key not in experiences:
            experiences[key] = commit.pushdate
            commit.seniority_author = 0
        else:
            time_lapse = commit.pushdate - experiences[key]
            commit.seniority_author = time_lapse.total_seconds()

    logger.info(f"Analyzing experiences from {len(commits)} commits...")

    # Note: In the case of files, directories, components, we can't just use the sum of previous commits, as we could end
    # up overcounting them. For example, consider a commit A which modifies "dir1" and "dir2", a commit B which modifies
    # "dir1" and a commit C which modifies "dir1" and "dir2". The number of previous commits touching the same directories
    # for C should be 2 (A + B), and not 3 (A twice + B).

    def get_key(exp_type, commit_type, item):
        return f"{exp_type}${commit_type}${item}"

    def get_experience(exp_type, commit_type, item, day, default):
        key = get_key(exp_type, commit_type, item)
        try:
            return experiences[key]
        except KeyError:
            queue = utils.ExpQueue(day, EXPERIENCE_TIMESPAN + 1, default)
            experiences[key] = queue
            return queue

    def update_experiences(experience_type, day, items):
        for commit_type in ("", "backout"):
            exp_queues = tuple(
                get_experience(experience_type, commit_type, item, day, 0)
                for item in items
            )
            total_exps = tuple(exp_queues[i][day] for i in range(len(items)))
            timespan_exps = tuple(
                exp - exp_queues[i][day - EXPERIENCE_TIMESPAN]
                for exp, i in zip(total_exps, range(len(items)))
            )

            total_exps_sum = sum(total_exps)
            timespan_exps_sum = sum(timespan_exps)

            commit.set_experience(
                experience_type,
                commit_type,
                "total",
                total_exps_sum,
                max(total_exps, default=0),
                min(total_exps, default=0),
            )
            commit.set_experience(
                experience_type,
                commit_type,
                EXPERIENCE_TIMESPAN_TEXT,
                timespan_exps_sum,
                max(timespan_exps, default=0),
                min(timespan_exps, default=0),
            )

            # We don't want to consider backed out commits when calculating normal experiences.
            if (
                commit_type == ""
                and not commit.backedoutby
                or commit_type == "backout"
                and commit.backedoutby
            ):
                for i in range(len(items)):
                    exp_queues[i][day] = total_exps[i] + 1

    def update_complex_experiences(experience_type, day, items):
        for commit_type in ("", "backout"):
            exp_queues = tuple(
                get_experience(experience_type, commit_type, item, day, tuple())
                for item in items
            )
            all_commit_lists = tuple(exp_queues[i][day] for i in range(len(items)))
            before_commit_lists = tuple(
                exp_queues[i][day - EXPERIENCE_TIMESPAN] for i in range(len(items))
            )
            timespan_commit_lists = tuple(
                commit_list[len(before_commit_list) :]
                for commit_list, before_commit_list in zip(
                    all_commit_lists, before_commit_lists
                )
            )

            all_commits = set(sum(all_commit_lists, tuple()))
            timespan_commits = set(sum(timespan_commit_lists, tuple()))

            commit.set_experience(
                experience_type,
                commit_type,
                "total",
                len(all_commits),
                max(
                    (len(all_commit_list) for all_commit_list in all_commit_lists),
                    default=0,
                ),
                min(
                    (len(all_commit_list) for all_commit_list in all_commit_lists),
                    default=0,
                ),
            )
            commit.set_experience(
                experience_type,
                commit_type,
                EXPERIENCE_TIMESPAN_TEXT,
                len(timespan_commits),
                max(
                    (
                        len(timespan_commit_list)
                        for timespan_commit_list in timespan_commit_lists
                    ),
                    default=0,
                ),
                min(
                    (
                        len(timespan_commit_list)
                        for timespan_commit_list in timespan_commit_lists
                    ),
                    default=0,
                ),
            )

            # We don't want to consider backed out commits when calculating normal experiences.
            if (
                commit_type == ""
                and not commit.backedoutby
                or commit_type == "backout"
                and commit.backedoutby
            ):
                for i in range(len(items)):
                    exp_queues[i][day] = all_commit_lists[i] + (commit.node,)

    # prev_day = 0
    # prev_commit = None
    for i, commit in enumerate(tqdm(commits)):
        day = (commit.pushdate - first_pushdate).days
        assert day >= 0
        # TODO: Bring back this assertion, but using pushid instead.
        # pushdate is unreliable, e.g. 4d0e3037210dd03bdb21964a6a8c2e201c45794b was pushed after 06b578dfadc9db8b683090e0e110ba75b84fb766, but it has an earlier push date...
        # assert (
        #     day >= prev_day
        # ), f"Commit {commit.node} pushed on {commit.pushdate} should come after {prev_commit.node} pushed on {prev_commit.pushdate}"
        # prev_day = day
        # prev_commit = commit

        # When a file is moved/copied, copy original experience values to the copied path.
        for orig, copied in commit.file_copies.items():
            for commit_type in ("", "backout"):
                orig_key = get_key("file", commit_type, orig)
                if orig_key in experiences:
                    experiences[get_key("file", commit_type, copied)] = copy.deepcopy(
                        experiences[orig_key]
                    )
                else:
                    logger.warning(
                        f"Experience missing for file {orig}, type '{commit_type}', on commit {commit.node}"
                    )

        if (
            not commit.ignored
            and len(commit.backsout) == 0
            and commit.bug_id is not None
        ):
            update_experiences("author", day, (commit.author,))
            update_experiences("reviewer", day, commit.reviewers)

            update_complex_experiences("file", day, commit.files)
            update_complex_experiences("directory", day, commit.directories)
            update_complex_experiences("component", day, commit.components)


def set_commits_to_ignore(hg: hglib.client, repo_dir: str, commits: List[Commit]):
    # Skip commits which are in .hg-annotate-ignore-revs or which have
    # 'ignore-this-changeset' in their description (mostly consisting of very
    # large and not meaningful formatting changes).
    ignore_revs_content = hg.cat(
        [os.path.join(repo_dir, ".hg-annotate-ignore-revs").encode("ascii")], rev=b"-1"
    ).decode("utf-8")
    ignore_revs = set(line[:40] for line in ignore_revs_content.splitlines())

    for commit in commits:
        commit.ignored = (
            commit.node in ignore_revs or "ignore-this-changeset" in commit.desc
        )


def download_component_mapping():
    path_to_component = get_component_mapping(False)

    utils.download_check_etag(
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json",
        "data/component_mapping.json",
    )

    with open("data/component_mapping.json", "r") as f:
        data = json.load(f)

    for path, component in data.items():
        path_to_component[path.encode("utf-8")] = "::".join(component).encode("utf-8")

    close_component_mapping()


def get_component_mapping(readonly=True):
    global path_to_component
    path_to_component = LMDBDict("data/component_mapping.lmdb", readonly=readonly)
    return path_to_component


def close_component_mapping():
    global path_to_component
    path_to_component.close()
    path_to_component = None


def hg_log_multi(repo_dir, revs):
    if len(revs) == 0:
        return []

    threads_num = os.cpu_count() + 1
    REVS_COUNT = len(revs)
    CHUNK_SIZE = int(math.ceil(REVS_COUNT / threads_num))
    revs_groups = [revs[i : i + CHUNK_SIZE] for i in range(0, REVS_COUNT, CHUNK_SIZE)]

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init_thread, initargs=(repo_dir,), max_workers=threads_num
    ) as executor:
        commits = executor.map(_hg_log, revs_groups)
        commits = tqdm(commits, total=len(revs_groups))
        commits = list(itertools.chain.from_iterable(commits))

    while len(hg_servers) > 0:
        hg_server = hg_servers.pop()
        hg_server.close()

    return commits


@lru_cache(maxsize=None)
def get_first_pushdate(repo_dir):
    with hglib.open(repo_dir) as hg:
        return hg_log(hg, [b"0"])[0].pushdate


def download_commits(
    repo_dir: str,
    rev_start: str = None,
    revs: List[bytes] = None,
    save: bool = True,
    use_single_process: bool = False,
    include_no_bug: bool = False,
    include_backouts: bool = False,
    include_ignored: bool = False,
) -> Tuple[CommitDict, ...]:
    assert revs is not None or rev_start is not None

    with hglib.open(repo_dir) as hg:
        if revs is None:
            revs = get_revs(hg, rev_start)
            if len(revs) == 0:
                logger.info("No commits to analyze")
                return tuple()

        first_pushdate = get_first_pushdate(repo_dir)

        logger.info(f"Mining {len(revs)} commits...")

        if not use_single_process:
            logger.info(f"Using {os.cpu_count()} processes...")
            commits = hg_log_multi(repo_dir, revs)
        else:
            commits = hg_log(hg, revs)

        if save or not os.path.exists("data/component_mapping.lmdb"):
            logger.info("Downloading file->component mapping...")
            download_component_mapping()

        set_commits_to_ignore(hg, repo_dir, commits)

        commits_num = len(commits)

        logger.info(f"Mining {commits_num} patches...")

        global code_analysis_server
        code_analysis_server = rust_code_analysis_server.RustCodeAnalysisServer()

        if not use_single_process:
            with concurrent.futures.ProcessPoolExecutor(
                initializer=_init_process, initargs=(repo_dir,)
            ) as executor:
                commits = executor.map(_transform, commits, chunksize=64)
                commits = tqdm(commits, total=commits_num)
                commits = tuple(commits)
        else:
            get_component_mapping()

            commits = tuple(transform(hg, repo_dir, c) for c in commits)

            close_component_mapping()

    code_analysis_server.terminate()

    calculate_experiences(commits, first_pushdate, save)

    logger.info("Applying final commits filtering...")

    commits = tuple(commit.to_dict() for commit in commits)

    if save:
        db.append(COMMITS_DB, commits)

    return tuple(
        filter_commits(
            commits,
            include_no_bug=include_no_bug,
            include_backouts=include_backouts,
            include_ignored=include_ignored,
        )
    )


def clean(hg, repo_dir):
    logger.info("Restoring files to their checkout state...")
    hg.revert(repo_dir.encode("utf-8"), all=True)

    logger.info("Stripping non-public commits...")
    try:
        cmd = hglib.util.cmdbuilder(
            b"strip", rev=b"roots(outgoing())", force=True, backup=False
        )
        hg.rawcommand(cmd)
    except hglib.error.CommandError as e:
        if b"abort: empty revision set" not in e.err:
            raise


def clone(repo_dir, url="https://hg.mozilla.org/mozilla-central", update=False):
    try:
        with hglib.open(repo_dir) as hg:
            clean(hg, repo_dir)

        # Remove pushlog DB to make sure it's regenerated.
        try:
            os.remove(os.path.join(repo_dir, ".hg", "pushlog2.db"))
        except FileNotFoundError:
            logger.info("pushlog database doesn't exist")

        # Pull, to make sure the pushlog is generated.
        with hglib.open(repo_dir) as hg:
            logger.info(f"Pulling {repo_dir}")
            hg.pull(update=update)
            logger.info(f"{repo_dir} pulled")

        return
    except hglib.error.ServerError as e:
        if "abort: repository" not in str(e) and b"not found" not in str(e):
            raise

    cmd = hglib.util.cmdbuilder(
        "robustcheckout",
        url,
        repo_dir,
        purge=True,
        sharebase=repo_dir + "-shared",
        networkattempts=7,
        branch=b"tip",
        noupdate=not update,
    )

    cmd.insert(0, hglib.HGPATH)

    subprocess.run(cmd, check=True)

    logger.info(f"{repo_dir} cloned")


def apply_stack(
    repo_dir: str, stack: List[dict], branch: str, revision: str
) -> List[bytes]:
    """Apply a stack of patches on a repository"""
    assert len(stack) > 0, "Empty stack"

    with hglib.open(repo_dir) as hg:
        hg.pull(
            source=f"https://hg.mozilla.org/{branch}/".encode("ascii"),
            rev=revision.encode("ascii"),
        )
        return [rev["node"].encode("ascii") for rev in stack]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("repository_dir", help="Path to the repository", action="store")
    parser.add_argument(
        "rev_start", help="Which revision to start with", action="store"
    )
    args = parser.parse_args()

    download_commits(args.repository_dir, args.rev_start, save=False)