python source code of main

import json
import logging
import os
import sys
from datetime import datetime, timedelta
from signal import signal, SIGINT, SIGTERM, SIGUSR1
from typing import Optional

from apscheduler.schedulers import SchedulerNotRunningError
from docker.models.containers import Container
from fasteners import InterProcessLock

from deck_chores import __version__, jobs
from deck_chores.config import cfg, generate_config, ConfigurationError
from deck_chores.indexes import (
    container_name,
    lock_service,
    reassign_service_lock,
    unlock_service,
    service_locks_by_service_id,
    service_locks_by_container_id,
)
from deck_chores.parsers import job_config_validator, parse_labels
from deck_chores.utils import DEBUG, log, log_handler


####


# Inter-process file lock. main() tries to acquire it without blocking and
# exits with an error when the acquisition fails.
lock = InterProcessLock('/tmp/deck-chores.lock')


def there_is_another_deck_chores_container() -> bool:
    """Tell whether more than one listed container uses a deck-chores image.

    A container counts as a match when its image carries the label
    ``org.label-schema.name`` with the value ``deck-chores``. Scanning stops
    as soon as a second match is encountered.

    Returns:
        ``True`` if at least two matching containers are listed, else ``False``.
    """
    seen_one = False
    for candidate in cfg.client.containers.list(ignore_removed=True, sparse=True):
        schema_name = candidate.image.labels.get('org.label-schema.name', '')
        if schema_name != 'deck-chores':
            continue
        if seen_one:
            # This is the second match; no need to look any further.
            return True
        seen_one = True

    return False


####


def sigint_handler(signum, frame):  # pragma: nocover
    """Signal handler for SIGINT: log the interrupt and exit with status 0."""
    log.info("Keyboard interrupt.")
    sys.exit(0)


def sigterm_handler(signum, frame):  # pragma: nocover
    """Signal handler for SIGTERM: log the request and exit with status 0."""
    log.info("Received SIGTERM.")
    sys.exit(0)


def sigusr1_handler(signum, frame):
    """Signal handler for SIGUSR1: write all scheduled jobs to the log.

    For every job returned by the scheduler, one line with its id and next
    run time is logged, followed by a line with the job's keyword arguments.
    """
    log.info("SIGUSR1 received, echoing all jobs.")
    for scheduled in jobs.scheduler.get_jobs():
        headline = (
            f"ID: {scheduled.id}   Next execution: {scheduled.next_run_time}   "
            "Configuration:"
        )
        log.info(headline)
        log.info(scheduled.kwargs)


# Install the handlers at import time: SIGINT and SIGTERM end the process via
# SystemExit(0), SIGUSR1 dumps the currently scheduled jobs to the log.
signal(SIGINT, sigint_handler)
signal(SIGTERM, sigterm_handler)
signal(SIGUSR1, sigusr1_handler)


####


def process_started_container_labels(container_id: str, paused: bool = False) -> None:
    """Register the jobs that a container defines through its labels.

    The labels are parsed into a service id, flags and job definitions.
    Nothing happens when there are no definitions. When the container belongs
    to a service (a service id is present and the ``service`` flag is set) and
    that service is already locked by another container, no jobs are added
    for this container; if the locking container is paused, its jobs are
    reassigned to a running or restarting container of the service first.
    Otherwise the service is locked for this container and the jobs are added.

    Args:
        container_id: Id of the container whose labels are evaluated.
        paused: Forwarded to ``jobs.add``; marks the container as paused.

    Raises:
        AssertionError: If assertions are enabled and the reassignment from a
            paused lock holder did not yield a new container id.
    """
    service_id, flags, definitions = parse_labels(container_id)

    if not definitions:
        return

    if service_id and 'service' in flags:
        other_container_id = service_locks_by_service_id.get(service_id)
        if other_container_id:
            log.debug(
                f'Service id {service_id} is locked by container {other_container_id}.'
            )
            if cfg.client.containers.get(other_container_id).status == "paused":
                # The call must live outside the assert statement: asserts are
                # removed when Python runs with -O / PYTHONOPTIMIZE, which
                # would silently skip the reassignment altogether.
                new_owner_id = reassign_jobs(other_container_id, consider_paused=False)
                assert new_owner_id, (
                    f'No container found to take over jobs of {other_container_id}.'
                )
            return

        lock_service(service_id, container_id)

    jobs.add(container_id, definitions, paused=paused)


def _parse_started_at(timestamp: str) -> datetime:
    """Convert a container's ``StartedAt`` value into a naive datetime.

    The fractional seconds are truncated or zero-padded to exactly six digits
    and anything after them (such as a trailing ``Z``) is dropped, so values
    with fewer than six fractional digits, or none at all, parse as well.
    Like plain slicing of the first 26 characters, this ignores any UTC
    designator or offset.
    # NOTE(review): assumes the daemon reports these timestamps in UTC - confirm.
    """
    seconds, _, fraction = timestamp.partition('.')
    digits = ''
    for char in fraction[:6]:
        if char not in '0123456789':
            break
        digits += char
    # not sure why mypy doesn't know about this method:
    return datetime.fromisoformat(  # type: ignore
        f"{seconds[:19]}.{digits.ljust(6, '0')}"
    )


def inspect_running_containers() -> datetime:
    """Process the labels of all listed containers and return a point in time.

    Each listed container is inspected and its labels are handed to
    ``process_started_container_labels``; containers whose status is
    ``paused`` are passed on as paused.

    Returns:
        The latest of the time this function started (naive UTC) and the
        ``StartedAt`` timestamps of the inspected containers.
    """
    log.info("Inspecting running containers.")
    # Kept naive so that it stays comparable with the parsed start times.
    last_event_time = datetime.utcnow()
    containers = cfg.client.containers.list(ignore_removed=True, sparse=True)

    for container in containers:
        container_id = container.id
        data = cfg.client.api.inspect_container(container_id)
        last_event_time = max(
            last_event_time, _parse_started_at(data['State']['StartedAt'])
        )
        process_started_container_labels(
            container_id, paused=container.status == 'paused'
        )

    log.debug('Finished inspection of running containers.')
    return last_event_time


def reassign_jobs(container_id: str, consider_paused: bool) -> Optional[str]:
    """Hand a container's jobs over to another container of the same service.

    Every job of ``container_id`` is paused or resumed so that it matches the
    paused state of the new container, then its ``container_id`` keyword
    argument is pointed at the new container. Finally the service lock is
    moved to the new container.

    Args:
        container_id: Id of the container that currently owns the jobs.
        consider_paused: Forwarded to ``find_other_container_for_service``.

    Returns:
        The id of the container that took over, or ``None`` when no other
        container was found (in which case nothing is changed).
    """
    successor = find_other_container_for_service(container_id, consider_paused)
    if successor is None:
        return None

    new_id = successor.id
    successor_is_paused = successor.status == "paused"
    log.info(f"{container_name(container_id)}: Reassigning jobs to {new_id}.")

    for job in jobs.get_jobs_for_container(container_id):
        log.debug(f"Handling job: {job.kwargs}")
        # A job without a next run time is treated as paused.
        job_is_active = bool(job.next_run_time)

        if successor_is_paused and job_is_active:
            job.pause()
            log.debug("Paused job.")
        elif not (successor_is_paused or job_is_active):
            job.resume()
            log.debug("Resumed job.")

        updated_kwargs = dict(job.kwargs, container_id=new_id)
        job.modify(kwargs=updated_kwargs)

    reassign_service_lock(container_id, new_id)
    return new_id


def find_other_container_for_service(
    container_id: str, consider_paused: bool
) -> Optional[Container]:
    """Look up a different container that belongs to the same service.

    The service is the one locked by ``container_id``. Container statuses are
    tried one after another, in the order ``running``, ``restarting`` and,
    only if ``consider_paused`` is true, ``paused`` and ``created``. The first
    listed container with a different id wins.

    Returns:
        The matching container, or ``None`` if ``container_id`` holds no
        service lock or no other container is found.
    """
    service_id = service_locks_by_container_id.get(container_id)
    if service_id is None:
        return None

    statuses = ["running", "restarting"]
    if consider_paused:
        statuses.extend(("paused", "created"))

    for status in statuses:
        listed = cfg.client.containers.list(
            all=True,
            ignore_removed=True,
            # TODO don't cast service_id to list when this patch is incorporated:
            #      https://github.com/docker/docker-py/pull/2445
            filters={"status": status, "label": list(service_id)},
        )
        for candidate in listed:
            if candidate.id != container_id:
                return candidate

    return None


####


def listen(since: datetime) -> None:
    """Consume daemon events and dispatch container events to their handlers.

    Raw events are cheaply pre-filtered on their bytes before being decoded
    as JSON. Only events of type ``container`` with the action ``start``,
    ``die``, ``pause`` or ``unpause`` are passed to the matching ``handle_*``
    function; everything else is ignored.

    Args:
        since: Passed to the client's ``events`` call as ``since``.
    """
    log.info("Listening to events.")

    keywords = (b'start', b'die', b'pause', b'unpause')
    handlers = {
        'start': handle_start,
        'die': handle_die,
        'pause': handle_pause,
        'unpause': handle_unpause,
    }

    for raw_event in cfg.client.events(since=since):
        # Skip decoding for payloads that cannot be of interest.
        if b'container' not in raw_event or not any(
            keyword in raw_event for keyword in keywords
        ):
            continue

        event = json.loads(raw_event)
        log.debug(f'Daemon event: {event}')
        if event['Type'] != 'container':
            continue

        handler = handlers.get(event['Action'])
        if handler is not None:
            handler(event)


def handle_start(event: dict):
    """React to a ``start`` event by processing the container's labels.

    The container id is read from ``event['Actor']['ID']`` and the container
    is processed as not paused.
    """
    started_id = event['Actor']['ID']
    log.debug(f'Handling start of {started_id}.')
    process_started_container_labels(started_id, paused=False)


def handle_die(event: dict):
    """React to a ``die`` event of the container in ``event['Actor']['ID']``.

    The container's jobs are reassigned to another container of its service
    if one is found (paused and created ones included). Otherwise all of its
    jobs are removed and ``unlock_service`` is called for the container.
    """
    container_id = event['Actor']['ID']
    log.debug(f'Handling die of {container_id}.')

    if reassign_jobs(container_id, consider_paused=True) is not None:
        # Another container took over; the jobs stay scheduled.
        return

    for job in jobs.get_jobs_for_container(container_id):
        definition = job.kwargs
        log.debug(f"Removing job: {definition}")
        job.remove()
        prefix = f"{container_name(container_id)}: Removed '"
        log.info(prefix + definition["job_name"] + "'.")

    unlock_service(container_id)


def handle_pause(event: dict):
    """React to a ``pause`` event of the container in ``event['Actor']['ID']``.

    The container's jobs are reassigned to a running or restarting container
    of its service if one is found. Otherwise each of its jobs is paused and,
    if there was at least one, the number of paused jobs is logged.
    """
    container_id = event['Actor']['ID']
    log.debug(f'Handling pause of {container_id}.')

    if reassign_jobs(container_id, consider_paused=False) is not None:
        return

    paused_count = 0
    for job in jobs.get_jobs_for_container(container_id):
        job.pause()
        log.debug(f"Paused job: {job.kwargs}")
        paused_count += 1

    if paused_count:
        log.info(f"{container_name(container_id)}: Paused {paused_count} jobs.")


def handle_unpause(event: dict):
    """React to an ``unpause`` event of the container in ``event['Actor']['ID']``.

    When the container holds no service lock but its labels carry a service
    id whose lock is held by a paused container, the jobs of that paused
    container are reassigned to a running or restarting container of the
    service, and the jobs of the container that took over are resumed.
    In every other case the jobs of the unpaused container itself are
    resumed. The number of resumed jobs is logged if there was at least one.
    """
    container_id = event['Actor']['ID']
    log.debug(f'Handling unpause of {container_id}.')

    if container_id not in service_locks_by_container_id:
        service_id, _, _ = parse_labels(container_id)
        if service_id:
            other_container_id = service_locks_by_service_id.get(service_id)
            if (
                other_container_id is not None
                and cfg.client.containers.get(other_container_id).status == "paused"
            ):
                new_owner_id = reassign_jobs(other_container_id, consider_paused=False)
                # reassign_jobs returns None when no container could take
                # over; keep the unpaused container's id in that case instead
                # of carrying None into the lookups below.
                if new_owner_id is not None:
                    container_id = new_owner_id

    counter = 0
    for counter, job in enumerate(jobs.get_jobs_for_container(container_id), start=1):
        job.resume()
        log.debug(f"Resumed job: {job.kwargs}")
    if counter:
        log.info(f"{container_name(container_id)}: Resumed {counter} jobs.")


def shutdown() -> None:  # pragma: nocover
    """Stop the scheduler and close the client, if one was configured.

    A scheduler that is not running is not treated as an error.
    """
    try:
        jobs.scheduler.shutdown()
    except SchedulerNotRunningError:
        # The scheduler was never started or has been stopped already.
        pass

    if not hasattr(cfg, "client"):
        return
    cfg.client.close()


####


def main() -> None:  # pragma: nocover
    """Run deck-chores until it is told to stop, then exit the process.

    Steps: re-execute without optimizations if debugging is requested while
    assertions are disabled, acquire the process lock, generate the
    configuration, refuse to run alongside another deck-chores container,
    inspect the present containers, start the scheduler and listen to events.

    Exit codes: the code of a raised ``SystemExit``; ``1`` if the lock cannot
    be acquired or the configuration is invalid; ``3`` on any other
    exception; ``0`` if listening ends without an exception.
    """
    if DEBUG and not __debug__:
        log.debug("Replacing process with Python's optimizations off.")
        sys.stdout.flush()
        unoptimized_env = {**os.environ, "PYTHONOPTIMIZE": ""}
        os.execlpe("deck-chores", "deck-chores", unoptimized_env)

    lock_acquired = lock.acquire(blocking=False)
    if not lock_acquired:
        log.error(f"Couldn't acquire lock file at {lock.path}, exiting.")
        raise SystemExit(1)

    log.info(f'Deck Chores {__version__} started.')

    try:
        generate_config()
        formatter = logging.Formatter(cfg.logformat, style='{')
        log_handler.setFormatter(formatter)
        log.debug(f'Config: {cfg.__dict__}')

        if there_is_another_deck_chores_container():
            log.error(
                "There's another container running deck-chores, "
                "maybe paused or restarting."
            )
            sys.exit(1)

        job_config_validator.set_defaults(cfg)

        inspected_until = inspect_running_containers()
        jobs.start_scheduler()
        # Start just after the inspection's reference time.
        listen(since=inspected_until + timedelta(microseconds=1))

    except SystemExit as exit_request:
        exit_code = exit_request.code
    except ConfigurationError as config_error:
        log.error(config_error)
        exit_code = 1
    except Exception:
        log.exception('Caught unhandled exception:')
        exit_code = 3
    else:
        exit_code = 0
    finally:
        shutdown()
        lock.release()
        sys.exit(exit_code)


# Run the daemon when this module is executed as a script.
if __name__ == '__main__':
    main()