# BSD-3-Clause License # # Copyright 2017 Orange # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # 3. Neither the name of the copyright holder nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. """ Base 'Agent' classes. An Agent instance is a stand-alone autonomous object. It hosts computations, which send messages to each other. Each agent has its own thread, which is used to handle messages as they are dispatched to computations hosted on this agent. """ import logging import sys import threading import traceback import random from functools import partial from importlib import import_module from threading import Thread from time import perf_counter, sleep from typing import Dict, List, Optional, Union, Callable, Tuple from collections import defaultdict from pydcop.algorithms import AlgorithmDef, ComputationDef, load_algorithm_module from pydcop.dcop.objects import AgentDef, create_binary_variables from pydcop.dcop.objects import BinaryVariable from pydcop.dcop.relations import Constraint from pydcop.infrastructure.Events import event_bus from pydcop.infrastructure.communication import Messaging, \ CommunicationLayer, UnreachableAgent from pydcop.infrastructure.computations import MessagePassingComputation, \ build_computation from pydcop.infrastructure.discovery import Discovery, UnknownComputation, \ UnknownAgent, _is_technical from pydcop.infrastructure.ui import UiServer from pydcop.reparation import create_computation_hosted_constraint, \ create_agent_capacity_constraint, create_agent_hosting_constraint, \ create_agent_comp_comm_constraint class AgentException(Exception): pass class Agent(object): """ Object representing an agent. An agent communicates with other agents though messages, using a `CommunicationLayer` An agent hosts message passing computations and run these computations on its own thread. Notes ----- An agent does not necessarily need to known it's own definition (see agent_def argument) but is needs it for some use like replication in resilient DCOP. Parameters ---------- name: str name of the agent comm: CommunicationLayer object used to send and receive messages agent_def: AgentDef definition of this agent, optional ui_port: int the port on which to run the ui-server. If not given, no ui-server is started. delay: int An optional delay between message delivery, in second. This delay only applies to algorithm's messages and is useful when you want to observe (for example with the GUI) the behavior of the algorithm at runtime. daemon: boolean indicates if the agent should use a daemon thread (defaults to False) See Also -------- MessagePassingComputation, CommunicationLayer """ def __init__(self, name, comm: CommunicationLayer, agent_def: AgentDef=None, ui_port: int=None, delay: float=None, daemon: bool=False): self._name = name self.agent_def = agent_def self.logger = logging.getLogger('pydcop.agent.' + name) self.agt_metrics = AgentMetrics() # Setup communication and discovery self._comm = comm self.discovery = Discovery(self._name, self.address) self._comm.discovery = self.discovery self._messaging = Messaging(name, comm, delay=delay) self.discovery.discovery_computation.message_sender = \ self._messaging.post_msg # Ui server self._ui_port = ui_port self._ui_server = None self.t = Thread(target=self._run, name='thread_'+name) self.t.daemon = daemon self._stopping = threading.Event() self._shutdown = threading.Event() self._running = False # _idle means that we have finished to handle all incoming messages self._idle = False self._computations = {} # type: Dict[str, MessagePassingComputation] self.t_active = 0 # time when run the first non-technical computation is run self._run_t = None # time when starting the agent self._start_t = None # Tasks that must run periodically as {callable: (period, last_run)} self._periodic_cb = {} # type: Dict[Callable, Tuple[float, float]] # List of paused computations, any computation whose name is in this # list will not receive any message. self.paused_computations = [] @property def communication(self)-> CommunicationLayer: """ The communication used by this agent. Returns ------- CommunicationLayer The communication used by this agent. """ return self._comm def add_computation(self, computation: MessagePassingComputation, comp_name=None, publish=True): """ Add a computation to the agent. The computation will run on this agent thread and receives messages through his Messaging and CommunicationLayer. Parameters ---------- computation: a MessagePassingComputation the computation to be added comp_name: str an optional name for the computation, if not given computation.name will be used. publish: bool True (default) is the computation must be published on the discovery service. """ comp_name = computation.name if comp_name is None else comp_name self.logger.debug('Add computation %s - %s ', comp_name, self._messaging) computation.message_sender = self._messaging.post_msg computation.periodic_action_handler = self self._computations[comp_name] = computation self.discovery.register_computation(comp_name, self.name,self.address, publish=publish) # start lookup for agent hosting a neighbor computation if hasattr(computation, 'computation_def') and \ computation.computation_def is not None: for n in computation.computation_def.node.neighbors: self.discovery.subscribe_computation(n) if hasattr(computation, '_on_value_selection'): computation._on_value_selection = notify_wrap( computation._on_value_selection, partial(self._on_computation_value_changed, computation.name)) if hasattr(computation, '_on_new_cycle'): computation._on_new_cycle = notify_wrap( computation._on_new_cycle, partial(self._on_computation_new_cycle, computation.name)) computation.finished = notify_wrap( computation.finished, partial(self._on_computation_finished, computation.name)) event_bus.send("agents.add_computation."+self.name, (self.name, computation)) def remove_computation(self, computation: str) -> None: """ Removes a computation from the agent. Parameters ---------- computation: str the name of the computation Raises ------ UnknownComputation If there is no computation with this name on this agent """ try: comp = self._computations.pop(computation) except KeyError: self.logger.error( 'Removing unknown computation %s - current commutations : %s', computation, self._computations) raise UnknownComputation(computation) if comp.is_running: comp.stop() self.logger.debug('Removing computation %s', comp) self.discovery.unregister_computation(computation, self.name) event_bus.send("agents.rem_computation."+self.name, (self.name, computation)) def computations(self, include_technical=False)-> \ List[MessagePassingComputation]: """ Computations hosted on this agent. Parameters ---------- include_technical: bool If True, technical computations (like discovery, etc.) are included in the list. Returns ------- List[MessagePassingComputation] A list of computations hosted on this agents. This list is a copy and can be safely modified. """ if include_technical: return list(self._computations.values()) else: return [c for c in self._computations.values() if not c.name.startswith('_')] def computation(self, name: str) -> MessagePassingComputation: """ Get a computation hosted by this agent. Parameters ---------- name: str The name of the computation. Returns ------- The Messaging passing corresponding to the given name. Raises ------ UnknownComputation if the agent has no computation with this name. See Also -------- add_computation """ try: return self._computations[name] except KeyError: self.logger.error('unknown computation %s', name) raise UnknownComputation('unknown computation ' + name) @property def address(self): """ The address this agent can be reached at. The type of the address depends on the instance and type of the CommunicationLayer used by this agent. Returns ------- The address this agent can be reached at. """ return self._comm.address def start(self, run_computations = False): """ Starts the agent. One started, an agent will dispatch any received message to the corresponding target computation. Notes ----- Each agent has it's own thread, this will start the agent's thread, run the _on_start callback and waits for message. Incoming message are added to a queue and handled by calling the _handle_message callback. The agent (and its thread) will stop once stop() has been called and he has finished handling the current message, if any. See Also -------- _on_start(), stop() """ if self.is_running: raise AgentException('Cannot start agent {}, already running ' .format(self.name)) self.logger.info('Starting agent %s ', self.name) self._running = True self.run_computations = run_computations self._start_t = perf_counter() self.t.start() def run(self, computations: Optional[Union[str, List[str]]]=None): """ Run computations hosted on this agent. Notes ----- Attempting to start an already running computation is harmless : it will be logged but will not raise an exception. The first time this method is called, timestamp is stored, which is used as a reference when computing metrics. Parameters ---------- computations: Optional[Union[str, List[str]]] An optional computation name or list of computation names. If None, all computations hosted on this agent are started. Raises ------ AgentException If the agent was not started (using agt.start()) before calling run(). UnknownComputation If some of the computations are not hosted on this agent. All computations really hosted on the agent are started before raising this Exception. """ if not self.is_running: raise AgentException('Cannot start computation on agent %s which ' 'is not started', self.name) if computations is None: self.logger.info('Starting all computations') else: if isinstance(computations, str): computations = [computations] else: # avoid modifying caller's variable computations = computations[:] self.logger.info('Starting computations %s', computations) if self._run_t is None: # We start counter time only when the first computation is run, # to avoid counting idle time when we wait for orders. self._run_t = perf_counter() on_start_t = perf_counter() for c in list(self._computations.values()): if computations is None: if c.is_running: self.logger.debug(f'Do not start computation {c.name}, already ' 'running') else: c.start() elif c.name in computations: if c.is_running: self.logger.debug(f'Do not start computation {c.name}, already ' 'running') else: c.start() computations.remove(c.name) # add the time spent in on_start to the active time of the agent. self.t_active += perf_counter() - on_start_t if computations: raise UnknownComputation('Could not start unknown computation %s', computations) @property def start_time(self)-> float: """ float: timestamp for the first run computation call. This timestamp is used as a reference when computing various time-related metrics. """ return self._run_t def clean_shutdown(self): """ Perform a clean shutdown of the agent. All pending messages are handled before stopping the agent thread. This method returns immediately, use `join` to wait until the agent's thread has stopped. """ self.logger.debug('Clean shutdown requested') self._shutdown.set() self._messaging.shutdown() def stop(self): """ Stops the agent A computation cannot be interrupted while it handle a message, as a consequence the agent (and its thread) will stop once it he has finished handling the current message, if any. """ self.logger.debug('Stop requested on %s', self.name) self._stopping.set() def pause_computations(self, computations: Union[str, Optional[List[str]]]): """ Pauses computations. Parameters ---------- computations: Union[str, Optional[List[str]]] The name of the computation to pause, or a list of computations names. If None, all hosted computation will be paused. Raises ------ AgentException If the agent was not started (using agt.start()) before calling pause_computations(). UnknownComputation If some of the computations are not hosted on this agent. All computations really hosted on the agent are paused before raising this exception. """ if not self.is_running: raise AgentException('Cannot pause computations on agent %s which ' 'is not started') if computations is None: self.logger.info('Pausing all computations') else: if isinstance(computations, str): computations = [computations] else: computations = computations[:] self.logger.info('Pausing computations %s', computations) for c in self._computations.values(): if computations is None: if c.is_paused: self.logger.warning('Cannot pause computation %s, already ' 'paused', c.name) else: c.pause(True) elif c.name in computations: if c.is_paused: self.logger.warning('Cannot pause computation %s, already ' 'paused', c.name) else: c.pause(True) computations.remove(c.name) if computations: raise UnknownComputation('Could not pause unknown computation %s', computations) def unpause_computations(self, computations: Union[str, Optional[List[str]]]): """ Un-pause (i.e. resume) computations Parameters ---------- computations: Optional[List[str]] TThe name of the computation to resume, or a list of computations names. If None, all hosted computations will be resumed. Raises ------ AgentException If the agent was not started (using agt.start()) before calling unpause_computations(). UnknownComputation If some of the computations are not hosted on this agent. All computations really hosted on the agent are resumed before raising this exception. """ if not self.is_running: raise AgentException('Cannot resume computations on agent %s which ' 'is not started') if computations is None: self.logger.info('Resuming all computations') else: if isinstance(computations, str): computations = [computations] else: computations = computations[:] self.logger.info('Resuming computations %s', computations) for c in self._computations.values(): if computations is None: if not c.is_paused: self.logger.warning('Do not resume computation %s, not ' 'paused', c.name) else: c.pause(False) elif c.name in computations: if not c.is_paused: self.logger.warning('Do not resume computation %s, not ' 'paused', c.name) else: c.pause(False) computations.remove(c.name) if computations: raise UnknownComputation('Could not resume unknown computation %s', computations) @property def name(self): """ str: The name of the agent. """ return self._name @property def is_stopping(self)-> bool: """ bool: True if the agent is currently stopping (i.e. handling its last message). """ return self._stopping.is_set() @property def is_running(self): """ bool: True if the agent is currently running. """ return self._running def join(self): self.t.join() def _on_start(self): """ This method is called when the agent starts. Notes ----- This method is meant to be overwritten in subclasses that might need to perform some operations on startup. Do NOT forget to call `super()._on_start()` ! When `super()._on_start()` return `False`, you must also return `False` ! This method is always run in the agent's thread, even though the `start()` method is called from an other thread. Returns ------- status: boolean True if all went well, False otherwise """ self.logger.debug('on_start for {}'.format(self.name)) if self._ui_port: event_bus.enabled = True self._ui_server = UiServer(self, self._ui_port) self.add_computation(self._ui_server, publish=False) self._ui_server.start() else: self.logger.debug('No ui server for %s', self.name) self._computations[self.discovery.discovery_computation.name] = \ self.discovery.discovery_computation while True: # Check _stopping: do not prevent agent form stopping ! if self._stopping.is_set(): return False try: self.discovery.register_computation( self.discovery.discovery_computation.name, self.name, self.address) except UnreachableAgent: self.logger.warning("Could not reach directory, will retry " "later") sleep(1) else: break self.discovery.register_agent(self.name, self.address) self.discovery.discovery_computation.start() return True def _on_stop(self): """ This method is called when the agent has stopped. It is meant to be overwritten in subclasses that might need to perform some operations on stop, however, when overwriting it, you MUST call `super()._on_stop()`. Notes ----- This method always run in the agent's thread. Messages can still be sent in this message, but no new message will be received (as the agent's thread has stopped) """ self.logger.debug('on_stop for %s with computations %s ', self.name, self.computations()) # Unregister computations and agent from discovery. # This will also unregister any discovery callbacks this agent may still # have. for comp in self.computations(): comp.stop() if not _is_technical(comp.name): try: self.discovery.unregister_computation(comp.name) except UnreachableAgent: # when stopping the agent, the orchestrator / directory might have # already left. pass if self._ui_server: self._ui_server.stop() try: # Wait a bit to make sure that the stopped message can reach the # orchestrator before unregistration. sleep(0.5) self.discovery.unregister_agent(self.name) except UnreachableAgent: # when stopping the agent, the orchestrator / directory might have # already left. pass def _on_computation_value_changed(self, computation: str, value, cost, cycle): """Called when a computation selects a new value """ pass def _on_computation_new_cycle(self, computation, *args, **kwargs): """Called when a computation starts a new cycle""" pass def _on_computation_finished(self, computation: str, *args, **kwargs): """ Called when a computation finishes. This method is meant to be overwritten in sub-classes. Parameters ---------- computation: str name of the computation that just ended. """ pass def _handle_message(self, sender_name: str, dest_name: str, msg, t): # messages are delivered even to computations which have reached their # stop condition. It's up the the algorithm to decide if it wants to # handle the message. dest = self.computation(dest_name) dest.on_message(sender_name, msg, t) def metrics(self): if self._run_t is None: activity_ratio = 0 else: total_t = perf_counter() - self._run_t activity_ratio = self.t_active / (total_t) own_computations = { c.name for c in self.computations(include_technical=True)} m = { 'count_ext_msg': {k: v for k, v in self._messaging.count_ext_msg.items() if k in own_computations}, 'size_ext_msg': {k: v for k, v in self._messaging.size_ext_msg.items() if k in own_computations}, # 'last_msg_time': self._messaging.last_msg_time, 'activity_ratio': activity_ratio, 'cycles': {c.name: c.cycle_count for c in self.computations()} } return m def messages_count(self, computation: str): return self._messaging.count_ext_msg[computation] def messages_size(self, computation: str): return self._messaging.size_ext_msg[computation] def set_periodic_action(self, period: float, cb: Callable): """ Set a periodic action. The callback `cb` will be called every `period` seconds. The delay is not strict. The handling of a message is never interrupted, if it takes longer than `period`, the callback will be delayed and will only be called once the task has finished. Parameters ---------- period: float a period in second cb: Callable a callback with no argument Returns ------- handle: An handle that can be used to remove the periodic action. This handle is actually the callback object itself. """ assert period != None assert cb != None self.logger.debug("Add periodic action %s - %s ", period, cb) self._periodic_cb[cb] = (period, perf_counter()) return cb def remove_periodic_action(self, handle): """ Remove a periodic action Parameters ---------- handle: the handle returned by set_periodic_action """ self.logger.debug("Remove action %s ", handle) self._periodic_cb.pop(handle) def _run(self): self.logger.debug('Running agent ' + self._name) full_msg = None try: self._running = True self._on_start() if self.run_computations: self.run() while not self._stopping.is_set(): # Process messages, if any full_msg, t = self._messaging.next_msg(0.05) if full_msg is None: self._idle = True if self._shutdown.is_set(): self.logger.info("No message during shutdown, " "stopping agent thread") break else: current_t = perf_counter() try: sender, dest, msg, _ = full_msg self._idle = False if not self._stopping.is_set(): self._handle_message(sender, dest, msg, t) finally: if self._run_t is not None: e = perf_counter() msg_duration = e - current_t self.t_active += msg_duration if msg_duration > 1: self.logger.warning( 'Long message handling (%s) : %s', msg_duration, msg) self._process_periodic_action() except Exception as e: self.logger.error('Thread %s exits With error : %s \n ' 'Was handling message %s ', self.name, e, full_msg) self.logger.error(traceback.format_exc()) if hasattr(self, 'on_fatal_error'): self.on_fatal_error(e) except: # catch *all* exceptions e = sys.exc_info()[0] self.logger.error('Thread exits With un-managed error : %s', e) self.logger.error(e) finally: self._running = False self._comm.shutdown() self._on_stop() self.logger.info('Thread of agent %s stopped', self._name) def _process_periodic_action(self): # Process periodic action. Only once the agents runs the # computations (i.e. self._run_t is not None) ct = perf_counter() if self._start_t is not None : for cb, (p, last_t) in list(self._periodic_cb.items()): if ct - last_t >= p: # self.logger.debug('periodic cb %s, %s %s ', cb, ct, last_t) # Must update the cb entry BEFORE calling the cb, in case # the cb attemps to modify (e.g. remove) it's own entry by # calling remove_periodic_action self._periodic_cb[cb] = (p, ct) cb() def is_idle(self): """ Indicate if the agent is idle. An idle agent is an agent which has no pending messages to handle. :return: True if the agent is idle, False otherwise """ return self._idle def __str__(self): return 'Agent: '+self._name def __repr__(self): return 'Agent: ' + self._name def notify_wrap(f, cb): def wrapped(*args, **kwargs): f(*args, **kwargs) cb(*args, **kwargs) return wrapped class AgentMetrics(object): """ AgentMetrics listen to events from the event_bus to consolidate metrics. """ def __init__(self): self._computation_msg_rcv = defaultdict(lambda : (0,0)) self._computation_msg_snd = defaultdict(lambda : (0,0)) event_bus.subscribe('computations.message_rcv.*', self._on_computation_msg_rcv) event_bus.subscribe('computations.message_snd.*', self._on_computation_msg_snd) def computation_msg_rcv(self, computation: str): return self._computation_msg_rcv[computation] def computation_msg_snd(self, computation: str): return self._computation_msg_snd[computation] def _on_computation_msg_rcv(self, topic: str, msg_event): computation, msg_size = msg_event prev_count , prev_size = self._computation_msg_rcv[computation] self._computation_msg_rcv[computation] = \ prev_count+1, prev_size+ msg_size def _on_computation_msg_snd(self, topic: str, msg_event): computation, msg_size = msg_event prev_count , prev_size = self._computation_msg_snd[computation] self._computation_msg_snd[computation] = \ prev_count+1, prev_size+ msg_size from pydcop.computations_graph import constraints_hypergraph as chg repair_algo = load_algorithm_module('mgm2') class RepairComputationRegistration(object): def __init__(self, computation: MessagePassingComputation, status: str, candidate: str): self.computation = computation self.status = status self.candidate = candidate class ResilientAgent(Agent): """ An agent that supports resiliency by replicating it's computations. Parameters ---------- name: str name of the agent comm: CommunicationLayer object used to send and receive messages agent_def: AgentDef definition of this agent, optional ui_port: int the port on which to run the ui-server. If not given, no ui-server is started. replication: str name of the replication algorithm delay: int An optional delay between message delivery, in second. This delay only applies to algorithm's messages and is useful when you want to observe (for example with the GUI) the behavior of the algorithm at runtime. """ def __init__(self, name: str, comm: CommunicationLayer, agent_def: AgentDef, replication: str, ui_port=None, delay: float=None): super().__init__(name, comm, agent_def, ui_port=ui_port, delay=delay) self.replication_comp = None if replication is not None: self.logger.debug('deploying replication computation %s', replication) # DCOP computations will be added to the replication computation # as they are deployed. algo_module = import_module('pydcop.replication.{}' .format(replication)) self.replication_comp = algo_module.build_replication_computation( self, self.discovery) # self.add_computation(self.replication_comp) # Do not start the computation yet, the agent is not event started self._repair_computations =\ {} # type: Dict[str, RepairComputationRegistration] # the replication level will be set by the when requested to # replicate, by the ReplicateComputationsMessage self._replication_level = None # Register notification for when all computations have been # replicated. self.replication_comp.replication_done = notify_wrap( self.replication_comp.replication_done, self._on_replication_done) def _on_start(self): """ See Also -------- Agent._on_start Returns ------- status """ self.logger.debug('Resilient agent on_start') if not super()._on_start(): return False if self.replication_comp is not None: self.add_computation(self.replication_comp) self.replication_comp.start() return True def _on_stop(self): if self.replication_comp is not None: self.replication_comp.stop() self.discovery.unregister_computation(self.replication_comp.name) super()._on_stop() def add_computation(self, computation: MessagePassingComputation, comp_name=None, publish=True): """ Add a computation to the agent. See Also -------- Agent.add_computation Parameters ---------- computation comp_name publish Returns ------- """ super().add_computation(computation, comp_name, publish) if self.replication_comp is not None \ and not computation.name.startswith('_')\ and not computation.name.startswith('B'): # FIXME : find a better way to filter out repair computation than # looking at the first character (B). self.replication_comp.add_computation(computation.computation_def, computation.footprint()) def remove_computation(self, computation: str): if self.replication_comp is not None \ and not computation.startswith('_'): self.replication_comp.remove_computation(computation) super().remove_computation(computation) def replicate(self, k: int): if self.replication_comp is not None: self._replication_level = k self.replication_comp.replicate(k) def setup_repair(self, repair_info): self.logger.info('Setup repair %s', repair_info) # create computation for the reparation dcop # The reparation dcop uses a dcop algorithm where computations maps to # variable (in order to have another dcop distribution problem) and use # binary variable for each candidate computation. # This agent will host one variable-computation for each # binary variable x_i^m indicating if the candidate computation x_i # is hosted on this agent a_m. Notice that by construction, # the agent already have a replica for all the candidates x_i. # The reparation dcop includes several constraints and variables: # Variables # * one binary variable for each orphaned computation # Constraints # * hosted constraints : one for each candidate computation # * capacity constraint : one for this agent # * hosting costs constraint : one for this agent # * communication constraint # # For reparation, we use a dcop algorithm where computations maps to # variables of the dcop. On this agent, we host the computations # corresponding to the variables representing the orphaned computation # that could be hosted on this agent (aka candidate computation). # Here, we use MGM own_name = self.name # `orphaned_binvars` is a map that contains binary variables for # orphaned computations. # Notice that it only contains variables for computations # that this agents knows of, i.e. computations that could be hosted # here (aka candidate computations) or that depends on computations # that could be hosted here. # There is one binary variable x_i^m for each pair (x_i, a_m), # where x_i is an orphaned computation and a_m is an agent that could # host x_i (i.e. has a replica of x_i). orphaned_binvars = {} # type: Dict[Tuple, BinaryVariable] # One binary variable x_i^m for each candidate computation x_i that # could be hosted on this agent a_m. Computation for these variables # will be hosted in this agent. This is a subset of orphaned_binvars. candidate_binvars = {} # type: Dict[Tuple, BinaryVariable] # Agent that will host the computation for each binary var. # it is a dict { bin var name : agent_name } # agt_hosting_binvar = {} # type: Dict[str, str] # `hosted_cs` contains hard constraints ensuring that all candidate # computations are hosted: hosted_cs = {} # type: Dict[str, Constraint] for candidate_comp, candidate_info in repair_info.items(): try: # This computation is not hosted any more, if we had it in # discovery, forget about it but do not publish this # information, this agent is not responsible for updatings # other's discovery services. self.discovery.unregister_computation(candidate_comp, publish=False) except UnknownComputation: pass agts, _, neighbors = candidate_info # One binary variable for each candidate agent for computation # candidate_comp: v_binvar = create_binary_variables( 'B', ([candidate_comp], candidate_info[0])) # Set initial values for binary decision variable for v in v_binvar.values(): v._intial_value = 1 if random.random() < 1/3 else 0 orphaned_binvars.update(v_binvar) # the variable representing if the computation will be hosted on # this agent: candidate_binvars[(candidate_comp, own_name)] = \ v_binvar[(candidate_comp, own_name)] # the 'hosted' hard constraint for this candidate variable: hosted_cs[candidate_comp] =\ create_computation_hosted_constraint(candidate_comp, v_binvar) self.logger.debug('Hosted hard constraint for computation %s : %r', candidate_comp, hosted_cs[candidate_comp]) # One binary variable for each pair (x_j, a_n) where x_j is an # orphaned neighbors of candidate_comp and a_n is an agent that # could host a_n: for neighbor in neighbors: v_binvar = create_binary_variables( 'B', ([neighbor], neighbors[neighbor])) orphaned_binvars.update(v_binvar) self.logger.debug('Binary variable for reparation %s ', orphaned_binvars) # Agent that will host the computation for each binary var. # it is a dict { bin var name : agent_name } agt_hosting_binvar = {v.name: a for (_, a), v in orphaned_binvars.items()} self.logger.debug('Agents hosting the computations for these binary ' 'variables : %s ', agt_hosting_binvar) # The capacity (hard) constraint for this agent. This ensures that the # capacity of the current agent will not be overflown by hosting too # many candidate computations. This constraints depends on the binary # variables for the candidate computations. remaining_capacity = self.agent_def.capacity - \ sum(c.footprint() for c in self.computations()) self.logger.debug('Remaining capacity on agent %s : %s', self.name, remaining_capacity) def footprint_func(c_name: str): # We have a replica for these computation, we known its footprint. return self.replication_comp.hosted_replicas[c_name][1] capacity_c = create_agent_capacity_constraint( own_name, remaining_capacity, footprint_func, candidate_binvars) self.logger.debug('Capacity constraint for agt %s : %r', self.name, capacity_c) # Hosting costs constraint for this agent. This soft constraint is # used to minimize the hosting costs on this agent ; it depends on # the binary variables for the candidate computations. hosting_c = create_agent_hosting_constraint( own_name, self.agent_def.hosting_cost, candidate_binvars) self.logger.debug('Hosting cost constraint for agt %s : %r', self.name, hosting_c) # The communication constraint. This soft constraints is used to # minimize the communication cost on this agent. As communication # cost depends on where computation on both side of an edge are # hosted, it also depends on the binary variables for orphaned # computations that could not be hosted here. def comm_func(candidate_comp: str, neighbor_comp: str, agt: str): # returns the communication cost between the computation # candidate_name hosted on the current agent and it's neighbor # computation neigh_comp hosted on agt. route_cost = self.agent_def.route(agt) comp_def = self.replication_comp.replicas[candidate_comp] algo = comp_def.algo.algo algo_module = load_algorithm_module(algo) communication_load = algo_module.communication_load msg_load = 0 for l in comp_def.node.neighbors: if l == neighbor_comp: msg_load += communication_load(comp_def.node, neighbor_comp) com_load = msg_load * route_cost return com_load # Now that we have the variables and constraints, we can create # computation instances for each of the variable this agent is # responsible for, i.e. the binary variables x_i^m that correspond to # the candidate variable x_i (and a_m is the current agent) self._repair_computations.clear() algo_def = AlgorithmDef.build_with_default_param( repair_algo.algorithm_name, {'stop_cycle': 20, 'threshold': 0.2}, mode='min', parameters_definitions=repair_algo.algo_params) for (comp, agt), candidate_var in candidate_binvars.items(): self.logger.debug('Building computation for binary variable %s (' 'variable %s on %s)', candidate_var, comp, agt) comm_c = create_agent_comp_comm_constraint( agt, comp, repair_info[comp], comm_func, orphaned_binvars) self.logger.debug('Communication constraint for computation %s ' 'on agt %s : %r', comp, self.name, comm_c) constraints = [comm_c, hosting_c, capacity_c, hosted_cs[comp]] # constraints.extend(hosted_cs.values()) self.logger.debug('Got %s Constraints for var %s : %s ', len(constraints), candidate_var, constraints) node = chg.VariableComputationNode(candidate_var, constraints) comp_def = ComputationDef(node, algo_def) computation = repair_algo.build_computation(comp_def) self.logger.debug('Computation for %s : %r ', candidate_var, computation) # add the computation on this agents and register the neighbors self.add_computation(computation, publish=True) self._repair_computations[computation.name] = \ RepairComputationRegistration(computation, 'ready', comp) for neighbor_comp in node.neighbors: neighbor_agt = agt_hosting_binvar[neighbor_comp] try: self.discovery.register_computation( neighbor_comp, neighbor_agt, publish=False) except UnknownAgent: # If we don't know this agent yet, we must perform a lookup # and only register the computation once found. # Note the use of partial, to force the capture of # neighbor_comp. def _agt_lookup_done(comp, evt, evt_agt, _): if evt == 'agent_added': self.discovery.register_computation( comp, evt_agt, publish=False) self.discovery.subscribe_agent( neighbor_agt, partial(_agt_lookup_done, neighbor_comp), one_shot=True) self.logger.info('Repair setup done one %s, %s computations created, ' 'inform orchestrator', self.name, len(candidate_binvars)) return candidate_binvars def repair_run(self): self.logger.info('Agent runs Repair dcop computations') comps = list(self._repair_computations.values()) for c in comps: c.computation.start() c.status = 'started' def _on_replication_done(self, replica_hosts: Dict[str, List[str]]): """ Called when all computations have been replicated. This method method is meant to the overwritten in subclasses. Parameters ---------- replica_hosts: a map { computation name -> List of agt name } For each active computation hosted by this agent, this map contains a list of agents that have been selected to host a replica. """ self.logger.info('Replica distribution finished for agent ' '%s : %s (level requested : %s)', self.name, replica_hosts, self._replication_level) rep_levels = {computation: len(replica_hosts[computation]) for computation in replica_hosts} if not all([level >= self._replication_level for level in rep_levels.values()]): self.logger.warning('Insufficient replication for computations: ' '%s ', rep_levels) def _on_computation_finished(self, computation: str, *args, **kwargs): self.logger.debug('Computation %s has finished', computation) if self.replication_comp and computation in self._repair_computations: self._on_repair_computation_finished(computation) def _on_repair_computation_finished(self, computation: str): repair_comp = self._repair_computations[computation] repair_comp.status = 'finished' # deploy the computation if it was selected during reparation: if repair_comp.computation.current_value == 1: self.logger.info('Reparation: computation %s selected on %s', repair_comp.candidate, self.name) comp_def = self.replication_comp.replicas[repair_comp.candidate] self.logger.info('Deploying computation %s locally with ' 'definition , %r', repair_comp.candidate, comp_def) comp = build_computation(comp_def) self.add_computation(comp, publish=True) else: self.logger.info('Reparation: computation %s NOT selected on ' '%s', repair_comp.candidate, self.name) # Remove replica: it will be re-replicated by its new host. self.replication_comp.remove_replica(repair_comp.candidate) if all(c.status == 'finished' for c in self._repair_computations.values()): selected_computations = \ [c.candidate for c in self._repair_computations.values() if c.computation.current_value == 1] self.logger.info('All repair computations have finished, ' 'selected computation : %s', selected_computations) metrics = self.metrics() print(f" metrics repair {self.name} - {metrics}") repair_metrics = {'count_ext_msg' : {}, 'size_ext_msg': {} , 'cycles' :{}} for c in self._repair_computations.values(): c_name = c.computation.name if c_name in metrics['count_ext_msg']: repair_metrics['count_ext_msg'][c_name] = metrics['count_ext_msg'][c_name] else: repair_metrics['count_ext_msg'][c_name] = 0 if c_name in metrics['size_ext_msg']: repair_metrics['size_ext_msg'][c_name] = metrics['size_ext_msg'][c_name] else: repair_metrics['size_ext_msg'][c_name] = 0 if c_name in metrics['cycles']: repair_metrics['cycles'][c_name] = metrics['cycles'][c_name] else: repair_metrics['cycles'][c_name] = 0 print(f" {self.name} : metrics after repair {repair_metrics}") self._on_repair_done(selected_computations, repair_metrics) if selected_computations: self.logger.info('Re-replicate newly activated computations ' 'on %s : %s , level %s', self.name, selected_computations, self._replication_level) try: self.replication_comp.replicate(self._replication_level, selected_computations) except UnknownComputation: # avoid crashing if one of the neighbor comp is not repaired yet pass self.logger.info('Starting newly activated computations on ' '%s : %s ', self.name, selected_computations) for selected in selected_computations: self.computation(selected).start() self.computation(selected).pause() # Remove / undeploy repair comp once repaired for repair_comp in self._repair_computations.values(): self.remove_computation(repair_comp.computation.name) self._repair_computations.clear() def _on_repair_done(self, selected_computations: List[str]): """ Called when all repair computations have finished. This method method is meant to the overwritten in subclasses. """ pass class RepairComputation(MessagePassingComputation): """ """ def __init__(self, agent: ResilientAgent): super().__init__('_resilience_' + self.agent.name) self.agent = agent self.logger = logging.getLogger('pydcop.agent.repair.'+agent.name) self._handlers = { #'replication': self._on_replication, # 'setup_repair': self._on_setup_repair, # 'repair_run': self._on_repair_run, } @property def type(self): return 'replication' def on_message(self, var_name, msg, t): self._handlers[msg.type](msg) def footprint(self): return 0 def replication_done(self, replica_hosts: Dict[str, List[str]]): """ Called when all computations have been replicated. The replication algorithm only selects agents to host replicas, here we send the actual computations definitions to the agents selected to host a replica. We also send the obtained replication to the orchestrator. Parameters ---------- replica_hosts: a map { computation name -> List of agt name } For each active computation hosted by this agent, this map contains a list of agents that have been selected to host a replica. """ self.logger.info('Replica distribution finished for agent ' '%s : %s', self.name, replica_hosts) # self.agent.on_replication_done() # dist_msg = ComputationReplicatedMessage(self.name, replica_hosts) # self.message_sender.post_send_to_orchestrator(dist_msg)