from __future__ import absolute_import from __future__ import print_function import atexit import inspect import os import traceback from threading import Lock import collections import math import signal import time import numpy as np import simplejson import six import PIL.Image import sys from aetros.JobModel import JobModel from aetros.client import JobClient from aetros.const import JOB_STATUS from aetros.cuda_gpu import CudaNotImplementedException from aetros.git import Git from aetros.logger import GeneralLogger from aetros.utils import git, invalid_json_values, read_config, is_ignored, prepend_signal_handler, raise_sigint, \ read_parameter_by_path, stop_time, read_home_config, lose_parameters_to_full, extract_parameters, get_logger, \ is_debug, find_config from aetros.MonitorThread import MonitoringThread import subprocess if not isinstance(sys.stdout, GeneralLogger): sys.stdout = GeneralLogger(redirect_to=sys.__stdout__) if not isinstance(sys.stderr, GeneralLogger): sys.stderr = GeneralLogger(redirect_to=sys.__stderr__) last_exit_code = None original_exit = sys.exit def patched_exit(status=None): global last_exit_code last_exit_code = status original_exit(status) sys.exit = patched_exit def on_shutdown(): for job in on_shutdown.started_jobs: job.on_shutdown() on_shutdown.started_jobs = [] atexit.register(on_shutdown) class StdoutApiException(Exception): pass def Popen(*args, **kwargs): """ Executes a command using subprocess.Popen and redirects output to AETROS and stdout. Parses stdout as well for stdout API calls. Use read_line argument to read stdout of command's stdout line by line. Use returned process stdin to communicate with the command. :return: subprocess.Popen """ read_line = None if 'read_line' in kwargs: read_line = kwargs['read_line'] del kwargs['read_line'] p = subprocess.Popen(*args, **kwargs) wait_stdout = None wait_stderr = None if p.stdout: wait_stdout = sys.stdout.attach(p.stdout, read_line=read_line) if p.stderr: wait_stderr = sys.stderr.attach(p.stderr) original_wait = p.wait def wait(): original_wait() if wait_stdout: wait_stdout() if wait_stderr: wait_stderr() p.wait = wait return p def dict_factory(cursor, row): d = {} for idx, col in enumerate(cursor.description): d[col[0]] = row[idx] return d class EventListener: def __init__(self): self.events = {} def on(self, name, callback): if name not in self.events: self.events[name] = [] self.events[name].append(callback) def fire(self, name, parameter=None): if name in self.events: for callback in self.events[name]: callback(parameter) def context(): """ Returns a new JobBackend instance which connects to AETROS Trainer based on "model" in aetros.yml or (internal: env:AETROS_MODEL_NAME environment variable). internal: If env:AETROS_JOB_ID is not defined, it creates a new job. Job is ended either by calling JobBackend.done(), JobBackend.fail() or JobBackend.abort(). If the script ends without calling one of the methods above, JobBackend.stop() is called and exit code defines whether it is a fail() or done() result. :return: JobBackend """ job = JobBackend() offline = False if '1' == os.getenv('AETROS_OFFLINE', ''): offline = True if os.getenv('AETROS_JOB_ID'): job.load(os.getenv('AETROS_JOB_ID')) if not offline: job.connect() else: job.create() if not offline: job.connect() job.start(offline=offline) return job class JobLossChannel: """ :type job_backend : JobBackend """ def __init__(self, job_backend, name, xaxis=None, yaxis=None, layout=None): self.name = name self.job_backend = job_backend message = { 'name': self.name, 'traces': [{'name': 'training'}, {'name': 'validation'}], 'type': JobChannel.NUMBER, 'main': True, 'timed': True, 'xaxis': xaxis, 'yaxis': yaxis, 'layout': layout, 'lossChannel': True } self.lock = Lock() self.job_backend.git.commit_json_file('CREATE_CHANNEL', 'aetros/job/channel/' + name+ '/config', message) self.stream = self.job_backend.git.stream_file('aetros/job/channel/' + name+ '/data.csv') self.stream.write('"time", "x","training","validation"\n') def send(self, x, training, validation): line = simplejson.dumps([self.job_backend.get_run_time(), x, training, validation])[1:-1] self.lock.acquire() try: self.stream.write(line + "\n") self.job_backend.git.store_file('aetros/job/channel/' + self.name + '/last.csv', line) finally: self.lock.release() class JobImage: def __init__(self, name, pil_image, label=None, pos=None): self.id = name if not isinstance(pil_image, PIL.Image.Image): raise Exception('JobImage requires a PIL.Image as image argument.') self.image = pil_image self.label = label self.pos = pos if self.pos is None: self.pos = time.time() class JobChannel: NUMBER = 'number' TEXT = 'text' """ :type job_backend: JobBackend """ def __init__(self, job_backend, name, traces=None, main=False, kpi=False, kpiTrace=0, max_optimization=True, type=None, xaxis=None, yaxis=None, layout=None): """ :param job_backend: JobBakend :param name: str :param traces: None|list : per default create a trace based on "name". :param main: bool : whether this channel is visible in the job list as column for better comparison. :param kpi: bool : whether this channel is the KPI (key performance indicator). Used for hyperparameter optimization. Only one channel can be a kpi. Only first trace used. :param kpiTrace: bool : if you have multiple traces, define which is the KPI. 0 based index. :param max_optimization: bool : whether the optimization maximizes or minmizes the kpi . Use max_optimization=False to tell the optimization algorithm that his channel minimizes a kpi, for instance the loss of a model. :param type: str : One of JobChannel.NUMBER, JobChannel.TEXT, JobChannel.IMAGE :param xaxis: dict :param yaxis: dict :param layout: dict """ self.name = name self.job_backend = job_backend self.kpi = kpi self.kpiTrace = kpiTrace self.lock = Lock() if self.kpi: self.job_backend.kpi_channel = self if not (isinstance(traces, list) or traces is None): raise Exception( 'traces can only be None or a list of dicts: [{name: "name", option1: ...}, {name: "name2"}, ...]') if not traces: traces = [{'name': ''}] if isinstance(traces, list) and isinstance(traces[0], six.string_types): traces = list(map(lambda x: {'name': x}, traces)) message = { 'name': name, 'traces': traces, 'type': type or JobChannel.NUMBER, 'main': main, 'kpi': kpi, 'timed': True, 'kpiTrace': kpiTrace, 'maxOptimization': max_optimization, 'xaxis': xaxis, 'yaxis': yaxis, 'layout': layout, } self.traces = traces self.job_backend.git.commit_json_file('CREATE_CHANNEL', 'aetros/job/channel/' + name+ '/config', message) self.stream = self.job_backend.git.stream_file('aetros/job/channel/' + name+ '/data.csv') if self.kpi: self.job_backend.git.commit_file('KPI_CHANNEL', 'aetros/job/kpi/name', name) line = simplejson.dumps(['time', 'x'] + [str(x['name']) for x in traces])[1:-1] self.stream.write(line + "\n") def send(self, x, y): if not isinstance(y, list): y = [y] if len(y) != len(self.traces): raise Exception( 'You tried to set more y values (%d items) then traces available in channel %s (%d traces).' % ( len(y), self.name, len(self.traces))) for v in y: if not isinstance(v, (int, float)) and not isinstance(v, six.string_types): raise Exception('Could not send channel value for ' + self.name+' since type ' + type(y).__name__+' is not supported. Use int, float or string values.') line = simplejson.dumps([self.job_backend.get_run_time(), x] + y)[1:-1] self.lock.acquire() try: self.stream.write(line + "\n") self.job_backend.git.store_file('aetros/job/channel/' + self.name + '/last.csv', line) finally: self.lock.release() if self.kpi: self.job_backend.git.store_file('aetros/job/kpi/last.json', simplejson.dumps(y[self.kpiTrace])) class JobBackend: """ :type event_listener: EventListener :type id: str: model name :type job_id: str: job id :type client: Client :type job: dict """ def __init__(self, model_name=None, logger=None, config_path='aetros.yml', name=None): self.event_listener = EventListener() self.log_file_handle = None self.job = {'parameters': {}} self.git = None self.ssh_stream = None self.model_name = model_name self.logger = logger self.config_path = config_path self.name = name if not self.name: if os.getenv('AETROS_JOB_NAME'): self.name = os.getenv('AETROS_JOB_NAME') else: self.name = 'master' self.client = None self.stream_log = None self.speed_stream = None self.parameter_change_callback = None self.lock = Lock() self.last_speed = 0 self.last_speed_time = 0 self.last_step_time = time.time() self.last_step = 0 self.made_steps_since_last_sync = 0 self.made_steps_size_since_last_sync = 0 self.current_epoch = 0 self.total_epochs = 0 self.step_label = None self.step_speed_label = None # indicates whether early_stop() has been called. Is called by reaching maxTimes or maxEpochs limitation. # This flag stops exiting with > 0, since the reach of a limitation is a valid exit. self.in_early_stop = False # ended means: done, abort or fail method has been called. self.ended = False # when stop(wait_for_client=True) is called, we sync last messages. # this flag indicates that end() hasn't been called yet self.stopped = False # running means: the syncer client is running. self.running = False # whether it has started once self.started = False self.start_time = time.time() # whether we are in paused state self.is_paused = False # whether on_shutdown has been called and thus the python interpreter is dying. self.in_shutdown = False self.insight_images_info = {} self.insight_created = [] self.monitoring_thread = None if not self.logger: self.logger = get_logger(os.getenv('AETROS_JOB_NAME', 'aetros-job')) self.last_progress_call = None self.job_ids = [] self.in_request = False self.stop_requested = False self.stop_requested_force = False self.kpi_channel = None self.progresses = {} self.event_listener.on('stop', self.external_stop) self.event_listener.on('aborted', self.external_aborted) self.event_listener.on('registration', self.on_registration) self.event_listener.on('registration_failed', self.on_registration_failed) self.event_listener.on('offline', self.on_client_offline) self.event_listener.on('parameter_changed', self.on_parameter_changed) self.event_listener.on('action', self.on_action) if hasattr(signal, 'SIGUSR1'): prepend_signal_handler(signal.SIGUSR1, self.on_signusr1) self.pid = os.getpid() self.ensure_model_name() self.home_config = read_home_config() self.client = JobClient(self.home_config, self.event_listener, self.logger) self.git = Git(self.logger, self.client, self.home_config, self.model_name, self.is_master_process()) self.logger.debug("Started tracking of job files in git %s for remote %s" % (self.git.git_path, self.git.origin_url)) @property def log_level(self): if os.getenv('DEBUG') == '1': return 'DEBUG' return 'INFO' @property def host(self): return self.home_config['host'] def get_run_time(self, precision=3): return round(time.time() - self.start_time, precision) def section(self, title): title = title.replace("\t", " ") seconds = self.get_run_time() line = "## %s\t%.2f\n" % (title, seconds) sys.stdout.write(line) sys.stdout.flush() def on_registration_failed(self, params): self.logger.warning("Connecting to AETROS Trainer at %s failed. Reasons: %s" % (self.host, params['reason'],)) if 'Permission denied' in params['reason']: self.logger.warning("Make sure you have saved your ssh pub key in your AETROS Trainer user account.") def on_client_offline(self, params): if self.is_master_process(): self.logger.warning("Could not establish a connection. We stopped automatic syncing.") self.logger.warning("You can publish later this job to AETROS Trainer by executing following command.") self.logger.warning("$ aetros job-push " + self.job_id[0:9]) self.git.online = False def on_registration(self, params): pass def on_signusr1(self, signal, frame): self.logger.warning("USR1: backend job_id=%s (running=%s, ended=%s), client (online=%s, active=%s, registered=%s, " "connected=%s, queue=%s), git (active_thread=%s, last_push_time=%s)." % ( str(self.job_id), str(self.running), str(self.ended), str(self.client.online), str(self.client.active), str(self.client.registered), str(self.client.connected), str([str(i)+':'+str(len(x)) for i, x in six.iteritems(self.client.queues)]), str(self.git.active_thread), str(self.git.last_push_time), )) def on_force_exit(self): """ External hook. """ pass def on_sigint(self, sig, frame): """ We got SIGINT signal. """ if self.stop_requested or self.stop_requested_force: # signal has already been sent or we force a shutdown. # handles the keystroke 2x CTRL+C to force an exit. self.stop_requested_force = True self.logger.warning('Force stopped: ' + str(sig)) # just kill the process, we don't care about the results self.on_force_exit() os._exit(1) # with force_exit we really close the process, killing it in unknown state # self.fail('Force stopped', force_exit=True) # return if self.is_master_process(): self.logger.warning('Received signal '+str(sig)+'. Send again to force stop. Stopping ...') else: self.logger.debug("Got child signal " + str(sig)) self.stop_requested = True # the default SIGINT handle in python is not always installed, so we can't rely on the # KeyboardInterrupt exception to be thrown. # thread.interrupt_main would call sigint again. # the shutdown listener will do the rest like committing rest memory files into Git and closing connections. sys.exit(0 if self.in_early_stop else 1) def external_aborted(self, params): """ Immediately abort the job by server. This runs in the Client:read() thread. """ self.ended = True self.running = False # When the server sends an abort signal, we really have to close immediately, # since for example the job has been already deleted. # without touching the git and client any further os._exit(1) def external_stop(self, force): """ Stop signal by server. """ # only the master processes handles the regular stop signal from the server, sending a SIGINT to # all its child (means to us, non-master process) if not self.is_master_process(): if force: # make sure even the subprocess dies really on force os._exit(1) return self.logger.warning("Received stop signal by server.") if not self.stop_requested_force: self.stop_requested_force = force raise_sigint() def early_stop(self): """ Stop when a limitation is reached (like maxEpoch, maxTime). """ self.in_early_stop = True raise_sigint() def batch(self, batch, total, size, label='BATCH', speed_label='SAMPLES/S'): self.step(batch, total, size=size, label=label, speed_label=speed_label) def sample(self, sample, total, label='SAMPLE', speed_label='SAMPLES/S'): self.step(sample, total, size=1, label=label, speed_label=speed_label) def step(self, step, total, label='STEP', speed_label='STEPS/S', size=1): """ Increase the step indicator, which is a sub progress circle of the actual main progress circle (epoch, progress() method). """ self.lock.acquire() try: time_diff = time.time() - self.last_step_time if self.last_step > step: # it restarted self.last_step = 0 made_steps_since_last_call = step - self.last_step self.last_step = step self.made_steps_since_last_sync += made_steps_since_last_call self.made_steps_size_since_last_sync += made_steps_since_last_call * size if time_diff >= 1 or step == total: # only each second or last batch self.set_system_info('step', step, True) self.set_system_info('steps', total, True) steps_per_second = self.made_steps_since_last_sync / time_diff samples_per_second = self.made_steps_size_since_last_sync / time_diff self.last_step_time = time.time() if size: self.report_speed(samples_per_second) epochs_per_second = steps_per_second / total # all batches self.set_system_info('epochsPerSecond', epochs_per_second, True) current_epochs = self.current_epoch if self.current_epoch else 1 total_epochs = self.total_epochs if self.total_epochs else 1 self.made_steps_since_last_sync = 0 self.made_steps_size_since_last_sync = 0 eta = 0 if step < total: # time to end this epoch if steps_per_second != 0: eta = (total - step) / steps_per_second # time until all epochs are done if total_epochs - current_epochs > 0: if epochs_per_second != 0: eta += (total_epochs - (current_epochs)) / epochs_per_second self.git.store_file('aetros/job/times/eta.json', simplejson.dumps(eta)) if label and self.step_label != label: self.set_system_info('stepLabel', label, True) self.step_label = label if speed_label and self.step_speed_label != speed_label: self.set_system_info('stepSpeedLabel', speed_label, True) self.step_speed_label = speed_label finally: self.lock.release() def report_speed(self, speed, x=None, label=None): if not self.is_master_process(): self.stdout_api_call('speed', x=x, speed=speed, label=label) return self.last_speed = speed self.last_speed_time = time.time() if x is None: x = round(time.time()-self.start_time, 3) self.set_system_info('samplesPerSecond', speed, True) self.speed_stream.write(simplejson.dumps([x, speed])[1:-1] + "\n") if label and self.step_speed_label != label: self.set_system_info('stepSpeedLabel', label, True) self.step_speed_label = label def stdout_api_call(self, command, **kwargs): action = {'aetros': command} action.update(kwargs) print(simplejson.dumps(action)) @property def job_settings(self): if 'settings' in self.job['config']: return self.job['config']['settings'] return {} def set_parameter_change_callback(self, callback): self.parameter_change_callback = callback def on_parameter_changed(self, params): pass def create_progress(self, name, total_steps=100): if name in self.progresses: return self.progresses[name] class Controller(): def __init__(self, git, name, total_steps=100): self.started = False self.stopped = False self.lock = Lock() self.name = name self.step = 0 self.steps = total_steps self.eta = 0 self.last_call = 0 self.git = git self._label = name self.store() def store(self): info = { 'label': self._label, 'started': self.started, 'stopped': self.stopped, 'step': self.step, 'steps': self.steps, 'eta': self.eta, } self.git.store_file('aetros/job/progress/' + self.name + '.json', simplejson.dumps(info)) def label(self, label): self._label = label self.store() def start(self): if self.started is not False: return self.step = 0 self.started = time.time() self.last_call = time.time() self.store() def stop(self): if self.stopped is not False: return self.stopped = time.time() self.store() def advance(self, steps=1): if steps <= 0: return self.lock.acquire() if self.started is False: self.start() took = (time.time() - self.last_call) / steps self.last_call = time.time() self.step += steps self.eta = took * (self.steps - self.step) if self.step >= self.steps: self.stop() self.store() self.lock.release() self.progresses[name] = Controller(self.git, name, total_steps) return self.progresses[name] def epoch(self, epoch=None, total=None): self.progress(epoch, total) def progress(self, progress=None, total=None): self.current_epoch = self.current_epoch if progress is None else progress self.total_epochs = self.total_epochs if total is None else total epoch_limit = False if 'maxEpochs' in self.job['config'] and isinstance(self.job['config']['maxEpochs'], int) and self.job['config']['maxEpochs'] > 0: epoch_limit = True self.total_epochs = self.job['config']['maxEpochs'] if self.current_epoch is not 0 and self.last_progress_call: # how long took it since the last call? time_per_epoch = time.time() - self.last_progress_call eta = time_per_epoch * (self.total_epochs - self.current_epoch) if self.current_epoch > self.total_epochs: eta = 0 self.git.store_file('aetros/job/times/eta.json', simplejson.dumps(eta)) if time_per_epoch > 0: self.set_system_info('epochsPerSecond', 1 / time_per_epoch, True) self.set_system_info('epoch', self.current_epoch, True) self.set_system_info('epochs', self.total_epochs, True) self.last_progress_call = time.time() if epoch_limit and self.total_epochs > 0: if self.current_epoch >= self.total_epochs: self.logger.warning("Max epoch of "+str(self.total_epochs)+" reached") self.early_stop() return def create_loss_channel(self, name='loss', xaxis=None, yaxis=None, layout=None): """ :param name: string :return: JobLossGraph """ return JobLossChannel(self, name, xaxis, yaxis, layout) def create_channel(self, name, traces=None, main=False, kpi=False, kpiTrace=0, max_optimization=True, type=JobChannel.NUMBER, xaxis=None, yaxis=None, layout=None): """ :param name: str :param traces: None|list : per default create a trace based on "name". :param main: bool : whether this channel is visible in the job list as column for better comparison. :param kpi: bool : whether this channel is the KPI (key performance indicator). Used for hyperparameter optimization. Only one channel can be a kpi. Only first trace used. :param kpiTrace: bool : if you have multiple traces, define which is the KPI. 0 based index. :param max_optimization: bool : whether the optimization maximizes or minmizes the kpi. Use max_optimization=False to tell the optimization algorithm that his channel minimizes a kpi, for instance the loss of a model. :param type: str : One of JobChannel.NUMBER, JobChannel.TEXT :param xaxis: dict :param yaxis: dict :param layout: dict """ return JobChannel(self, name, traces, main, kpi, kpiTrace, max_optimization, type, xaxis, yaxis, layout) def connect(self): self.client.configure(self.model_name, self.job_id, self.name) return self.client.start(['', 'files']) def start(self, collect_system=True, offline=False, push=True): if self.started: raise Exception('Job was already started.') if self.running: raise Exception('Job already running.') if not self.job_id: raise Exception('No job id found. Use create() first.') if not self.job: raise Exception('Job not loaded') prepend_signal_handler(signal.SIGINT, self.on_sigint) self.start_time = time.time() self.started = True self.running = True self.ended = False on_shutdown.started_jobs.append(self) self.client.configure(self.model_name, self.job_id, self.name) if not offline: # Marks client as active if not already. If not already starts to connect to the server self.client.start(['', 'files']) else: self.logger.debug('Job backend not started since offline.') if self.is_master_process(): # this is the process that actually starts the job. # other sub-processes may only modify other data. self.git.commit_file('JOB_STARTED', 'aetros/job/times/started.json', simplejson.dumps(self.start_time)) self.job_add_status('progress', JOB_STATUS.PROGRESS_STATUS_STARTED) self.git.store_file('aetros/job/times/elapsed.json', str(0)) if collect_system: self.collect_system_information() self.collect_environment() # make sure we get the progress first, before monitoring sends elapses and # updates the job cache if not offline and self.client.is_online() and push: self.git.push() self.git.start_push_sync() if collect_system: self.start_monitoring() # log stdout to Git by using self.write_log -> git:stream_file self.stream_log = self.git.stream_file('aetros/job/log.txt', fast_lane=False) self.speed_stream = self.git.stream_file('aetros/job/speed.csv') header = ["x", "speed"] self.speed_stream.write(simplejson.dumps(header)[1:-1] + "\n") if isinstance(sys.stdout, GeneralLogger): sys.stdout.job_backend = self sys.stdout.flush() if isinstance(sys.stderr, GeneralLogger): sys.stderr.job_backend = self sys.stdout.flush() else: # if this process has been called within another process that is already using JobBackend. # we disable some stuff if isinstance(sys.stdout, GeneralLogger) and not sys.stderr.job_backend: sys.stdout.disable_buffer() if isinstance(sys.stderr, GeneralLogger) and not sys.stderr.job_backend: sys.stderr.disable_buffer() def set_paused(self, v): self.is_paused = v self.set_system_info('paused', self.is_paused, True) def is_master_process(self): """ Master means that aetros.backend.start_job() has been called without using the command `aetros start`. If master is true, we collect and track some data that usually `aetros start` would do and reset the job's temp files on the server. :return: """ return os.getenv('AETROS_JOB_ID') is None def detect_git_version(self, working_dir=None): current_dir = os.getcwd() try: if working_dir: os.chdir(working_dir) with self.git.batch_commit('Git Version'): value = git.get_current_remote_url() if value: self.set_system_info('git_remote_url', value) value = git.get_current_commit_hash() if value: self.set_system_info('git_version', value) value = git.get_current_branch() if value: self.set_system_info('git_branch', value) value = git.get_current_commit_message() if value: self.set_system_info('git_commit_message', value) value = git.get_current_commit_author() if value: self.set_system_info('git_commit_author', value) finally: if working_dir: os.chdir(current_dir) def start_monitoring(self, cpu_cores=1, gpu_devices=None, docker_container=None): if not self.monitoring_thread: self.monitoring_thread = MonitoringThread(self, cpu_cores, gpu_devices, docker_container) self.monitoring_thread.daemon = True self.monitoring_thread.start() def create_keras_callback(self, model, insights=False, insights_x=None, additional_insights_layer=[], confusion_matrix=False, validation_data=None, validation_data_size=None): """ :type validation_data: int|None: (x, y) or generator :type validation_data_size: int|None: Defines the size of validation_data, if validation_data is a generator """ if insights and (insights_x is None or insights_x is False): raise Exception('Can not build Keras callback with active insights but with invalid `insights_x` as input.') if confusion_matrix and (validation_data is None or validation_data is False): raise Exception('Can not build Keras callback with active confusion_matrix but with invalid `validation_data` as input.') from aetros.KerasCallback import KerasCallback self.callback = KerasCallback(self, self.logger, force_insights=insights) self.callback.insights_x = insights_x self.callback.insight_layer = additional_insights_layer self.callback.confusion_matrix = confusion_matrix self.callback.set_validation_data(validation_data, validation_data_size) return self.callback def upload_keras_graph(self, model): from aetros.keras import model_to_graph import keras if keras.__version__[0] == '2': graph = model_to_graph(model) self.set_graph(graph) def on_shutdown(self): """ Shutdown routine. Sets the last progress (done, aborted, failed) and tries to send last logs and git commits. Also makes sure the ssh connection is closed (thus, the job marked as offline). Is triggered by atexit.register(). """ self.in_shutdown = True self.logger.debug('on_shutdown, stopped=%s, ended=%s, early_stop=%s, stop_requested=%s' % (str(self.stopped), str(self.ended), str(self.in_early_stop), str(self.stop_requested))) if self.stopped or self.ended: # make really sure, ssh connection closed self.client.close() return if self.in_early_stop: self.done() return if self.stop_requested: # when SIGINT has been triggered if self.stop_requested_force: if not self.is_master_process(): # if not master process, we just stop everything. status/progress is set by master self.stop(force_exit=True) else: # master process self.fail('Force stopped.', force_exit=True) else: if not self.is_master_process(): # if not master process, we just stop everything. status/progress is set by master self.stop() else: # master process self.abort() return if hasattr(sys, 'last_value'): # sys.last_value contains a exception, when there was an uncaught one if isinstance(sys.last_value, KeyboardInterrupt): # can only happen when KeyboardInterrupt has been raised manually # since the one from the default sigint handler will never reach here # since we catch the sigint signal and sys.exit() before the default sigint handler # is able to raise KeyboardInterrupt self.abort() else: self.fail(type(sys.last_value).__name__ + ': ' + str(sys.last_value)) elif self.running: self.done() def done(self, force_exit=False): if not self.running: return self.stop(JOB_STATUS.PROGRESS_STATUS_DONE, force_exit=force_exit) def send_std_buffer(self): if isinstance(sys.stdout, GeneralLogger): sys.stdout.send_buffer() if isinstance(sys.stderr, GeneralLogger): sys.stderr.send_buffer() def stop(self, progress=None, force_exit=False): global last_exit_code if self.stopped: return if self.is_master_process(): self.section('Ended') self.logger.debug("stop: " + str(progress)) self.send_std_buffer() self.stopped = True self.ended = True self.running = False if self.is_master_process() and progress is not None: # if not master process, the master process will set it self.job_add_status('progress', progress) exit_code = last_exit_code or 0 if progress == JOB_STATUS.PROGRESS_STATUS_DONE: exit_code = 0 if progress == JOB_STATUS.PROGRESS_STATUS_ABORTED: exit_code = 1 if progress == JOB_STATUS.PROGRESS_STATUS_FAILED: exit_code = 2 if self.is_master_process(): self.set_system_info('exit_code', exit_code) # stop push thread and commit STREAMED/STORE END files in local git self.logger.debug("Git stopping ...") self.git.stop() if self.client.is_online() and not force_exit: # make sure all queues are empty and everything has been sent self.logger.debug("Wait for queue empty and store Git blobs on server: master=" +str(self.is_master_process())) # Say server to store received files as blob in remote Git self.client.send({'type': 'sync-blob'}, channel='') self.client.send({'type': 'sync-blob'}, channel='files') report = self.is_master_process() or is_debug() if report: sys.stdout.write("Uploading last job data ... ") self.client.wait_until_queue_empty(['', 'files'], report=report, clear_end=True) self.logger.debug("Blobs remotely stored, build latest git pack now") # all further client.send calls won't be included in the final git push calculation # and might be sent again. failure_in_last_sync = False # do now the last final git push, where we upload commits and trees. # blobs should be added already via streaming if self.is_master_process(): # non-master commit and upload only. # master tracks what commits habe been sent already if self.git.push() is False: failure_in_last_sync = True # send all last messages and git pack self.logger.debug("Last wait_until_queue_empty") self.client.wait_until_queue_empty(['', 'files'], report=report, clear_end=False) # it's important to have it here, since its tracks not only hardware but also network speed # for uploading last messages and Git. # Also, after each message we get from this thread on the server, we check if the job # should be ended/terminated or not. if self.monitoring_thread: self.monitoring_thread.stop() self.monitoring_thread.join() # send last monitoring stuff and close channels self.client.wait_sending_last_messages() if self.is_master_process(): sys.stdout.write(" done.\n") # wait for end of client. Server will now close connection when ready. self.client.end() if self.is_master_process(): # check if we have uncommitted stuff objects_to_sync, types = self.git.diff_objects(self.git.get_head_commit()) if objects_to_sync: failure_in_last_sync = True if self.is_master_process() and failure_in_last_sync: self.logger.warning("Not all job data have been uploaded.") self.logger.warning("Please run following command to make sure your job is stored on the server.") self.logger.warning("$ aetros job-push " + self.job_id[0:9]) elif self.is_master_process(): if force_exit: self.logger.warning("Not all job data have been uploaded because you force the exit.") else: self.logger.warning("Not all job data have been uploaded because you went offline.") self.logger.warning("Run following command to make sure your job is stored on the server.") self.logger.warning("$ aetros job-push " + self.job_id[0:9]) if self.is_master_process(): # remove the index file # non-master use the same as master, so master cleans up self.git.clean_up() # make sure client is really stopped self.client.close() self.logger.debug("Stopped %s with last commit %s." % (self.git.ref_head, self.git.get_head_commit())) if force_exit: self.on_force_exit() os._exit(exit_code) elif not self.in_shutdown: sys.exit(exit_code) def abort(self, force_exit=False): if not self.running: return self.set_status('ABORTED', add_section=False) self.stop(JOB_STATUS.PROGRESS_STATUS_ABORTED, force_exit=force_exit) def fail(self, message=None, force_exit=False): """ Marks the job as failed, saves the given error message and force exists the process when force_exit=True. """ global last_exit_code if not last_exit_code: last_exit_code = 1 with self.git.batch_commit('FAILED'): self.set_status('FAILED', add_section=False) self.git.commit_json_file('FAIL_MESSAGE', 'aetros/job/crash/error', str(message) if message else '') if isinstance(sys.stderr, GeneralLogger): self.git.commit_json_file('FAIL_MESSAGE_LAST_LOG', 'aetros/job/crash/last_message', sys.stderr.last_messages) self.logger.debug('Crash report stored in commit ' + self.git.get_head_commit()) self.stop(JOB_STATUS.PROGRESS_STATUS_FAILED, force_exit=force_exit) def write_log(self, message): """ Proxy method for GeneralLogger. """ if self.stream_log and not self.ended: # points to the Git stream write self.stream_log.write(message) return True def set_status(self, status, add_section=True): """ Set an arbitrary status, visible in the big wheel of the job view. """ status = str(status) if add_section: self.section(status) self.job_add_status('status', status) @property def job_id(self): return self.git.job_id @property def config(self): return self.job['config'] def create(self, create_info=None, hyperparameter=None, server='local', insights=False): """ Creates a new job in git and pushes it. :param create_info: from the api.create_job_info(id). Contains the config and job info (type, server) :param hyperparameter: simple nested dict with key->value, which overwrites stuff from aetros.yml :param server: if None, the the job will be assigned to a server. :param insights: whether you want to activate insights (for simple models) """ if not create_info: create_info = { 'server': server, 'config': { 'insights': insights, 'command': ' '.join(sys.argv) } } config = find_config(self.config_path, logger=self.logger) if not config['model']: raise Exception('AETROS config file (aetros.yml) not found.') # first transform simple format in the full definition with parameter types # (string, number, group, choice_group, etc) full_hyperparameters = lose_parameters_to_full(config['parameters']) # now extract hyperparameters from full definition, and overwrite stuff using # incoming_hyperparameter if available hyperparameter = extract_parameters(full_hyperparameters, hyperparameter) create_info['config']['parameters'] = hyperparameter self.job = create_info if 'server' not in self.job and server: # setting this disables server assignment self.job['server'] = server self.job['optimization'] = None self.job['type'] = 'custom' if 'parameters' not in self.job['config']: self.job['config']['parameters'] = {} if 'insights' not in self.job['config']: self.job['config']['insights'] = insights self.job['created'] = time.time() self.git.create_job_id(self.job) self.logger.debug("Job created with Git ref " + self.git.ref_head) return self.job_id def is_simple_model(self): if not self.job: raise Exception('Job not loaded yet. Use load(id) first.') if 'type' in self.job: return self.job['type'] == 'simple' return False def ensure_model_name(self, model_name=None): if self.model_name: return if self.job and 'model' in self.job: return self.job['model'] config = find_config(self.config_path, logger=self.logger) self.logger.debug('config: ' + simplejson.dumps(config)) if model_name is None: model_name = os.getenv('AETROS_MODEL_NAME') if model_name is None: if 'model' not in config or not config['model']: sys.stderr.write('Error: No AETROS Trainer model name given. Specify it in aetros.yml `model: user/model-name` or use "aetros init model-name".\n') sys.exit(2) self.model_name = config['model'] else: self.model_name = model_name def get_parameter(self, path, default=None, return_group=False): """ Reads hyperparameter from job configuration. If nothing found use given default. :param path: str :param default: * :param return_group: If true and path is a choice_group, we return the dict instead of the group name. :return: * """ value = read_parameter_by_path(self.job['config']['parameters'], path, return_group) if value is None: return default return value def fetch(self, job_id): """ Fetches the job from the server updating the job ref. """ self.git.fetch_job(job_id) def load(self, job_id): """ Loads job into index and work-tree, restart its ref and sets as current. :param job_id: int """ self.git.read_job(job_id, checkout=self.is_master_process()) self.load_job_from_ref() def load_job_from_ref(self): """ Loads the job.json into self.job """ if not self.job_id: raise Exception('Job not loaded yet. Use load(id) first.') if not os.path.exists(self.git.work_tree + '/aetros/job.json'): raise Exception('Could not load aetros/job.json from git repository. Make sure you have created the job correctly.') with open(self.git.work_tree + '/aetros/job.json') as f: self.job = simplejson.loads(f.read(), object_pairs_hook=collections.OrderedDict) if not self.job: raise Exception('Could not parse aetros/job.json from git repository. Make sure you have created the job correctly.') self.logger.debug('job: ' + str(self.job)) def restart(self, job_id): self.git.read_job(job_id, checkout=True) progress = self.git.contents('aetros/job/status/progress.json') if progress is not None: progress = float(progress) else: progress = 0 if progress >= 2: self.logger.error('You can not restart an existing job that was already running. You need to restart the ' 'job through AETROS Trainer. progress='+str(progress)) sys.exit(1) self.load_job_from_ref() def get_job_model(self): """ Returns a new JobModel instance with current loaded job data attached. :return: JobModel """ if not self.job: raise Exception('Job not loaded yet. Use load(id) first.') return JobModel(self.job_id, self.job, self.home_config['storage_dir']) def sync_weights(self, push=True): if not os.path.exists(self.get_job_model().get_weights_filepath_latest()): return self.logger.debug("sync weights...") self.set_status('SYNC WEIGHTS', add_section=False) with open(self.get_job_model().get_weights_filepath_latest(), 'rb') as f: import keras.backend self.git.commit_file('Added weights', 'aetros/weights/latest.hdf5', f.read()) image_data_format = None if hasattr(keras.backend, 'set_image_data_format'): image_data_format = keras.backend.image_data_format() info = { 'framework': 'keras', 'backend': keras.backend.backend(), 'image_data_format': image_data_format } self.git.commit_file('Added weights', 'aetros/weights/latest.json', simplejson.dumps(info)) if push: self.git.push() # todo, implement optional saving of self.get_job_model().get_weights_filepath_best() def job_add_status(self, key, value): path = 'aetros/job/status/' + key + '.json' data = simplejson.dumps(value, default=invalid_json_values) self.git.commit_file('STATUS ' + str(value), path, data) if self.client.is_online(): # just so have it faster self.client.send({'type': 'store-blob', 'path': path, 'data': data}, channel='') def set_info(self, name, value, commit_end_of_job=False): if commit_end_of_job: self.git.store_file('aetros/job/info/' + name + '.json', simplejson.dumps(value, default=invalid_json_values)) else: self.git.commit_json_file('INFO ' + name, 'aetros/job/info/' + name, value) def set_graph(self, graph): self.git.commit_json_file('GRAPH', 'aetros/job/graph', graph) def set_system_info(self, key, value, commit_end_of_job=False): if commit_end_of_job: self.git.store_file('aetros/job/system/' + key + '.json', simplejson.dumps(value, default=invalid_json_values)) else: self.git.commit_json_file('SYSTEM_INFO ' + key, 'aetros/job/system/' + key, value) def commit_file(self, path, git_path=None, title=None): path = os.path.expanduser(path).strip() if not git_path: git_path = os.path.relpath(path, os.getcwd()) git_path = git_path.replace('../', '') git_path = git_path.replace('./', '') with self.git.batch_commit('FILE ' + (title or git_path)): if os.path.isdir(path): for file in os.listdir(path): self.commit_file(path + '/' + file) return if os.path.getsize(path) > 10 * 1024 * 1024: self.logger.error('Can not upload files bigger than 10MB: ' + str(path)) return with open(path, 'rb') as f: contents = f.read() self.git.commit_file('FILE ' + (title or git_path), git_path, contents) registered_actions = {} def register_action(self, callback, name=None, label=None, description=None): if name is None: name = callback.__name__ args = {} inspect_args = inspect.getargspec(callback) if inspect_args.args: defaults = inspect_args.defaults if inspect_args.defaults else [] start_default_idx = len(inspect_args.args) - len(defaults) for idx, argname in enumerate(inspect_args.args): args[argname] = {'default': None, 'type': 'mixed'} if idx >= start_default_idx: default_value = defaults[idx - start_default_idx] arg_type = 'mixed' if isinstance(default_value, six.string_types): arg_type = 'string' if isinstance(default_value, int): arg_type = 'integer' if isinstance(default_value, float): arg_type = 'float' if isinstance(default_value, bool): arg_type = 'bool' args[argname] = {'default': default_value, 'type': arg_type} value = { 'label': label, 'description': description, 'args': args, 'instance': self.name } self.git.store_file('aetros/job/actions/' + name + '/config.json', simplejson.dumps(value, default=invalid_json_values)) self.registered_actions[name] = value value['callback'] = callback def on_pause(self): pass def on_continue(self): pass def on_action(self, params): action_id = params['id'] action_name = params['name'] action_value = params['value'] if action_name in ['pause', 'continue']: try: if action_name == 'pause': return self.on_pause() if action_name == 'continue': return self.on_continue() except SystemExit: raise except KeyboardInterrupt: raise except Exception as e: traceback.print_exc() self.logger.warning("Trigger action %s failed: %s" % (action_name, type(e).__name__ + ': ' + str(e))) if action_name not in self.registered_actions: # Received action ' + str(action_name) + ' but no callback registered.') return self.logger.debug("Trigger action: " + str(params)) self.logger.info("Trigger action %s(%s)" %( action_name, str(action_value))) config = self.registered_actions[action_name] callback = config['callback'] action = { 'name': action_name, 'value': action_value, 'time': time.time(), } self.git.store_file( 'aetros/job/actions/' + str(action_name) + '/result/' + str(action_id) + '.json', simplejson.dumps(action, default=invalid_json_values) ) def done(value): result = { 'value': value, 'time': time.time() } # if value is binary, include mimetype and save it in a separate file self.git.store_file( 'aetros/job/actions/' + str(action_name) + '/result/' + str(action_id) + '.json', simplejson.dumps(result, default=invalid_json_values) ) kwargs = {} try: if action_value: kwargs = action_value if 'done' in config['args']: kwargs['done'] = done result = callback(**kwargs) # returning done as result marks this as async call if result is not done: # we have no async call done(result) except SystemExit: raise except KeyboardInterrupt: raise except Exception as e: traceback.print_exc() self.logger.warning("Trigger action %s(%s) failed: %s" % (action_name, str(kwargs), type(e).__name__+': '+ str(e))) result = { 'exception': type(e).__name__, 'message': str(e), 'time': time.time() } self.git.store_file( 'aetros/job/actions/' + str(action_name) + '/result/' + str(action_id) + '.json', simplejson.dumps(result, default=invalid_json_values) ) def file_list(self): """ Lists all files in the working directory. """ blacklist = ['.git', 'aetros'] working_tree = self.git.work_tree def recursive(path='.'): if os.path.basename(path) in blacklist: return 0, 0 if os.path.isdir(path): files = [] for file in os.listdir(path): if path and path != '.': file = path + '/' + file added_files = recursive(file) files += added_files return files else: if path.endswith('.pyc'): return [] if is_ignored(path, self.config['ignore']): return [] return [os.path.relpath(path, working_tree)] return recursive(working_tree) def add_files(self, working_tree, report=False): """ Commits all files from limited in aetros.yml. `files` is a whitelist, `exclude_files` is a blacklist. If both are empty, we commit all files smaller than 10MB. :return: """ blacklist = ['.git'] def add_resursiv(path = '.', report=report): if os.path.basename(path) in blacklist: return 0, 0 if working_tree + '/aetros' == path: # ignore in work_tree the folder ./aetros/, as it could be # that we checked out a job and start it again. return 0, 0 if os.path.isdir(path): files = 0 size = 0 for file in os.listdir(path): if path and path != '.': file = path + '/' + file added_files, added_size = add_resursiv(file) files += added_files size += added_size return files, size else: if path.endswith('.pyc'): return 0, 0 relative_path = os.path.relpath(path, working_tree) if is_ignored(relative_path, self.config['ignore']): return 0, 0 self.logger.debug("added file to job " + relative_path) if report: print("Added job file: " + relative_path) self.git.add_file_path_in_work_tree(path, working_tree, verbose=False) return 1, os.path.getsize(path) return add_resursiv(working_tree, report=report) def add_embedding_word2vec(self, x, path, dimensions=None, header_with_dimensions=True): """ Parse the word2vec file and extracts vectors as bytes and labels as TSV file. The format is simple: It's a UTF-8 encoded file, each word + vectors separated by new line. Vector is space separated. At the very first line might be dimensions, given as space separated value. Line 1: 2 4\n Line 2: word 200.3 4004.4 34.2 22.3\n Line 3: word2 20.0 4.4 4.2 0.022\n and so on For performance reasons, you should prefer add_embedding_path(). """ if path.endswith('.txt'): if not os.path.exists(path): raise Exception("Given word2vec file does not exist: " + path) f = open(path, 'r') if not header_with_dimensions and not dimensions: raise Exception('Either the word2vec file should contain the dimensions as header or it needs to be' 'specified manually using dimensions=[x,y] argument.') if header_with_dimensions: line = f.readline() if ' ' not in line: raise Exception('Given word2vec file should have in first line the dimensions, e.g.: 1000 200') dimensions = np.fromstring(line, dtype=np.uint, sep=' ').tolist() labels = '' vectors = '' line_pos = 1 if header_with_dimensions else 0 if len(dimensions) != 2: raise Exception('dimensions invalid shape. e.g. [200, 32] => 200 rows, 32 cols.') for line in iter(f.readline, ''): line_pos += 1 space_pos = line.find(' ') if -1 == space_pos: message = 'Given word2vec does not have correct format in line ' + str(line_pos) message += '\nGot: ' + str(line) raise Exception(message) labels += line[:space_pos] + '\n' vectors += line[space_pos+1:] + ' ' vectors = np.fromstring(vectors, dtype=np.float32, sep=' ').tobytes() else: raise Exception("Given word2vec is not a .txt file. Other file formats are not supported.") info = { 'dimensions': dimensions } name = os.path.basename(path) self._ensure_insight(x) remote_path = 'aetros/job/insight/'+str(x)+'/embedding/' with self.git.batch_commit('INSIGHT_EMBEDDING ' + str(x)): self.git.commit_file('WORD2VEC', remote_path + name + '/tensor.bytes', vectors) self.git.commit_file('WORD2VEC', remote_path + name + '/metadata.tsv', labels) self.git.commit_file('WORD2VEC', remote_path + name + '/info.json', simplejson.dumps(info)) def add_embedding_path(self, x, dimensions, vectors_path, metadata=None, image_shape=None, image=None): """ Adds a new embedding with optional metadata. Example how to generate vectors based on 2D numpy array: # 4 vectors, each size of 3 vectors = [ [2.3, 4.0, 33], [2.4, 4.2, 44], [2.5, 3.9, 34], [5.5, 200.2, 66] ] metadata = [ # header, only necessary when more then on column # can be anything. ['label', 'count'], # for each vector from above an entry. ['red', '123'], ['white', '143'], ['yellow', '344'], ['house', '24'], ] numpy.array(vectors, dtype=numpy.float32).tofile('vectors.bytes') numpy.savetxt('metadata.tsv', numpy.array(metadata), delimiter='\t', fmt='%s') job.add_embedding_path([4, 3], 'vectors.bytes', 'metadata.tsv') Metadata format example: Label\tCount\n red\t4\n yellow\t6\n :param x: The x axis of the insights. :param dimensions: 2D List of dimension, e.g [200, 20], means 200 vectors and each vector has size of 20. :param vectors_path: A path to a floats64 bytes file, no separators, sum(dimensions)*floats64 long. Example: If dimensions [200, 20] then the tensor file has 200*20 float32 bytes in it :param metadata: A TSV file. If only one column long (=no tab separator per line), then there's no need for a header. If you have more than one column, use the first line as header. :param image_shape: Size of the image of each vector. :param image: Path to an image sprite. :return: """ if not os.path.exists(vectors_path): raise Exception("Given embedding vectors file does not exist: " + vectors_path) if metadata and not os.path.exists(metadata): raise Exception("Given embedding metadata file does not exist: " + metadata) name = os.path.basename(vectors_path) self._ensure_insight(x) remote_path = 'aetros/job/insight/'+str(x)+'/embedding/' info = { 'dimensions': dimensions, 'image_shape': image_shape, 'image': os.path.basename(image) if image else None, } with self.git.lock_write(): self.git.add_file_path(remote_path + name + '/tensor.bytes', vectors_path) self.git.add_file_path(remote_path + name + '/metadata.tsv', metadata) self.git.add_file(remote_path + name + '/info.json', simplejson.dumps(info)) if image: self.git.add_file(remote_path + name + '/' + os.path.basename(image), image) self.git.commit_index('INSIGHT_EMBEDDING ' + str(x)) def add_insight_image_path(self, x, path, name=None, label=None): image = PIL.Image.open(path) if not name: name = os.path.basename(path) return self.add_insight_image(x, JobImage(name, image, label)) def add_insight_image(self, x, image): self.add_insight_images(x, [image]) def add_insight_images(self, x, images): converted_images = [] self._ensure_insight(x) if x not in self.insight_images_info: self.insight_images_info[x] = {} for image in images: if not isinstance(image, JobImage): raise Exception('job_add_insight only accepts JobImage instances in images argument') if image.id in self.insight_images_info[x]: continue converted_images.append({ 'id': image.id, 'image': self.pil_image_to_jpeg(image.image) }) self.insight_images_info[x][image.id] = { 'file': image.id+'.jpg', 'label': image.label, 'pos': image.pos } with self.git.batch_commit('INSIGHT_IMAGES ' + str(x)): for image in converted_images: remote_path = 'aetros/job/insight/'+str(x)+'/image/'+image['id']+'.jpg' self.git.commit_file('IMAGE ' + str(image['id']), remote_path, image['image']) remote_path = 'aetros/job/insight/' + str(x) + '/info.json' self.git.commit_file('IMAGE INFO', remote_path, simplejson.dumps(self.insight_images_info[x])) def add_insight_confusion_matrix(self, x, confusion_matrix): self._ensure_insight(x) remote_path = 'aetros/job/insight/' + str(x) + '/confusion_matrix.json' self.git.commit_file('INSIGHT CONFUSION_MATRIX ' + str(x), remote_path, simplejson.dumps(confusion_matrix)) def job_add_insight(self, x, images=None, confusion_matrix=None): if images: self.add_insight_images(x, images) if confusion_matrix: self.add_insight_confusion_matrix(x, confusion_matrix) def _ensure_insight(self, x): if x in self.insight_created: return self.insight_created.append(x) remote_path = 'aetros/job/insight/' + str(x) + '/created' self.git.commit_file('WORD2VEC ' + str(x), remote_path, str(time.time())) def pil_image_to_jpeg(self, image): buffer = six.BytesIO() image.save(buffer, format="JPEG", optimize=True, quality=70) return buffer.getvalue() def collect_environment(self, overwrite_variables=None): import socket import os import pip import platform env = {} if not overwrite_variables: overwrite_variables = {} import aetros env['aetros_version'] = aetros.__version__ env['python_version'] = platform.python_version() env['python_executable'] = sys.executable env['hostname'] = socket.gethostname() env['variables'] = dict(os.environ) env['variables'].update(overwrite_variables) if 'AETROS_SSH_KEY' in env['variables']: del env['variables']['AETROS_SSH_KEY'] if 'AETROS_SSH_KEY_BASE64' in env['variables']: del env['variables']['AETROS_SSH_KEY_BASE64'] env['pip_packages'] = sorted([[i.key, i.version] for i in pip.get_installed_distributions()]) self.set_system_info('environment', env) def collect_device_information(self, gpu_ids): import aetros.cuda_gpu try: if gpu_ids: self.set_system_info('cuda_version', aetros.cuda_gpu.get_version()) gpus = {} for gpu_id, gpu in enumerate(aetros.cuda_gpu.get_ordered_devices()): if gpu_id in gpu_ids: gpus[gpu_id] = gpu self.set_system_info('gpus', gpus) except CudaNotImplementedException: self.logger.warning("Could not collect GPU/CUDA system information.") if self.get_job_model().has_dpu(): self.set_system_info('dpus', [{'memory': 64*1024*1024*1024}]) def collect_system_information(self): import psutil mem = psutil.virtual_memory() with self.git.batch_commit('JOB_SYSTEM_INFORMATION'): self.set_system_info('memory_total', mem.total) import cpuinfo cpu = cpuinfo.get_cpu_info() self.set_system_info('cpu_name', cpu['brand']) self.set_system_info('cpu', [cpu['hz_actual_raw'][0], cpu['count']]) stdout_api_channels = {} def handle_stdout_api(self, data): action = data['aetros'] del data['aetros'] def validate_action(requires_attributes): for attr in requires_attributes: if attr not in data: raise StdoutApiException("AETROS stdout API call %s requires value for '%s'. " % (action, attr)) return True def failed(message): raise StdoutApiException( "AETROS stdout API call %s failed: %s Following ignored: %s" % (action, message, str(data))) def default(attr, default=None): return data[attr] if attr in data else default if action == 'progress': self.progress(**data) return True if action == 'epoch': self.epoch(**data) return True if action == 'batch': if validate_action(['batch', 'total', 'size']): self.batch(**data) return True if action == 'step': if validate_action(['step', 'total']): self.step(**data) return True if action == 'sample': if validate_action(['sample', 'total']): self.sample(**data) return True if action == 'info': if validate_action(['name', 'value']): self.set_info(**data) return True if action == 'status': if validate_action(['status']): self.set_status(**data) return True if action == 'speed': if validate_action(['x', 'speed']): self.report_speed(**data) return True if action == 'add_embedding_word2vec': if validate_action(['x', 'path']): self.add_embedding_word2vec(**data) return True if action == 'add_embedding_path': if validate_action(['x', 'dimensions', 'vectors_path']): self.add_embedding_path(**data) return True if action == 'add_insight_image': if validate_action(['x', 'path']): self.add_insight_image_path(**data) return True if action == 'create-channel': if validate_action(['name']): if data['name'] in self.stdout_api_channels: failed("Channel %s already defined. " % (data['name'], )) else: self.stdout_api_channels[data['name']] = self.create_channel(**data) return True if action == 'channel': if validate_action(['name', 'x', 'y']): if data['name'] not in self.stdout_api_channels: self.stdout_api_channels[data['name']] = self.create_channel(data['name']) self.stdout_api_channels[data['name']].send(data['x'], data['y']) return True if action == 'loss': if validate_action(['x', 'training', 'validation']): if 'loss' not in self.stdout_api_channels: self.stdout_api_channels['loss'] = self.create_loss_channel('loss') self.stdout_api_channels['loss'].send(data['x'], data['training'], data['validation']) return True # if action == 'insight': # if validate_action(['x']): # # self.job_add_insight(data['x']) # return True if action == 'abort': self.abort() if action == 'fail': self.fail(default('message')) return False