#!/usr/bin/env python
from __future__ import print_function
from multiprocessing import cpu_count
import baseDriver
import time
import os
import parameterParser
import django_initial
try:
    from HTMLParser import HTMLParser
except ImportError:
    from html.parser import HTMLParser
import checkPoint
from ui.models import Queue, Protocol, References, Training
import threading
import subprocess
import psutil

settings = baseDriver.get_all_config()
CPU_POOL, MEMORY_POOL, DISK_POOL, VRT_POOL = baseDriver.get_init_resource()
MAX_JOB = cpu_count()
JOB_TABLE = dict()
USER_REFERENCES = dict()
JOB_PARAMETERS = dict()
JOB_COMMAND = dict()
RUN_PARAMETERS = dict()
JOB_INPUT_FILES = dict()
LAST_OUTPUT_STRING = dict()
OUTPUTS = dict()
OUTPUT_DICT = dict()
OUTPUT_DICT_SUFFIX = dict()
NEW_FILES = dict()
LAST_OUTPUT = dict()
LAST_OUTPUT_SUFFIX = dict()
INPUT_SIZE = dict()
OUTPUT_SIZE = dict()
FOLDER_SIZE_BEFORE = dict()
CUMULATIVE_OUTPUT_SIZE = dict()
USER_MAIL_DICT = dict()
RESOURCES = dict()
LATEST_JOB_ID = 0
LATEST_JOB_STEP = 0
RUNNING_JOBS = 0
root_path = os.path.split(os.path.realpath(__file__))[0]


def get_steps(protocol_id):
    """
    Get steps of a protocol
    :param protocol_id: int, protocol id
    :return: list, list of unresolved steps
    """
    step_list = []
    steps = Protocol.objects.filter(parent=protocol_id).order_by('step_order')
    html_parser = HTMLParser()
    workspace_path = settings['env']['workspace']
    for index, step in enumerate(steps):
        # a self-compiled tool in the user's bin folder takes priority
        software_path = os.path.join(workspace_path, str(step.user_id), 'bin', str(step.software))
        if os.path.exists(software_path) and os.path.isfile(software_path):
            step.software = software_path
        step_list.append({
            'id': index,
            'parameter': html_parser.unescape(str(step.software).rstrip() + " " + str(step.parameter)),
            'specify_output': step.specify_output,
            'hash': step.hash,
            'env': step.env,
            'force_local': step.force_local,
        })
    return step_list


def get_user_mail(user):
    """
    Get user email address
    :param user: int, user id
    :return: string, email address, or '' when no record is found
    """
    global USER_MAIL_DICT
    if user in USER_MAIL_DICT.keys():
        return USER_MAIL_DICT[user]
    else:
        try:
            from django.contrib.auth.models import User
            user_obj = User.objects.get(id=int(user))
            USER_MAIL_DICT[user] = user_obj.email
            return user_obj.email
        except:
            return ''


def get_user_reference(user):
    """
    Get user references
    :param user: int, user id
    :return: None, the references are stored directly into the USER_REFERENCES dictionary
    """
    references = References.objects.filter(user_id=user)
    for reference in references:
        if user not in USER_REFERENCES.keys():
            USER_REFERENCES[user] = dict()
        USER_REFERENCES[user][reference.name] = reference.path


def create_user_folder(uf, jf):
    """
    Create user folder
    :param uf: string, path to the user's folder
    :param jf: string, path to the job's folder
    :return: None
    """
    try:
        if not os.path.exists(uf):
            os.mkdir(uf)
        if not os.path.exists(jf):
            os.mkdir(jf)
    except Exception as e:
        print(e)


def prepare_workspace(resume, run_folder, job_id, user_id, result=''):
    """
    Build path info for the execution of a job
    :param resume: int, if resume is not -1 and a result folder name is given,
                   BioQueue reuses the old folder instead of creating a new one
    :param run_folder: string, parent folder
    :param job_id: int, job id
    :param user_id: int, user id
    :param result: string, folder name for the job
    :return: tuple, path to user folder and job folder
    """
    if result == '' or resume == -1:
        result_store = baseDriver.rand_sig() + str(job_id)
        user_folder = os.path.join(run_folder, str(user_id))
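        # Resulting layout (assumed): <run_folder>/<user_id>/<random signature + job id>/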
        run_folder = os.path.join(user_folder, result_store)
        create_user_folder(user_folder, run_folder)
        # baseDriver.update(settings['datasets']['job_db'], job_id, 'result', result_store)
        try:
            job_record = Queue.objects.get(id=job_id)
            job_record.set_result(result_store)
        except:
            pass
    else:
        result_store = result
        user_folder = os.path.join(run_folder, str(user_id))
        run_folder = os.path.join(user_folder, result_store)
    return user_folder, run_folder


def initialize_job_parameters(job_parameter, input_file, user, job_id):
    """
    Parse reference and job parameter (special parameters and input files)
    :param job_parameter: list, job parameter
    :param input_file: list, input files
    :param user: int, user id
    :param job_id: int, job id
    :return: None
    """
    JOB_PARAMETERS[job_id] = parameterParser.build_special_parameter_dict(job_parameter)
    get_user_reference(user)
    if user in USER_REFERENCES.keys():
        JOB_PARAMETERS[job_id] = dict(JOB_PARAMETERS[job_id], **USER_REFERENCES[user])
    JOB_INPUT_FILES[job_id] = input_file.split(';')
    INPUT_SIZE[job_id] = 0


def get_job(max_fetch=1):
    """
    Get job information and store it into JOB_TABLE
    :param max_fetch: int, the number of records to fetch
    :return: None
    """
    global JOB_TABLE, OUTPUT_DICT, LAST_OUTPUT, CUMULATIVE_OUTPUT_SIZE, OUTPUT_DICT_SUFFIX, LAST_OUTPUT_SUFFIX
    # fetch jobs
    jobs = Queue.objects.filter(status=0)[:max_fetch]
    for job in jobs:
        if job.id in JOB_TABLE.keys():
            continue
        else:
            user_folder, job_folder = prepare_workspace(job.resume, job.run_dir, job.id, job.user_id, job.result)
            initialize_job_parameters(job.parameter, job.input_file, job.user_id, job.id)
            JOB_TABLE[job.id] = {
                'protocol': job.protocol_id,
                'input_file': job.input_file,
                'parameter': job.parameter,
                'run_dir': job.run_dir,
                'result': job.result,
                'status': job.status,
                'user_id': job.user_id,
                'resume': job.resume,
                'steps': get_steps(job.protocol_id),
                'user_folder': user_folder,
                'job_folder': job_folder,
                'wait_for': 0,
                'name': job.job_name,
            }
            OUTPUT_DICT[job.id] = dict()
            OUTPUT_DICT_SUFFIX[job.id] = dict()
            LAST_OUTPUT[job.id] = []
            LAST_OUTPUT_SUFFIX[job.id] = dict()
            CUMULATIVE_OUTPUT_SIZE[job.id] = 0


def get_training_items(step_hash):
    """
    Get the number of training items for a step
    :param step_hash: string, step hash
    :return: int, the number of training items
    """
    trainings = Training.objects.filter(step=step_hash, lock=0)
    return len(trainings)


def create_machine_learning_item(step_hash, input_size):
    """
    Create a machine learning item and store it into the training table
    :param step_hash: string
    :param input_size: int
    :return: record id
    """
    training = Training(step=step_hash, input=input_size, lock=1)
    training.save()
    return training.id


def update_resource_pool(resource_dict, direction=1):
    """
    Release resources into (direction=1) or deduct them from (direction=-1) the global pools
    :param resource_dict: dict, may contain 'cpu', 'mem', 'disk' and 'vrt_mem' keys
    :param direction: int, 1 to release resources, -1 to occupy them
    :return: tuple, the new CPU, memory, disk and virtual memory pool levels
    """
    global CPU_POOL, MEMORY_POOL, DISK_POOL, VRT_POOL
    if 'cpu' in resource_dict.keys() and resource_dict['cpu'] is not None:
        CPU_POOL += direction * resource_dict['cpu']
    if 'mem' in resource_dict.keys() and resource_dict['mem'] is not None:
        MEMORY_POOL += direction * resource_dict['mem']
    if 'disk' in resource_dict.keys() and resource_dict['disk'] is not None:
        DISK_POOL += direction * resource_dict['disk']
    if 'vrt_mem' in resource_dict.keys() and resource_dict['vrt_mem'] is not None:
        VRT_POOL += direction * resource_dict['vrt_mem']
    return CPU_POOL, MEMORY_POOL, DISK_POOL, VRT_POOL


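# Illustrative usage (values assumed). Before a step runs,
#     update_resource_pool({'cpu': 200, 'mem': 2 * 1024 ** 3}, direction=-1)
# deducts 200 CPU units (per-core percentage points, i.e. about two cores)
# and 2 GB from the pools; calling it again with the same dict and the default
# direction=1 after the step finishes gives the resources back.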
def finish_job(job_id, error=0):
    """
    Mark a job as finished and release the resources it occupied.
    If mail notification is switched on, an e-mail is sent.
    :param job_id: int, job id
    :param error: int, should be 1 if an error occurred
    :return: None
    """
    global DISK_POOL, JOB_TABLE, NEW_FILES, OUTPUTS, OUTPUT_DICT, \
        OUTPUT_SIZE, FOLDER_SIZE_BEFORE, CUMULATIVE_OUTPUT_SIZE, \
        LAST_OUTPUT_STRING, LAST_OUTPUT_SUFFIX, OUTPUT_DICT_SUFFIX
    if job_id in JOB_TABLE.keys():
        if error == 1:
            if settings['mail']['notify'] == 'on':
                try:
                    from notify import MailNotify
                    mail = MailNotify(JOB_TABLE[job_id]['user_id'], 2, job_id, JOB_TABLE[job_id]['protocol'],
                                      JOB_TABLE[job_id]['input_file'], JOB_TABLE[job_id]['parameter'])
                    mail.send_mail(mail.get_user_mail_address(JOB_TABLE[job_id]['user_id']))
                except Exception as e:
                    print(e)
        else:
            try:
                job = Queue.objects.get(id=job_id)
                job.status = -1
                job.save()
            except:
                pass
            baseDriver.del_output_dict(job_id)
            if settings['mail']['notify'] == 'on':
                try:
                    from notify import MailNotify
                    mail = MailNotify(JOB_TABLE[job_id]['user_id'], 1, job_id, JOB_TABLE[job_id]['protocol'],
                                      JOB_TABLE[job_id]['input_file'], JOB_TABLE[job_id]['parameter'])
                    mail.send_mail(mail.get_user_mail_address(JOB_TABLE[job_id]['user_id']))
                except Exception as e:
                    print(e)
    if job_id in JOB_TABLE.keys():
        resume = JOB_TABLE[job_id]['resume']
        res_key = str(job_id) + '_' + str(resume + 1)
        if res_key in RESOURCES.keys():
            RESOURCES.pop(res_key)
        DISK_POOL += CUMULATIVE_OUTPUT_SIZE[job_id] - baseDriver.get_folder_size(JOB_TABLE[job_id]['job_folder'])
        JOB_TABLE.pop(job_id)
    if job_id in OUTPUTS.keys():
        OUTPUTS.pop(job_id)
    if job_id in OUTPUT_DICT.keys():
        OUTPUT_DICT.pop(job_id)
    if job_id in LAST_OUTPUT.keys():
        LAST_OUTPUT.pop(job_id)
    if job_id in LAST_OUTPUT_STRING.keys():
        LAST_OUTPUT_STRING.pop(job_id)
    if job_id in CUMULATIVE_OUTPUT_SIZE.keys():
        CUMULATIVE_OUTPUT_SIZE.pop(job_id)
    if job_id in LAST_OUTPUT_SUFFIX.keys():
        LAST_OUTPUT_SUFFIX.pop(job_id)
    if job_id in OUTPUT_DICT_SUFFIX.keys():
        OUTPUT_DICT_SUFFIX.pop(job_id)


def run_prepare(job_id, job, no_new_learn=0):
    """
    Parse the step's parameter and predict the resources needed by the step
    :param job_id: int, job id
    :param job: dict, job dict
    :param no_new_learn: int, 1 means refusing to create a new training item
    :return: None when all steps have finished, 'running' when a step is
             already running, otherwise a dict of predicted resources
    """
    global LAST_OUTPUT_STRING, OUTPUTS, OUTPUT_DICT, OUTPUT_DICT_SUFFIX, NEW_FILES, LAST_OUTPUT
    learning = 0
    outside_size = 0
    if job['status'] == -1 and job['resume'] != -1:
        # skip and resume
        tmp_dict = baseDriver.load_output_dict(job_id)
        if 'LAST_OUTPUT_STRING' in tmp_dict.keys():
            LAST_OUTPUT_STRING[job_id] = tmp_dict['LAST_OUTPUT_STRING']
        if 'OUTPUTS' in tmp_dict.keys():
            OUTPUTS[job_id] = tmp_dict['OUTPUTS']
        if 'OUTPUT_DICT' in tmp_dict.keys():
            OUTPUT_DICT[job_id] = tmp_dict['OUTPUT_DICT']
        if 'OUTPUT_DICT_SUFFIX' in tmp_dict.keys():
            OUTPUT_DICT_SUFFIX[job_id] = tmp_dict['OUTPUT_DICT_SUFFIX']
        if 'NEW_FILES' in tmp_dict.keys():
            NEW_FILES[job_id] = tmp_dict['NEW_FILES']
        if 'LAST_OUTPUT' in tmp_dict.keys():
            LAST_OUTPUT[job_id] = tmp_dict['LAST_OUTPUT']
        if 'LAST_OUTPUT_SUFFIX' in tmp_dict.keys():
            LAST_OUTPUT_SUFFIX[job_id] = tmp_dict['LAST_OUTPUT_SUFFIX']
    ve = None
    if (job['resume'] + 1) == len(job['steps']):
        return None
    elif job['status'] > 0:
        return 'running'
    else:
        step = job['steps'][job['resume'] + 1]['parameter']
        ve = job['steps'][job['resume'] + 1]['env']
        step = step.replace('{Job}', str(job_id))
        step = step.replace('{JobName}', str(JOB_TABLE[job_id]['name']))
        if job_id in LAST_OUTPUT_STRING.keys():
            step = step.replace('{LastOutput}', LAST_OUTPUT_STRING[job_id])
        if job_id in OUTPUTS.keys():
            step = step.replace('{AllOutputBefore}', ' '.join(OUTPUTS[job_id]))
        if job_id in NEW_FILES.keys():
            step = parameterParser.last_output_map(step, NEW_FILES[job_id])
        if job_id in JOB_PARAMETERS.keys():
            step = parameterParser.special_parameter_map(step, JOB_PARAMETERS[job_id])
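        # Illustrative substitution (paths and tool assumed): a step template like
        #     'samtools sort {LastOutput} -o {Job}_sorted.bam'
        # becomes, for job 42,
        #     'samtools sort /workspace/1/ab12cd42/aln.bam -o 42_sorted.bam'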
        if job_id in OUTPUT_DICT.keys():
            step = parameterParser.output_file_map(step, OUTPUT_DICT[job_id])
        if job_id in JOB_INPUT_FILES.keys():
            step, outside_size = parameterParser.input_file_map(step, JOB_INPUT_FILES[job_id], job['user_folder'])
        if job_id in LAST_OUTPUT_SUFFIX.keys() and job_id in OUTPUT_DICT_SUFFIX.keys():
            step = parameterParser.suffix_map(step, OUTPUT_DICT_SUFFIX[job_id], LAST_OUTPUT_SUFFIX[job_id])
        step = parameterParser.history_map(step, job['user_id'], job['user_folder'], Queue)
        step, outside_size_upload = parameterParser.upload_file_map(step, job['user_folder'])
        outside_size += outside_size_upload
        step = step.replace('{Workspace}', job['job_folder'])
        user_bin_dir = os.path.join(settings['env']['workspace'], str(job['user_id']), 'bin')
        if not os.path.exists(user_bin_dir):
            try:
                os.makedirs(user_bin_dir)
            except:
                pass
        step = step.replace('{UserBin}', user_bin_dir)
        if settings['cluster']['type']:
            if 'cpu' in settings['cluster'].keys() and settings['cluster']['cpu']:
                step = step.replace('{ThreadN}', str(settings['cluster']['cpu']))
            else:
                step = step.replace('{ThreadN}', str(settings['env']['cpu']))
        else:
            step = step.replace('{ThreadN}', str(settings['env']['cpu']))
        # support for virtual environments
        if ve is not None:
            step = "source activate " + ve.value + " && " + step
            step += " && source deactivate"
        JOB_COMMAND[job_id] = parameterParser.parameter_string_to_list(step)
        LAST_OUTPUT[job_id] = baseDriver.get_folder_content(job['job_folder'])
        training_num = get_training_items(job['steps'][job['resume'] + 1]['hash'])
        if training_num < 10:
            learning = 1
            if INPUT_SIZE[job_id] == 0:
                INPUT_SIZE[job_id] = baseDriver.get_folder_size(job['job_folder'])
        else:
            if job_id in OUTPUT_SIZE.keys():
                INPUT_SIZE[job_id] = OUTPUT_SIZE[job_id]
            else:
                INPUT_SIZE[job_id] = 0
        FOLDER_SIZE_BEFORE[job_id] = baseDriver.get_folder_size(job['job_folder'])
        INPUT_SIZE[job_id] += outside_size
        resource_needed = checkPoint.predict_resource_needed(job['steps'][job['resume'] + 1]['hash'],
                                                             INPUT_SIZE[job_id], training_num)
        if resource_needed['cpu'] > int(settings['env']['cpu']) * 100:
            resource_needed['cpu'] = int(settings['env']['cpu']) * 95
        # if resource_needed['mem'] >
        if learning == 1 and no_new_learn == 0:
            trace_id = create_machine_learning_item(job['steps'][job['resume'] + 1]['hash'], INPUT_SIZE[job_id])
            resource_needed['trace'] = trace_id
        return resource_needed


def forecast_step(job_id, step_order, resources):
    """
    Check resources before the running of a step and mark the step as running
    :param job_id: int, job id
    :param step_order: int, step order
    :param resources: dictionary, resources required by the step
    :return: False when system resources are not enough for the step, True otherwise
    """
    global JOB_TABLE
    rollback = 0
    if settings['cluster']['type'] == '':
        # local mode only: a cluster scheduler manages its own resources
        new_cpu, new_mem, new_disk, new_vrt_mem = update_resource_pool(resources, -1)
        if new_cpu < 0 or new_mem < 0 or new_disk < 0 or new_vrt_mem < 0:
            rollback = 1
    if not rollback:
        try:
            job = Queue.objects.get(id=job_id)
            job.set_status(step_order + 1)
        except:
            pass
        JOB_TABLE[job_id]['status'] = step_order + 1
        return True
    else:
        update_resource_pool(resources)
        return False


def build_suffix_dict(output_files):
    """
    Build a suffix dictionary for output files
    :param output_files: list, output files
    :return: dict, suffix dictionary
    """
    suffix_dict = dict()
    for output_file in output_files:
        _, suffix = os.path.splitext(output_file)
        suffix = suffix.replace('.', '')
        if suffix in suffix_dict.keys():
            suffix_dict[suffix].append(output_file)
        else:
            suffix_dict[suffix] = [output_file]
    return suffix_dict


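# Illustrative example (file names assumed):
#     build_suffix_dict(['a.bam', 'b.bam', 'variants.vcf'])
# returns
#     {'bam': ['a.bam', 'b.bam'], 'vcf': ['variants.vcf']}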
def finish_step(job_id, step_order, resources):
    """
    Mark a step as finished
    :param job_id: int, job id
    :param step_order: int, step order
    :param resources: dictionary, resources required by the step
    :return: None
    """
    global JOB_TABLE, NEW_FILES, OUTPUTS, OUTPUT_DICT, OUTPUT_SIZE, FOLDER_SIZE_BEFORE, \
        CUMULATIVE_OUTPUT_SIZE, LAST_OUTPUT_STRING
    try:
        job = Queue.objects.get(id=job_id)
        job.resume = step_order
        job.status = -2
        job.save()
        JOB_TABLE[job_id]['status'] = -2
        JOB_TABLE[job_id]['resume'] = step_order
        this_output = baseDriver.get_folder_content(JOB_TABLE[job_id]['job_folder'])
        NEW_FILES[job_id] = sorted(list(set(this_output).difference(set(LAST_OUTPUT[job_id]))))
        NEW_FILES[job_id] = [os.path.join(JOB_TABLE[job_id]['job_folder'], file_name)
                             for file_name in NEW_FILES[job_id]]
    except Exception as e:
        print(e)
    if job_id in OUTPUTS.keys():
        OUTPUTS[job_id].extend(NEW_FILES[job_id])
    else:
        OUTPUTS[job_id] = NEW_FILES[job_id]
    suffix_dict = build_suffix_dict(NEW_FILES[job_id])
    if job_id in OUTPUT_DICT.keys():
        OUTPUT_DICT[job_id][step_order + 1] = NEW_FILES[job_id]
    else:
        OUTPUT_DICT[job_id] = {step_order + 1: NEW_FILES[job_id]}
    if job_id in OUTPUT_DICT_SUFFIX.keys():
        OUTPUT_DICT_SUFFIX[job_id][step_order + 1] = suffix_dict
    else:
        OUTPUT_DICT_SUFFIX[job_id] = {step_order + 1: suffix_dict}
    LAST_OUTPUT_SUFFIX[job_id] = suffix_dict
    LAST_OUTPUT_STRING[job_id] = ' '.join(NEW_FILES[job_id])
    OUTPUT_SIZE[job_id] = baseDriver.get_folder_size(JOB_TABLE[job_id]['job_folder']) - FOLDER_SIZE_BEFORE[job_id]
    CUMULATIVE_OUTPUT_SIZE[job_id] += OUTPUT_SIZE[job_id]
    if 'trace' in resources.keys():
        training_item = Training.objects.get(id=resources['trace'])
        if training_item.cpu != '-' and training_item.mem != '-' \
                and training_item.cpu != '' and training_item.mem != '':
            training_item.output = OUTPUT_SIZE[job_id]
            training_item.lock = 0
            training_item.save()
    if settings['cluster']['type'] == '':
        update_resource_pool(resources)


def error_job(job_id, resources):
    """
    Handle a job in which an error occurred
    :param job_id: int, job id
    :param resources: dict, job resources
    :return: None
    """
    try:
        job = Queue.objects.get(id=job_id)
        job.status = -3
        job.ter = 0
        job.save()
    except:
        pass
    file_map = dict()
    if job_id in OUTPUT_DICT.keys():
        file_map['OUTPUT_DICT'] = OUTPUT_DICT[job_id]
    if job_id in LAST_OUTPUT_STRING.keys():
        file_map['LAST_OUTPUT_STRING'] = LAST_OUTPUT_STRING[job_id]
    if job_id in OUTPUT_DICT_SUFFIX.keys():
        file_map['OUTPUT_DICT_SUFFIX'] = OUTPUT_DICT_SUFFIX[job_id]
    if job_id in OUTPUTS.keys():
        file_map['OUTPUTS'] = OUTPUTS[job_id]
    if job_id in NEW_FILES.keys():
        file_map['NEW_FILES'] = NEW_FILES[job_id]
    if job_id in LAST_OUTPUT.keys():
        file_map['LAST_OUTPUT'] = LAST_OUTPUT[job_id]
    if job_id in LAST_OUTPUT_SUFFIX.keys():
        file_map['LAST_OUTPUT_SUFFIX'] = LAST_OUTPUT_SUFFIX[job_id]
    baseDriver.save_output_dict(file_map, job_id)
    update_resource_pool(resources)
    if 'trace' in resources.keys():
        try:
            training = Training.objects.get(id=resources['trace'])
            training.delete()
        except:
            pass
    finish_job(job_id, 1)


def kill_proc(proc):
    """
    Kill a process and its child processes
    :param proc: Process class defined in psutil
    :return: None
    """
    try:
        children = proc.children()
        for child in children:
            try:
                child.terminate()
            except:
                pass
        gone, still_alive = psutil.wait_procs(children, timeout=3)
        for p in still_alive:
            p.kill()
        proc.kill()
    except:
        pass


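# Illustrative behaviour of bytes_to_readable below (values assumed): the 1.1
# factor reserves roughly 10% headroom before rounding up, so
#     bytes_to_readable(3 * 1024 ** 3)   ->  '4GB'   (3 GB * 1.1 = 3.3, ceil -> 4)
#     bytes_to_readable(200 * 1024 ** 2) ->  '220MB'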
def bytes_to_readable(bytes_value):
    """
    Convert bytes to a readable form
    :param bytes_value: int, bytes
    :return: string, readable value, like 1GB
    """
    from math import ceil
    if bytes_value > 1073741824:
        # 1073741824 = 1024 * 1024 * 1024, bytes to gigabytes
        readable_value = str(int(ceil(bytes_value * 1.1 / 1073741824))) + 'GB'
    elif bytes_value > 1048576:
        # 1048576 = 1024 * 1024, bytes to megabytes
        readable_value = str(int(ceil(bytes_value * 1.1 / 1048576))) + 'MB'
    else:
        # bytes to kilobytes
        readable_value = str(int(ceil(bytes_value * 1.1 / 1024))) + 'KB'
    return readable_value


def run_step(job_desc, resources):
    """
    Run a step (parallel to the main thread)
    :param job_desc: string, job id + "_" + step order
    :param resources: dictionary, resources required by the step
    :return: None
    """
    global LATEST_JOB_ID, LATEST_JOB_STEP, CPU_POOL, MEMORY_POOL, DISK_POOL, RUNNING_JOBS
    items = job_desc.split('_')
    job_id = int(items[0])
    step_order = int(items[1])
    user_id = JOB_TABLE[job_id]['user_id']
    try:
        is_force_local = JOB_TABLE[job_id]['steps'][step_order]['force_local']
    except Exception as e:
        print(e)
        is_force_local = 0
    recheck = forecast_step(job_id, step_order, resources)
    log_file = os.path.join(settings["env"]["log"], str(job_id))
    if recheck is not True:
        return
    if 'feedback' in settings['env'].keys() and settings['env']['feedback'] == 'yes':
        try:
            from feedback import feedback
            feedback(JOB_COMMAND[job_id][0], ' '.join(JOB_COMMAND[job_id][1:]),
                     get_user_mail(JOB_TABLE[job_id]['user_id']))
        except:
            pass
    if settings['cluster']['type'] and not is_force_local:
        # run on a cluster
        import clusterSupport
        if resources['cpu'] is None:
            allocate_cpu = settings['cluster']['cpu']
        else:
            from math import ceil
            predict_cpu = int(ceil(round(resources['cpu']) / 100))
            if predict_cpu > settings['cluster']['cpu'] or predict_cpu == 0:
                allocate_cpu = settings['cluster']['cpu']
            else:
                allocate_cpu = predict_cpu
        if 'mem' not in resources.keys() or resources['mem'] is None:
            allocate_mem = settings['cluster']['mem']
        else:
            allocate_mem = bytes_to_readable(resources['mem'])
        if 'vrt_mem' not in resources.keys() or resources['vrt_mem'] is None:
            allocate_vrt = settings['cluster']['vrt']
        else:
            allocate_vrt = bytes_to_readable(resources['vrt_mem'])
        # baseDriver.update(settings['datasets']['job_db'], job_id, 'status', step_order + 1)
        try:
            job_record = Queue.objects.get(id=job_id)
            job_record.set_status(step_order + 1)
        except:
            pass
        if 'trace' in resources.keys():
            # collect training data for this step
            return_code = clusterSupport.main(settings['cluster']['type'], ' '.join(JOB_COMMAND[job_id]),
                                              job_id, step_order, allocate_cpu, allocate_mem, allocate_vrt,
                                              settings['cluster']['queue'], JOB_TABLE[job_id]['job_folder'],
                                              log_file, settings['cluster']['walltime'], 1, resources['trace'])
        else:
            return_code = clusterSupport.main(settings['cluster']['type'], ' '.join(JOB_COMMAND[job_id]),
                                              job_id, step_order, allocate_cpu, allocate_mem, allocate_vrt,
                                              settings['cluster']['queue'], JOB_TABLE[job_id]['job_folder'],
                                              log_file, settings['cluster']['walltime'])
        if return_code != 0:
            try:
                from feedback import feedback_error, get_error_log
                feedback_error(JOB_COMMAND[job_id][0], ' '.join(JOB_COMMAND[job_id][1:]),
                               get_error_log(log_file), get_user_mail(user_id))
            except:
                pass
            error_job(job_id, resources)
        else:
            # RUNNING_JOBS -= 1
            finish_step(job_id, step_order, resources)
    else:
        # run in a local or cloud environment
        print("Now run %s" % job_desc)
        print(CPU_POOL, MEMORY_POOL, DISK_POOL)
        try:
            log_file_handler = open(log_file, "a")
            RUNNING_JOBS += 1
            true_shell = baseDriver.check_shell_sig(JOB_COMMAND[job_id])
            if true_shell:
                # the command needs a shell (pipes, redirection, etc.)
                step_process = subprocess.Popen(' '.join(JOB_COMMAND[job_id]), shell=True,
                                                cwd=JOB_TABLE[job_id]['job_folder'])
            else:
                step_process = subprocess.Popen(JOB_COMMAND[job_id], shell=False, stdout=log_file_handler,
                                                stderr=log_file_handler, cwd=JOB_TABLE[job_id]['job_folder'])
            # step_process = subprocess.Popen(JOB_COMMAND[job_id], shell=False, stdout=log_file_handler,
            #                                 stderr=log_file_handler, cwd=JOB_TABLE[job_id]['job_folder'])
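            # The loop below polls the child every 30 seconds; when the user sets
            # the 'ter' flag from the web UI, the whole process tree is killed
            # and the job is marked as an error.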
            process_id = step_process.pid
            if 'trace' in resources.keys():
                learn_process = subprocess.Popen(["python", os.path.join(root_path, 'mlCollector.py'),
                                                  "-p", str(step_process.pid),
                                                  "-n", str(JOB_TABLE[job_id]['steps'][step_order]['hash']),
                                                  "-j", str(resources['trace'])],
                                                 shell=False, stdout=None, stderr=subprocess.STDOUT)
            while step_process.poll() is None:
                if process_id in psutil.pids():
                    proc_info = psutil.Process(process_id)
                    if proc_info.is_running():
                        job = Queue.objects.get(id=job_id)
                        if job.ter:
                            job.status = -3
                            job.ter = 0
                            job.save()
                            # proc_info.kill()
                            kill_proc(proc_info)
                            # step_process.kill()
                            error_job(job_id, resources)
                            RUNNING_JOBS -= 1
                            return None
                time.sleep(30)
            log_file_handler.close()
            print("Now job %s finished." % job_desc)
            # finish_step(job_id, step_order, resources)
            JOB_TABLE[job_id]['resume'] = step_order
            if step_process.returncode != 0:
                RUNNING_JOBS -= 1
                error_job(job_id, resources)
            else:
                RUNNING_JOBS -= 1
                finish_step(job_id, step_order, resources)
            if job_id > LATEST_JOB_ID and (step_order + 1) > LATEST_JOB_STEP:
                LATEST_JOB_ID = job_id
                LATEST_JOB_STEP = step_order
        except Exception as e:
            print(e)
            try:
                from feedback import feedback_error, get_error_log
                feedback_error(JOB_COMMAND[job_id][0], ' '.join(JOB_COMMAND[job_id][1:]),
                               get_error_log(log_file), get_user_mail(user_id))
            except:
                pass
            RUNNING_JOBS -= 1
            error_job(job_id, resources)


def set_checkpoint_info(job_id, cause):
    """
    Interact with the frontend for a checkpoint
    :param job_id: int, job id
    :param cause: int, cause of the suspension
    :return: None
    """
    global JOB_TABLE
    try:
        if JOB_TABLE[job_id]['wait_for'] != cause:
            JOB_TABLE[job_id]['wait_for'] = cause
            job = Queue.objects.get(id=job_id)
            job.wait_for = cause
            job.status = -2
            job.save()
    except:
        pass


def reset_status():
    """
    Reset dead jobs
    :return: 0/1
    """
    try:
        Queue.objects.filter(status__gt=0).update(status=-3)
        return 1
    except:
        return 0


def main():
    global LATEST_JOB_ID, LATEST_JOB_STEP, RESOURCES
    reset_status()
    while True:
        try:
            cpu_indeed = baseDriver.get_cpu_available()
            mem_indeed, vrt_indeed = baseDriver.get_memo_usage_available()
            disk_indeed = baseDriver.get_disk_free(settings['env']['workspace'])
            get_job(MAX_JOB - len(JOB_TABLE))
            sorted_job_info = sorted(JOB_TABLE.keys())
            for job_id in sorted_job_info:
                previous_step = str(job_id) + '_' + str(JOB_TABLE[job_id]['resume'])
                now_step = str(job_id) + '_' + str(JOB_TABLE[job_id]['resume'] + 1)
                if previous_step in RESOURCES.keys():
                    RESOURCES.pop(previous_step)
                if now_step in RESOURCES.keys():
                    if RESOURCES[now_step]['cpu'] is None \
                            and RESOURCES[now_step]['mem'] is None \
                            and RESOURCES[now_step]['disk'] is None:
                        resource = run_prepare(job_id, JOB_TABLE[job_id], 1)
                        # resource can be None when the job has just finished all steps
                        if 'trace' in RESOURCES[now_step].keys() and resource is not None and resource != 'running':
                            resource['trace'] = RESOURCES[now_step]['trace']
                    else:
                        continue
                else:
                    resource = run_prepare(job_id, JOB_TABLE[job_id])
                if resource is None:
                    finish_job(job_id)
                    continue
                elif resource == 'running':
                    continue
                else:
                    RESOURCES[now_step] = resource
            biggest_cpu = None
            biggest_mem = None
            biggest_id = None
            # biggest_vrt_mem = None
            sorted_resources_info = sorted(RESOURCES.keys())
            if settings['cluster']['type']:
                # for a cluster: the greedy algorithm is switched off
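                # Scheduling note: a cluster scheduler enforces its own resource
                # limits, so runnable steps are simply submitted in job order here;
                # the local/cloud branch below instead launches the waiting step
                # with the largest predicted CPU footprint first (greedy, one
                # thread started per scheduling cycle).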
                for index, job_desc in enumerate(sorted_resources_info):
                    items = job_desc.split('_')
                    job_id = int(items[0])
                    step_order = int(items[1])
                    if job_id not in JOB_TABLE.keys():
                        continue
                    if JOB_TABLE[job_id]['status'] > 0:
                        continue
                    if RESOURCES[job_desc]['cpu'] is None \
                            and RESOURCES[job_desc]['mem'] is None \
                            and RESOURCES[job_desc]['disk'] is None:
                        if RUNNING_JOBS > 0:
                            set_checkpoint_info(job_id, 4)
                        else:
                            new_thread = threading.Thread(target=run_step, args=(job_desc, RESOURCES[job_desc]))
                            new_thread.setDaemon(True)
                            new_thread.start()
                            break
                    else:
                        new_thread = threading.Thread(target=run_step, args=(job_desc, RESOURCES[job_desc]))
                        new_thread.setDaemon(True)
                        new_thread.start()
            else:
                # local / cloud: greedy algorithm
                for index, job_desc in enumerate(sorted_resources_info):
                    items = job_desc.split('_')
                    job_id = int(items[0])
                    step_order = int(items[1])
                    if job_id not in JOB_TABLE.keys():
                        continue
                    if JOB_TABLE[job_id]['status'] > 0:
                        continue
                    if RESOURCES[job_desc]['cpu'] is None \
                            and RESOURCES[job_desc]['mem'] is None \
                            and RESOURCES[job_desc]['disk'] is None:
                        if RUNNING_JOBS > 0:
                            set_checkpoint_info(job_id, 4)
                        else:
                            new_thread = threading.Thread(target=run_step, args=(job_desc, RESOURCES[job_desc]))
                            new_thread.setDaemon(True)
                            new_thread.start()
                            break
                    else:
                        if RESOURCES[job_desc]['cpu'] > cpu_indeed or RESOURCES[job_desc]['cpu'] > CPU_POOL:
                            set_checkpoint_info(job_id, 3)
                        elif RESOURCES[job_desc]['mem'] > mem_indeed or RESOURCES[job_desc]['mem'] > MEMORY_POOL:
                            set_checkpoint_info(job_id, 2)
                        elif "disk" in RESOURCES[job_desc].keys() \
                                and (RESOURCES[job_desc]['disk'] > disk_indeed
                                     or RESOURCES[job_desc]['disk'] > DISK_POOL):
                            set_checkpoint_info(job_id, 1)
                        # elif RESOURCES[job_desc]['vrt_mem'] > vrt_indeed or RESOURCES[job_desc]['vrt_mem'] > VRT_POOL:
                        #     set_checkpoint_info(job_id, 6)
                        else:
                            if biggest_cpu is None:
                                biggest_cpu = RESOURCES[job_desc]['cpu']
                            if biggest_mem is None:
                                biggest_mem = RESOURCES[job_desc]['mem']
                            if biggest_id is None:
                                biggest_id = job_desc
                            # if biggest_vrt_mem is None:
                            #     biggest_vrt_mem = RESOURCES[job_desc]['vrt_mem']
                            if biggest_cpu < RESOURCES[job_desc]['cpu']:
                                biggest_cpu = RESOURCES[job_desc]['cpu']
                                biggest_mem = RESOURCES[job_desc]['mem']
                                # biggest_vrt_mem = RESOURCES[job_desc]['vrt_mem']
                                biggest_id = job_desc
                if biggest_id is not None:
                    new_thread = threading.Thread(target=run_step, args=(biggest_id, RESOURCES[biggest_id]))
                    new_thread.setDaemon(True)
                    new_thread.start()
            time.sleep(5)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    main()