# This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, you can obtain one at http://mozilla.org/MPL/2.0/. import math from datetime import timedelta from autorepr import autorepr, autostr from django.db import models, transaction from django.utils import timezone from django.utils.functional import cached_property from ..clusters.models import Cluster, EMRReleaseModel from ..clusters.provisioners import ClusterProvisioner from ..models import CreatedByModel, EditedAtModel, URLActionModel from ..stats.models import Metric from .provisioners import SparkJobProvisioner from .queries import SparkJobQuerySet, SparkJobRunQuerySet DEFAULT_STATUS = "" class SparkJob(EMRReleaseModel, CreatedByModel, EditedAtModel, URLActionModel): """ A data model to store details about a scheduled Spark job, to be run on AWS EMR. """ INTERVAL_DAILY = 24 INTERVAL_WEEKLY = INTERVAL_DAILY * 7 INTERVAL_MONTHLY = INTERVAL_DAILY * 30 INTERVAL_CHOICES = [ (INTERVAL_DAILY, "Daily"), (INTERVAL_WEEKLY, "Weekly"), (INTERVAL_MONTHLY, "Monthly"), ] RESULT_PRIVATE = "private" RESULT_PUBLIC = "public" RESULT_VISIBILITY_CHOICES = [(RESULT_PRIVATE, "Private"), (RESULT_PUBLIC, "Public")] identifier = models.CharField( max_length=100, help_text="Job name, used to uniqely identify individual jobs.", unique=True, db_index=True, ) description = models.TextField(help_text="Job description.", default="") notebook_s3_key = models.CharField( max_length=800, help_text="S3 key of the notebook after uploading it to the Spark code bucket.", ) result_visibility = models.CharField( # can currently be "public" or "private" max_length=50, help_text="Whether notebook results are uploaded to a public or private bucket", choices=RESULT_VISIBILITY_CHOICES, default=RESULT_PRIVATE, ) size = models.IntegerField(help_text="Number of computers to use to run the job.") interval_in_hours = models.IntegerField( help_text="Interval at which the job should run, in hours.", choices=INTERVAL_CHOICES, default=INTERVAL_DAILY, ) job_timeout = models.IntegerField( help_text="Number of hours before the job times out." ) start_date = models.DateTimeField( help_text="Date/time that the job should start being scheduled to run." ) end_date = models.DateTimeField( blank=True, null=True, help_text="Date/time that the job should stop being scheduled to run, null if no end date.", ) expired_date = models.DateTimeField( blank=True, null=True, help_text="Date/time that the job was expired.", db_index=True, ) is_enabled = models.BooleanField( default=True, help_text="Whether the job should run or not." ) objects = SparkJobQuerySet.as_manager() class Meta: permissions = [("view_sparkjob", "Can view Spark job")] __str__ = autostr("{self.identifier}") __repr__ = autorepr(["identifier", "size", "is_enabled"]) url_prefix = "jobs" url_actions = ["delete", "detail", "download", "edit", "run", "zeppelin"] def get_absolute_url(self): return self.urls.detail @property def provisioner(self): return SparkJobProvisioner() # TEMPORARY till we have 1:1 relationship to cluster object # and we can then ask for spark_job.cluster.provisioner @property def cluster_provisioner(self): return ClusterProvisioner() @property def schedule(self): from .schedules import SparkJobSchedule return SparkJobSchedule(self) def has_future_end_date(self, now): # no end date means it'll always be due if self.end_date is None: return True return self.end_date >= now @property def has_never_run(self): """ Whether the job has run before. Looks at both the cluster status and our own record when we asked it to run. """ return ( self.latest_run is None or self.latest_run.status == DEFAULT_STATUS or self.latest_run.scheduled_at is None ) @property def has_finished(self): """Whether the job's cluster is terminated or failed""" return self.latest_run and self.latest_run.status in Cluster.FINAL_STATUS_LIST @property def has_timed_out(self): """ Whether the current job run has been running longer than the job's timeout allows. """ if self.has_never_run: # Job isn't even running at the moment and never ran before return False timeout_delta = timedelta(hours=self.job_timeout) max_run_time = self.latest_run.scheduled_at + timeout_delta timed_out = timezone.now() >= max_run_time return not self.is_runnable and timed_out @property def is_due(self): """ Whether the start date is in the past and the end date is in the future. """ now = timezone.now() has_past_start_date = self.start_date <= now return has_past_start_date and self.has_future_end_date(now) @property def is_runnable(self): """ Either the job has never run before or was never finished. This is checked right before the actual provisioning. """ return self.has_never_run or self.has_finished @property def should_run(self): """Whether the scheduled Spark job should run.""" return self.is_runnable and self.is_enabled and self.is_due @property def is_public(self): return self.result_visibility == self.RESULT_PUBLIC @property def is_active(self): return self.latest_run and self.latest_run.status in Cluster.ACTIVE_STATUS_LIST @property def notebook_name(self): return self.notebook_s3_key.rsplit("/", 1)[-1] @cached_property def notebook_s3_object(self): return self.provisioner.get(self.notebook_s3_key) @cached_property def results(self): return self.provisioner.results(self.identifier, self.is_public) def get_latest_run(self): try: return self.runs.latest() except SparkJobRun.DoesNotExist: return None latest_run = cached_property(get_latest_run, name="latest_run") def run(self): """Actually run the scheduled Spark job.""" # if the job ran before and is still running, don't start it again if not self.is_runnable: return jobflow_id = self.provisioner.run( user_username=self.created_by.username, user_email=self.created_by.email, identifier=self.identifier, emr_release=self.emr_release.version, size=self.size, notebook_key=self.notebook_s3_key, is_public=self.is_public, job_timeout=self.job_timeout, ) # Create new job history record. run = self.runs.create( spark_job=self, jobflow_id=jobflow_id, scheduled_at=timezone.now(), emr_release_version=self.emr_release.version, size=self.size, ) # Remove the cached latest run to this objects will requery it. try: delattr(self, "latest_run") except AttributeError: # pragma: no cover pass # It didn't have a `latest_run` and that's ok. with transaction.atomic(): Metric.record( "sparkjob-emr-version", data={"version": self.emr_release.version} ) # sync with EMR API transaction.on_commit(run.sync) def expire(self): # TODO disable the job as well once it's easy to re-enable the job deleted = self.schedule.delete() self.expired_date = timezone.now() self.save() return deleted def terminate(self): """Stop the currently running scheduled Spark job.""" if self.latest_run: self.cluster_provisioner.stop(self.latest_run.jobflow_id) def first_run(self): if self.latest_run: return None from .tasks import run_job return run_job.apply_async( args=(self.pk,), kwargs={"first_run": True}, # make sure we run this task only when we expect it # may be in the future, may be in the past # but definitely at a specific time eta=self.start_date, ) def save(self, *args, **kwargs): # whether the job is being created for the first time first_save = self.pk is None # resetting expired_date in case a user resets the end_date if self.expired_date and self.end_date and self.end_date > timezone.now(): self.expired_date = None super().save(*args, **kwargs) # Remove the cached latest run to this objects will requery it. try: delattr(self, "latest_run") except AttributeError: # pragma: no cover pass # It didn't have a `latest_run` and that's ok. # first remove if it exists self.schedule.delete() # and then add it, but only if the end date is in the future if self.has_future_end_date(timezone.now()): self.schedule.add() if first_save: transaction.on_commit(self.first_run) def delete(self, *args, **kwargs): # make sure to shut down the cluster if it's currently running self.terminate() # make sure to clean up the job notebook from storage self.provisioner.remove(self.notebook_s3_key) self.schedule.delete() super().delete(*args, **kwargs) class SparkJobRun(EditedAtModel): """ A data model to store information about every individual run of a scheduled Spark job. This denormalizes some values from its related data model :class:`SparkJob`. """ spark_job = models.ForeignKey( SparkJob, on_delete=models.CASCADE, related_name="runs", related_query_name="runs", ) jobflow_id = models.CharField(max_length=50, blank=True, null=True) emr_release_version = models.CharField(max_length=50, blank=True, null=True) size = models.IntegerField( help_text="Number of computers used to run the job.", blank=True, null=True ) status = models.CharField( max_length=50, blank=True, default=DEFAULT_STATUS, db_index=True ) scheduled_at = models.DateTimeField( blank=True, null=True, help_text="Date/time that the job was scheduled." ) started_at = models.DateTimeField( blank=True, null=True, help_text="Date/time when the cluster was started on AWS EMR.", ) ready_at = models.DateTimeField( blank=True, null=True, help_text="Date/time when the cluster was ready to run steps on AWS EMR.", ) finished_at = models.DateTimeField( blank=True, null=True, help_text="Date/time that the job was terminated or failed.", ) objects = SparkJobRunQuerySet.as_manager() class Meta: get_latest_by = "created_at" ordering = ["-created_at"] __str__ = autostr("{self.jobflow_id}") def spark_job_identifier(self): return self.spark_job.identifier __repr__ = autorepr( ["jobflow_id", "spark_job_identifier", "emr_release_version", "size"], spark_job_identifier=spark_job_identifier, ) @property def info(self): return self.spark_job.cluster_provisioner.info(self.jobflow_id) def sync(self, info=None): """ Updates latest status and life cycle datetimes. """ if info is None: info = self.info # a mapping between what the provisioner returns what the data model uses model_field_map = ( ("state", "status"), ("creation_datetime", "started_at"), ("ready_datetime", "ready_at"), ("end_datetime", "finished_at"), ) save_needed = False date_fields_updated = False # set the various model fields to the value the API returned for api_field, model_field in model_field_map: field_value = info.get(api_field) if field_value is None or field_value == getattr(self, model_field): continue setattr(self, model_field, field_value) save_needed = True if model_field in ("started_at", "ready_at", "finished_at"): date_fields_updated = True with transaction.atomic(): # If the job cluster terminated with error raise the alarm. if self.status == Cluster.STATUS_TERMINATED_WITH_ERRORS: transaction.on_commit(lambda: self.alert(info)) # If any data changed, save it. if save_needed: self.save() with transaction.atomic(): if date_fields_updated: # When job cluster is ready, record time to ready. if self.ready_at and not self.finished_at: # Time in seconds it took the cluster to be ready. time_to_ready = (self.ready_at - self.started_at).seconds Metric.record( "sparkjob-time-to-ready", time_to_ready, data={ "identifier": self.spark_job.identifier, "size": self.size, "jobflow_id": self.jobflow_id, }, ) if self.finished_at: # When job is finished, record normalized instance hours. hours = math.ceil( (self.finished_at - self.started_at).seconds / 60 / 60 ) normalized_hours = hours * self.size Metric.record( "sparkjob-normalized-instance-hours", normalized_hours, data={ "identifier": self.spark_job.identifier, "size": self.size, "jobflow_id": self.jobflow_id, }, ) if self.finished_at and self.ready_at: # When job is finished, record time in seconds it took the # scheduled job to run. Sometimes `ready_at` won't be # available if the cluster terminated with errors. run_time = (self.finished_at - self.ready_at).seconds Metric.record( "sparkjob-run-time", run_time, data={ "identifier": self.spark_job.identifier, "size": self.size, "jobflow_id": self.jobflow_id, }, ) return self.status def alert(self, info): self.alerts.get_or_create( reason_code=info["state_change_reason_code"], reason_message=info["state_change_reason_message"], ) class SparkJobRunAlert(EditedAtModel): """ A data model to store job run alerts for later processing by an async job that sends out emails. """ run = models.ForeignKey( SparkJobRun, on_delete=models.CASCADE, related_name="alerts" ) reason_code = models.CharField( max_length=50, blank=True, null=True, help_text="The reason code for the creation of the alert.", ) reason_message = models.TextField( default="", help_text="The reason message for the creation of the alert." ) mail_sent_date = models.DateTimeField( blank=True, null=True, help_text="The datetime the alert email was sent." ) class Meta: unique_together = [["run", "reason_code", "reason_message"]] index_together = [["reason_code", "mail_sent_date"]] __str__ = autostr("{self.id}") def short_reason_message(self): return self.reason_message[:50] __repr__ = autorepr( ["id", "reason_code", "short_reason_message"], short_reason_message=short_reason_message, )