
framework/run: fix job status handling.

- Re-order Status entries so that higher-severity entries have higher
  enum values.
- Add set_status() to Job that ensures a status is only set if it is of
  higher severity than the current one (e.g. a Job that has been marked
  as PARTIAL by an instrument will not be overwritten as OK by the
  runner); see the sketch below.
- Retry now generates a new job, rather than re-enqueuing the existing
  object; this ensures that the output status is tracked properly.
- Adjust ManagedCallback to set job status to FAILED if it sees a
  WorkloadError, and to PARTIAL otherwise. The idea is that instruments
  raise WorkloadError when they have reason to believe the workload did
  not execute properly, indicating a failure even if the workload itself
  failed to detect it (e.g. the FPS instrument detecting crashed
  content, where the workload might lack any feedback regarding the
  crash). Other errors indicate an issue with the instrument itself, so
  the job is only marked as PARTIAL: there is no reason to suspect that
  the workload is at fault, and the other results generated for this
  execution may still be valid.
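
The intended behaviour can be illustrated with a minimal, self-contained
sketch. It uses the standard-library IntEnum as a stand-in for WA's own
enum()/level types from wa.utils.types, and a stripped-down Job; the names
mirror the commit, but the implementation is illustrative only, not the
framework's actual API.

    # Stand-in sketch: IntEnum replaces wa.utils.types.enum(); only the new
    # ordering and the "never downgrade" rule from this commit are shown.
    from enum import IntEnum

    class Status(IntEnum):
        UNKNOWN = 0
        NEW = 1
        PENDING = 2
        STARTED = 3
        CONNECTED = 4
        INITIALIZED = 5
        RUNNING = 6
        OK = 7        # lowest of the terminal statuses after the re-order
        PARTIAL = 8
        FAILED = 9
        ABORTED = 10
        SKIPPED = 11  # highest enum value after the re-order

    class Job(object):
        def __init__(self):
            self.status = Status.NEW

        def set_status(self, status, force=False):
            status = Status(status)
            # Only move to a more severe status unless explicitly forced, so
            # a PARTIAL set by an instrument is not overwritten by the
            # runner's OK.
            if force or self.status < status:
                self.status = status

    job = Job()
    job.set_status(Status.RUNNING)
    job.set_status(Status.PARTIAL)         # an instrument flags a problem
    job.set_status(Status.OK)              # runner finishes -- ignored
    assert job.status is Status.PARTIAL
    job.set_status(Status.OK, force=True)  # explicit override still possible
    assert job.status is Status.OK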
Sergei Trofimov 2017-09-18 16:12:03 +01:00
parent fe53efcd49
commit 7c7ffe3e77
4 changed files with 34 additions and 18 deletions


@@ -34,7 +34,7 @@ KIND_MAP = {
 Status = enum(['UNKNOWN', 'NEW', 'PENDING',
                'STARTED', 'CONNECTED', 'INITIALIZED', 'RUNNING',
-               'SKIPPED', 'ABORTED', 'FAILED', 'PARTIAL', 'OK'])
+               'OK', 'PARTIAL', 'FAILED', 'ABORTED', 'SKIPPED'])


@@ -32,6 +32,7 @@ from wa.framework.configuration.core import settings, Status
 from wa.framework.exception import (WAError, ConfigError, TimeoutError,
                                     InstrumentError, TargetError, HostError,
                                     TargetNotRespondingError)
+from wa.framework.job import Job
 from wa.framework.output import init_job_output
 from wa.framework.plugin import Artifact
 from wa.framework.processor import ProcessorManager
@@ -155,10 +156,10 @@ class ExecutionContext(object):
         self.output.write_result()
         self.current_job = None

-    def set_status(self, status):
+    def set_status(self, status, force=False):
         if not self.current_job:
             raise RuntimeError('No jobs in progress')
-        self.current_job.status = Status(status)
+        self.current_job.set_status(status, force)

     def extract_results(self):
         self.tm.extract_results(self)
@@ -391,12 +392,12 @@ class Runner(object):
         try:
             log.indent()
             self.do_run_job(job, context)
-            job.status = Status.OK
+            job.set_status(Status.OK)
         except KeyboardInterrupt:
-            job.status = Status.ABORTED
+            job.set_status(Status.ABORTED)
             raise
         except Exception as e:
-            job.status = Status.FAILED
+            job.set_status(Status.FAILED)
             context.add_event(e.message)
             if not getattr(e, 'logged', None):
                 log.log_error(e, self.logger)
@@ -410,7 +411,7 @@ class Runner(object):
             self.check_job(job)

     def do_run_job(self, job, context):
-        job.status = Status.RUNNING
+        job.set_status(Status.RUNNING)
         self.send(signal.JOB_STARTED)

         with signal.wrap('JOB_TARGET_CONFIG', self):
@ -429,15 +430,15 @@ class Runner(object):
self.pm.process_job_output(context) self.pm.process_job_output(context)
self.pm.export_job_output(context) self.pm.export_job_output(context)
except Exception: except Exception:
job.status = Status.PARTIAL job.set_status(Status.PARTIAL)
raise raise
except KeyboardInterrupt: except KeyboardInterrupt:
job.status = Status.ABORTED job.set_status(Status.ABORTED)
self.logger.info('Got CTRL-C. Aborting.') self.logger.info('Got CTRL-C. Aborting.')
raise raise
except Exception as e: except Exception as e:
job.status = Status.FAILED job.set_status(Status.FAILED)
if not getattr(e, 'logged', None): if not getattr(e, 'logged', None):
log.log_error(e, self.logger) log.log_error(e, self.logger)
e.logged = True e.logged = True
@@ -454,19 +455,24 @@ class Runner(object):
             if job.retries < rc.max_retries:
                 msg = 'Job {} iteration {} completed with status {}. retrying...'
                 self.logger.error(msg.format(job.id, job.status, job.iteration))
+                self.retry_job(job)
                 self.context.move_failed(job)
-                job.retries += 1
-                job.status = Status.PENDING
-                self.context.job_queue.insert(0, job)
                 self.context.write_state()
             else:
                 msg = 'Job {} iteration {} completed with status {}. '\
                       'Max retries exceeded.'
-                self.logger.error(msg.format(job.id, job.status, job.iteration))
+                self.logger.error(msg.format(job.id, job.iteration, job.status))
                 self.context.failed_jobs += 1
         else:  # status not in retry_on_status
             self.logger.info('Job completed with status {}'.format(job.status))
             self.context.successful_jobs += 1

+    def retry_job(self, job):
+        retry_job = Job(job.spec, job.iteration, self.context)
+        retry_job.workload = job.workload
+        retry_job.retries = job.retries + 1
+        retry_job.set_status(Status.PENDING)
+        self.context.job_queue.insert(0, retry_job)
+
     def send(self, s):
         signal.send(s, self, self.context)
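
A side observation on the retry change, using the same illustrative IntEnum
stand-in as the sketch above (not the framework's actual types): given the
never-downgrade rule in set_status(), calling set_status(Status.PENDING) on
the old, already-FAILED object would be a no-op, so retry_job() builds a
fresh Job instead.

    # Stand-in sketch (IntEnum instead of wa.utils.types.enum) showing why
    # the retried job is a fresh object under the never-downgrade rule.
    from enum import IntEnum

    class Status(IntEnum):
        PENDING = 2
        FAILED = 9

    class Job(object):
        def __init__(self):
            self.status = Status.PENDING
            self.retries = 0

        def set_status(self, status, force=False):
            status = Status(status)
            if force or self.status < status:
                self.status = status

    failed = Job()
    failed.set_status(Status.FAILED)
    failed.set_status(Status.PENDING)   # ignored: PENDING sits below FAILED
    assert failed.status is Status.FAILED

    retry = Job()                       # fresh object, as retry_job() does
    retry.retries = failed.retries + 1
    retry.set_status(Status.PENDING)
    assert retry.status is Status.PENDING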


@@ -104,7 +104,8 @@ from collections import OrderedDict
 from wa.framework import signal
 from wa.framework.plugin import Plugin
-from wa.framework.exception import WAError, TargetNotRespondingError, TimeoutError
+from wa.framework.exception import (WAError, TargetNotRespondingError, TimeoutError,
+                                    WorkloadError)
 from wa.utils.log import log_error
 from wa.utils.misc import get_traceback, isiterable
 from wa.utils.types import identifier, enum, level
@@ -250,7 +251,7 @@ def check_failures():
 class ManagedCallback(object):
     """
-    This wraps instruments' callbacks to ensure that errors do interfer
+    This wraps instruments' callbacks to ensure that errors do not interfer
     with run execution.

     """
@@ -270,7 +271,11 @@ class ManagedCallback(object):
                 global failures_detected  # pylint: disable=W0603
                 failures_detected = True
                 log_error(e, logger)
-                disable(self.instrument)
+                context.add_event(e.message)
+                if isinstance(e, WorkloadError):
+                    context.set_status('FAILED')
+                else:
+                    context.set_status('PARTIAL')


 # Need this to keep track of callbacks, because the dispatcher only keeps
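
As a rough illustration of the routing rule added above (not the framework
code itself), the decision made in ManagedCallback's error handler boils down
to the following; WorkloadError here is a local stand-in for
wa.framework.exception.WorkloadError, and status_for_instrument_error is a
hypothetical helper used only for this sketch.

    # Hedged sketch of the dispatch rule: WorkloadError means the instrument
    # believes the workload itself misbehaved, so the job becomes FAILED; any
    # other exception points at the instrument, so the job only drops to
    # PARTIAL.
    class WorkloadError(Exception):
        """Stand-in for wa.framework.exception.WorkloadError."""

    def status_for_instrument_error(exc):
        # Hypothetical helper; in the real change this branch lives inside
        # ManagedCallback.__call__ and calls context.set_status() directly.
        return 'FAILED' if isinstance(exc, WorkloadError) else 'PARTIAL'

    assert status_for_instrument_error(WorkloadError('content crashed')) == 'FAILED'
    assert status_for_instrument_error(RuntimeError('probe misbehaved')) == 'PARTIAL'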


@@ -58,7 +58,7 @@ class Job(object):
         self.logger.info('Initializing job {}'.format(self.id))
         with signal.wrap('WORKLOAD_INITIALIZED', self, context):
             self.workload.initialize(context)
-        self.status = Status.PENDING
+        self.set_status(Status.PENDING)
         context.update_job_state(self)

     def configure_target(self, context):
@@ -96,3 +96,8 @@ class Job(object):
         self.logger.info('Finalizing job {}'.format(self.id))
         with signal.wrap('WORKLOAD_FINALIZED', self, context):
             self.workload.finalize(context)
+
+    def set_status(self, status, force=False):
+        status = Status(status)
+        if force or self.status < status:
+            self.status = status