1
0
mirror of https://github.com/ARM-software/workload-automation.git synced 2025-01-31 10:11:17 +00:00

framework/run: fix job status handling.

- Re-order Status entries so that higher severity entries have higher
  enum values.
- Add set_status() to Job that ensures that a status is only set if it
  is of higher severity (e.g. a Job that has been marked as PARTIAL by
  an instrument will not be overwritten as OK by the runner).
- Retry now generates a new job, rather than re-enqueuing the existing
  object; this ensures that the output status is tracked properly.
- Adjust ManagedCallback to set the job status to FAILED if it sees a
  WorkloadError, and to PARTIAL otherwise. The idea being that
  instruments raise WorkloadError if they have a reason to believe the
  workload did not execute properly and indicate failure even if the
  workload itself has failed to detect it (e.g. FPS instrument detecting
  crashed content, where the workload might lack any feedback regarding
  the crash). Other errors would indicate an issue with the instrument
  itself, and so the job is marked as PARTIAL, as there is no reason to
  suspect that the workload is at fault and the other results generated
  for this execution may be valid.
This commit is contained in:
Sergei Trofimov 2017-09-18 16:12:03 +01:00
parent fe53efcd49
commit 7c7ffe3e77
4 changed files with 34 additions and 18 deletions

View File

@ -34,7 +34,7 @@ KIND_MAP = {
Status = enum(['UNKNOWN', 'NEW', 'PENDING',
'STARTED', 'CONNECTED', 'INITIALIZED', 'RUNNING',
'SKIPPED', 'ABORTED', 'FAILED', 'PARTIAL', 'OK'])
'OK', 'PARTIAL', 'FAILED', 'ABORTED', 'SKIPPED'])

View File

@ -32,6 +32,7 @@ from wa.framework.configuration.core import settings, Status
from wa.framework.exception import (WAError, ConfigError, TimeoutError,
InstrumentError, TargetError, HostError,
TargetNotRespondingError)
from wa.framework.job import Job
from wa.framework.output import init_job_output
from wa.framework.plugin import Artifact
from wa.framework.processor import ProcessorManager
@ -155,10 +156,10 @@ class ExecutionContext(object):
self.output.write_result()
self.current_job = None
def set_status(self, status):
def set_status(self, status, force=False):
if not self.current_job:
raise RuntimeError('No jobs in progress')
self.current_job.status = Status(status)
self.current_job.set_status(status, force)
def extract_results(self):
self.tm.extract_results(self)
@ -391,12 +392,12 @@ class Runner(object):
try:
log.indent()
self.do_run_job(job, context)
job.status = Status.OK
job.set_status(Status.OK)
except KeyboardInterrupt:
job.status = Status.ABORTED
job.set_status(Status.ABORTED)
raise
except Exception as e:
job.status = Status.FAILED
job.set_status(Status.FAILED)
context.add_event(e.message)
if not getattr(e, 'logged', None):
log.log_error(e, self.logger)
@ -410,7 +411,7 @@ class Runner(object):
self.check_job(job)
def do_run_job(self, job, context):
job.status = Status.RUNNING
job.set_status(Status.RUNNING)
self.send(signal.JOB_STARTED)
with signal.wrap('JOB_TARGET_CONFIG', self):
@ -429,15 +430,15 @@ class Runner(object):
self.pm.process_job_output(context)
self.pm.export_job_output(context)
except Exception:
job.status = Status.PARTIAL
job.set_status(Status.PARTIAL)
raise
except KeyboardInterrupt:
job.status = Status.ABORTED
job.set_status(Status.ABORTED)
self.logger.info('Got CTRL-C. Aborting.')
raise
except Exception as e:
job.status = Status.FAILED
job.set_status(Status.FAILED)
if not getattr(e, 'logged', None):
log.log_error(e, self.logger)
e.logged = True
@ -454,19 +455,24 @@ class Runner(object):
if job.retries < rc.max_retries:
msg = 'Job {} iteration {} completed with status {}. retrying...'
self.logger.error(msg.format(job.id, job.status, job.iteration))
self.retry_job(job)
self.context.move_failed(job)
job.retries += 1
job.status = Status.PENDING
self.context.job_queue.insert(0, job)
self.context.write_state()
else:
msg = 'Job {} iteration {} completed with status {}. '\
'Max retries exceeded.'
self.logger.error(msg.format(job.id, job.status, job.iteration))
self.logger.error(msg.format(job.id, job.iteration, job.status))
self.context.failed_jobs += 1
else: # status not in retry_on_status
self.logger.info('Job completed with status {}'.format(job.status))
self.context.successful_jobs += 1
def retry_job(self, job):
retry_job = Job(job.spec, job.iteration, self.context)
retry_job.workload = job.workload
retry_job.retries = job.retries + 1
retry_job.set_status(Status.PENDING)
self.context.job_queue.insert(0, retry_job)
def send(self, s):
signal.send(s, self, self.context)

View File

@ -104,7 +104,8 @@ from collections import OrderedDict
from wa.framework import signal
from wa.framework.plugin import Plugin
from wa.framework.exception import WAError, TargetNotRespondingError, TimeoutError
from wa.framework.exception import (WAError, TargetNotRespondingError, TimeoutError,
WorkloadError)
from wa.utils.log import log_error
from wa.utils.misc import get_traceback, isiterable
from wa.utils.types import identifier, enum, level
@ -250,7 +251,7 @@ def check_failures():
class ManagedCallback(object):
"""
This wraps instruments' callbacks to ensure that errors do interfer
This wraps instruments' callbacks to ensure that errors do not interfer
with run execution.
"""
@ -270,7 +271,11 @@ class ManagedCallback(object):
global failures_detected # pylint: disable=W0603
failures_detected = True
log_error(e, logger)
disable(self.instrument)
context.add_event(e.message)
if isinstance(e, WorkloadError):
context.set_status('FAILED')
else:
context.set_status('PARTIAL')
# Need this to keep track of callbacks, because the dispatcher only keeps

View File

@ -58,7 +58,7 @@ class Job(object):
self.logger.info('Initializing job {}'.format(self.id))
with signal.wrap('WORKLOAD_INITIALIZED', self, context):
self.workload.initialize(context)
self.status = Status.PENDING
self.set_status(Status.PENDING)
context.update_job_state(self)
def configure_target(self, context):
@ -96,3 +96,8 @@ class Job(object):
self.logger.info('Finalizing job {}'.format(self.id))
with signal.wrap('WORKLOAD_FINALIZED', self, context):
self.workload.finalize(context)
def set_status(self, status, force=False):
status = Status(status)
if force or self.status < status:
self.status = status