Mirror of https://github.com/ARM-software/workload-automation.git
framework/run: fix job status handling.
- Re-order Status entries so that higher-severity entries have higher enum values.
- Add set_status() to Job, which ensures that a status is only set if it is of higher severity than the current one (e.g. a Job that has been marked as PARTIAL by an instrument will not be overwritten as OK by the runner).
- Retry now generates a new job rather than re-enqueuing the existing object; this ensures that the output status is tracked properly.
- Adjust ManagedCallback to set the job status to FAILED if it sees a WorkloadError, and to PARTIAL otherwise. The idea is that an instrument raises WorkloadError when it has reason to believe the workload did not execute properly, even if the workload itself failed to detect this (e.g. the FPS instrument detecting crashed content, where the workload might lack any feedback regarding the crash). Other errors indicate an issue with the instrument itself, so the job is marked as PARTIAL: there is no reason to suspect the workload is at fault, and the other results generated for this execution may still be valid.
Commit: 7c7ffe3e77
Parent: fe53efcd49
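For illustration, a minimal, self-contained sketch of the severity-based status handling this commit introduces. The Status values are simplified and handle_instrument_error() is a hypothetical helper; the real code uses wa.utils.types.enum and the Job/ManagedCallback changes shown in the diff below.

# Minimal sketch (not the actual WA implementation): severity-ordered Status
# values and an escalate-only set_status(), with an IntEnum standing in for
# wa.utils.types.enum.
from enum import IntEnum


class Status(IntEnum):
    # Higher values mean higher severity, so ordinary comparisons decide
    # whether a new status may overwrite the current one.
    OK = 1
    PARTIAL = 2
    FAILED = 3
    ABORTED = 4


class WorkloadError(Exception):
    """Raised by an instrument that believes the workload itself misbehaved."""


class Job(object):

    def __init__(self):
        self.status = Status.OK

    def set_status(self, status, force=False):
        status = Status(status)
        # Only escalate: once an instrument marks the job PARTIAL or FAILED,
        # a later set_status(Status.OK) from the runner is ignored unless forced.
        if force or self.status < status:
            self.status = status


def handle_instrument_error(job, error):
    # Mirrors the ManagedCallback policy described above: WorkloadError means
    # the workload did not run properly (FAILED); any other instrument error
    # leaves the workload's results potentially valid (PARTIAL).
    if isinstance(error, WorkloadError):
        job.set_status(Status.FAILED)
    else:
        job.set_status(Status.PARTIAL)


if __name__ == '__main__':
    job = Job()
    handle_instrument_error(job, RuntimeError('instrument hiccup'))
    assert job.status is Status.PARTIAL
    job.set_status(Status.OK)        # lower severity, so ignored
    assert job.status is Status.PARTIAL
    handle_instrument_error(job, WorkloadError('content crashed'))
    assert job.status is Status.FAILED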
@@ -34,7 +34,7 @@ KIND_MAP = {
 
 Status = enum(['UNKNOWN', 'NEW', 'PENDING',
                'STARTED', 'CONNECTED', 'INITIALIZED', 'RUNNING',
-               'SKIPPED', 'ABORTED', 'FAILED', 'PARTIAL', 'OK'])
+               'OK', 'PARTIAL', 'FAILED', 'ABORTED', 'SKIPPED'])
 
@@ -32,6 +32,7 @@ from wa.framework.configuration.core import settings, Status
 from wa.framework.exception import (WAError, ConfigError, TimeoutError,
                                     InstrumentError, TargetError, HostError,
                                     TargetNotRespondingError)
+from wa.framework.job import Job
 from wa.framework.output import init_job_output
 from wa.framework.plugin import Artifact
 from wa.framework.processor import ProcessorManager
@@ -155,10 +156,10 @@ class ExecutionContext(object):
         self.output.write_result()
         self.current_job = None
 
-    def set_status(self, status):
+    def set_status(self, status, force=False):
         if not self.current_job:
             raise RuntimeError('No jobs in progress')
-        self.current_job.status = Status(status)
+        self.current_job.set_status(status, force)
 
     def extract_results(self):
         self.tm.extract_results(self)
@@ -391,12 +392,12 @@ class Runner(object):
         try:
             log.indent()
             self.do_run_job(job, context)
-            job.status = Status.OK
+            job.set_status(Status.OK)
         except KeyboardInterrupt:
-            job.status = Status.ABORTED
+            job.set_status(Status.ABORTED)
             raise
         except Exception as e:
-            job.status = Status.FAILED
+            job.set_status(Status.FAILED)
             context.add_event(e.message)
             if not getattr(e, 'logged', None):
                 log.log_error(e, self.logger)
@@ -410,7 +411,7 @@ class Runner(object):
             self.check_job(job)
 
     def do_run_job(self, job, context):
-        job.status = Status.RUNNING
+        job.set_status(Status.RUNNING)
         self.send(signal.JOB_STARTED)
 
         with signal.wrap('JOB_TARGET_CONFIG', self):
@@ -429,15 +430,15 @@ class Runner(object):
                 self.pm.process_job_output(context)
                 self.pm.export_job_output(context)
             except Exception:
-                job.status = Status.PARTIAL
+                job.set_status(Status.PARTIAL)
                 raise
 
         except KeyboardInterrupt:
-            job.status = Status.ABORTED
+            job.set_status(Status.ABORTED)
             self.logger.info('Got CTRL-C. Aborting.')
             raise
         except Exception as e:
-            job.status = Status.FAILED
+            job.set_status(Status.FAILED)
             if not getattr(e, 'logged', None):
                 log.log_error(e, self.logger)
                 e.logged = True
@@ -454,19 +455,24 @@ class Runner(object):
             if job.retries < rc.max_retries:
                 msg = 'Job {} iteration {} completed with status {}. retrying...'
                 self.logger.error(msg.format(job.id, job.status, job.iteration))
+                self.retry_job(job)
                 self.context.move_failed(job)
-                job.retries += 1
-                job.status = Status.PENDING
-                self.context.job_queue.insert(0, job)
                 self.context.write_state()
             else:
                 msg = 'Job {} iteration {} completed with status {}. '\
                       'Max retries exceeded.'
-                self.logger.error(msg.format(job.id, job.status, job.iteration))
+                self.logger.error(msg.format(job.id, job.iteration, job.status))
                 self.context.failed_jobs += 1
         else:  # status not in retry_on_status
             self.logger.info('Job completed with status {}'.format(job.status))
             self.context.successful_jobs += 1
 
+    def retry_job(self, job):
+        retry_job = Job(job.spec, job.iteration, self.context)
+        retry_job.workload = job.workload
+        retry_job.retries = job.retries + 1
+        retry_job.set_status(Status.PENDING)
+        self.context.job_queue.insert(0, retry_job)
+
     def send(self, s):
         signal.send(s, self, self.context)
@@ -104,7 +104,8 @@ from collections import OrderedDict
 
 from wa.framework import signal
 from wa.framework.plugin import Plugin
-from wa.framework.exception import WAError, TargetNotRespondingError, TimeoutError
+from wa.framework.exception import (WAError, TargetNotRespondingError, TimeoutError,
+                                    WorkloadError)
 from wa.utils.log import log_error
 from wa.utils.misc import get_traceback, isiterable
 from wa.utils.types import identifier, enum, level
@@ -250,7 +251,7 @@ def check_failures():
 
 class ManagedCallback(object):
     """
-    This wraps instruments' callbacks to ensure that errors do interfer
+    This wraps instruments' callbacks to ensure that errors do not interfer
     with run execution.
 
     """
@@ -270,7 +271,11 @@ class ManagedCallback(object):
                 global failures_detected  # pylint: disable=W0603
                 failures_detected = True
                 log_error(e, logger)
                 disable(self.instrument)
                 context.add_event(e.message)
+                if isinstance(e, WorkloadError):
+                    context.set_status('FAILED')
+                else:
+                    context.set_status('PARTIAL')
 
 
 # Need this to keep track of callbacks, because the dispatcher only keeps
@@ -58,7 +58,7 @@ class Job(object):
         self.logger.info('Initializing job {}'.format(self.id))
         with signal.wrap('WORKLOAD_INITIALIZED', self, context):
             self.workload.initialize(context)
-        self.status = Status.PENDING
+        self.set_status(Status.PENDING)
         context.update_job_state(self)
 
     def configure_target(self, context):
@@ -96,3 +96,8 @@ class Job(object):
         self.logger.info('Finalizing job {}'.format(self.id))
         with signal.wrap('WORKLOAD_FINALIZED', self, context):
             self.workload.finalize(context)
+
+    def set_status(self, status, force=False):
+        status = Status(status)
+        if force or self.status < status:
+            self.status = status