1
0
mirror of https://github.com/ARM-software/workload-automation.git synced 2025-01-31 10:11:17 +00:00

framework/run: fix job status handling.

- Re-order Status entries so that higher severity entries have higher
  enum values.
- Add set_status() to Job that ensures that a status is only set if it
  is of higher severity (e.g. a Job that has been marked as PARTIAL by
  an instrument will not be overwritten as OK by the runner).
- Retry now generates a new job, rather than re-enqueuing the existing
  object; this ensures that the output status is tracked properly.
- Adjust ManagedCallback to set the job status to FAILED if it sees a
  WorkloadError, and to PARTIAL otherwise. The idea being that
  instruments raise WorkloadError if they have a reason to believe the
  workload did not execute properly and indicate failure even if the
  workload itself has failed to detect it (e.g. FPS instrument detecting
  crashed content, where the workload might lack any feedback regarding
  the crash). Other errors would indicate an issue with the instrument
  itself, and so the job is marked as PARTIAL, as there is no reason to
  suspect that the workload is at fault and the other results generated
  for this execution may be valid.
This commit is contained in:
Sergei Trofimov 2017-09-18 16:12:03 +01:00
parent fe53efcd49
commit 7c7ffe3e77
4 changed files with 34 additions and 18 deletions

View File

@ -34,7 +34,7 @@ KIND_MAP = {
Status = enum(['UNKNOWN', 'NEW', 'PENDING',
'STARTED', 'CONNECTED', 'INITIALIZED', 'RUNNING',
'SKIPPED', 'ABORTED', 'FAILED', 'PARTIAL', 'OK'])
'OK', 'PARTIAL', 'FAILED', 'ABORTED', 'SKIPPED'])

View File

@ -32,6 +32,7 @@ from wa.framework.configuration.core import settings, Status
from wa.framework.exception import (WAError, ConfigError, TimeoutError,
InstrumentError, TargetError, HostError,
TargetNotRespondingError)
from wa.framework.job import Job
from wa.framework.output import init_job_output
from wa.framework.plugin import Artifact
from wa.framework.processor import ProcessorManager
@ -155,10 +156,10 @@ class ExecutionContext(object):
self.output.write_result()
self.current_job = None
def set_status(self, status):
def set_status(self, status, force=False):
if not self.current_job:
raise RuntimeError('No jobs in progress')
self.current_job.status = Status(status)
self.current_job.set_status(status, force)
def extract_results(self):
self.tm.extract_results(self)
@ -391,12 +392,12 @@ class Runner(object):
try:
log.indent()
self.do_run_job(job, context)
job.status = Status.OK
job.set_status(Status.OK)
except KeyboardInterrupt:
job.status = Status.ABORTED
job.set_status(Status.ABORTED)
raise
except Exception as e:
job.status = Status.FAILED
job.set_status(Status.FAILED)
context.add_event(e.message)
if not getattr(e, 'logged', None):
log.log_error(e, self.logger)
@ -410,7 +411,7 @@ class Runner(object):
self.check_job(job)
def do_run_job(self, job, context):
job.status = Status.RUNNING
job.set_status(Status.RUNNING)
self.send(signal.JOB_STARTED)
with signal.wrap('JOB_TARGET_CONFIG', self):
@ -429,15 +430,15 @@ class Runner(object):
self.pm.process_job_output(context)
self.pm.export_job_output(context)
except Exception:
job.status = Status.PARTIAL
job.set_status(Status.PARTIAL)
raise
except KeyboardInterrupt:
job.status = Status.ABORTED
job.set_status(Status.ABORTED)
self.logger.info('Got CTRL-C. Aborting.')
raise
except Exception as e:
job.status = Status.FAILED
job.set_status(Status.FAILED)
if not getattr(e, 'logged', None):
log.log_error(e, self.logger)
e.logged = True
@ -454,19 +455,24 @@ class Runner(object):
if job.retries < rc.max_retries:
msg = 'Job {} iteration {} completed with status {}. retrying...'
self.logger.error(msg.format(job.id, job.status, job.iteration))
self.retry_job(job)
self.context.move_failed(job)
job.retries += 1
job.status = Status.PENDING
self.context.job_queue.insert(0, job)
self.context.write_state()
else:
msg = 'Job {} iteration {} completed with status {}. '\
'Max retries exceeded.'
self.logger.error(msg.format(job.id, job.status, job.iteration))
self.logger.error(msg.format(job.id, job.iteration, job.status))
self.context.failed_jobs += 1
else: # status not in retry_on_status
self.logger.info('Job completed with status {}'.format(job.status))
self.context.successful_jobs += 1
def retry_job(self, job):
retry_job = Job(job.spec, job.iteration, self.context)
retry_job.workload = job.workload
retry_job.retries = job.retries + 1
retry_job.set_status(Status.PENDING)
self.context.job_queue.insert(0, retry_job)
def send(self, s):
signal.send(s, self, self.context)

View File

@ -104,7 +104,8 @@ from collections import OrderedDict
from wa.framework import signal
from wa.framework.plugin import Plugin
from wa.framework.exception import WAError, TargetNotRespondingError, TimeoutError
from wa.framework.exception import (WAError, TargetNotRespondingError, TimeoutError,
WorkloadError)
from wa.utils.log import log_error
from wa.utils.misc import get_traceback, isiterable
from wa.utils.types import identifier, enum, level
@ -250,7 +251,7 @@ def check_failures():
class ManagedCallback(object):
"""
This wraps instruments' callbacks to ensure that errors do interfer
This wraps instruments' callbacks to ensure that errors do not interfer
with run execution.
"""
@ -270,7 +271,11 @@ class ManagedCallback(object):
global failures_detected # pylint: disable=W0603
failures_detected = True
log_error(e, logger)
disable(self.instrument)
context.add_event(e.message)
if isinstance(e, WorkloadError):
context.set_status('FAILED')
else:
context.set_status('PARTIAL')
# Need this to keep track of callbacks, because the dispatcher only keeps

View File

@ -58,7 +58,7 @@ class Job(object):
self.logger.info('Initializing job {}'.format(self.id))
with signal.wrap('WORKLOAD_INITIALIZED', self, context):
self.workload.initialize(context)
self.status = Status.PENDING
self.set_status(Status.PENDING)
context.update_job_state(self)
def configure_target(self, context):
@ -96,3 +96,8 @@ class Job(object):
self.logger.info('Finalizing job {}'.format(self.id))
with signal.wrap('WORKLOAD_FINALIZED', self, context):
self.workload.finalize(context)
def set_status(self, status, force=False):
status = Status(status)
if force or self.status < status:
self.status = status