framework: Add bail_on_init_failure run configuration

By default, WA still bails out of the run immediately if any workload fails in initialize(). This commit adds a run configuration setting, bail_on_init_failure (default: True), which can be set to False so that WA instead skips the jobs whose initialization failed and continues with the remaining jobs. This can be useful where WA is being used more as a batch processor.
2025-11-04 00:52:08 +00:00 · 2017-10-23 16:41:28 +01:00
parent 4a001713bb
commit 9fa1b133dc
2 changed files with 43 additions and 3 deletions
--- a/wa/framework/configuration/core.py
+++ b/wa/framework/configuration/core.py
@@ -779,6 +779,19 @@ class RunConfiguration(Configuration):
            .. note:: this number does not include the original attempt
            ''',
        ),
+        ConfigurationPoint(
+            'bail_on_init_failure',
+            kind=bool,
+            default=True,
+            description='''
+            When jobs fail during their main setup and run phases, WA will
+            continue attempting to run the remaining jobs. However, by default,
+            if they fail during their early initialization phase, the entire run
+            will end without continuing to run jobs. Setting this to ``False``
+            means that WA will instead skip all the jobs from the job spec that
+            failed, but continue attempting to run others.
+            '''
+        ),
        ConfigurationPoint(
            'result_processors',
            kind=toggle_set,
--- a/wa/framework/execution.py
+++ b/wa/framework/execution.py
@@ -31,7 +31,7 @@ from wa.framework import instrumentation, pluginloader
 from wa.framework.configuration.core import settings, Status
 from wa.framework.exception import (WAError, ConfigError, TimeoutError,
                                    InstrumentError, TargetError, HostError,
-                                    TargetNotRespondingError)
+                                    TargetNotRespondingError, WorkloadError)
 from wa.framework.job import Job
 from wa.framework.output import init_job_output
 from wa.framework.plugin import Artifact
@@ -229,6 +229,34 @@ class ExecutionContext(object):
    def add_event(self, message):
        self.output.add_event(message)

+    def initialize_jobs(self):
+        """
+        Initialize every job in ``self.job_queue`` and rebuild the queue so
+        that it contains only the jobs whose ``initialize()`` succeeded.
+
+        A job whose ``initialize()`` raises ``WorkloadError`` is marked
+        ``Status.FAILED``, the error message is recorded as an event, and the
+        job is left out of the new queue. Any later job in the queue with the
+        same ``id`` is passed to ``self.skip_job()`` without being
+        initialized.
+
+        If the ``bail_on_init_failure`` run configuration setting is true, the
+        first ``WorkloadError`` is re-raised after being recorded; in that
+        case ``self.job_queue`` is left unmodified, because the reassignment
+        at the end of this method is never reached.
+
+        Only ``WorkloadError`` is handled here; any other exception raised by
+        ``job.initialize()`` propagates to the caller unchanged.
+        """
+        # Jobs that initialized successfully, in their original order.
+        new_queue = []
+        # IDs of jobs whose initialization has already failed.
+        failed_ids = []
+        for job in self.job_queue:
+            if job.id in failed_ids:
+                # Don't try to initialize a job if another job with the same ID
+                # (i.e. same job spec) has failed - we can assume it will fail
+                # too.
+                self.skip_job(job)
+                continue
+
+            try:
+                job.initialize(self)
+            except WorkloadError as e:
+                job.set_status(Status.FAILED)
+                self.add_event(e.message)
+                # Log the error once and flag the exception as logged.
+                # NOTE(review): presumably handlers further up the stack check
+                # the same 'logged' attribute to avoid duplicate log output if
+                # the exception is re-raised below - confirm.
+                if not getattr(e, 'logged', None):
+                    log.log_error(e, self.logger)
+                    e.logged = True
+                failed_ids.append(job.id)
+
+                # Bare raise: propagate the original WorkloadError to the
+                # caller when the run is configured to bail on init failure.
+                if self.cm.run_config.bail_on_init_failure:
+                    raise
+            else:
+                # Only reached when initialize() raised nothing.
+                new_queue.append(job)
+
+        # Failed and skipped jobs are dropped from the queue at this point.
+        self.job_queue = new_queue
+

 class Executor(object):
    """
@@ -378,8 +406,7 @@ class Runner(object):
        self.context.start_run()
        self.pm.initialize()
        log.indent()
-        for job in self.context.job_queue:
-            job.initialize(self.context)
+        self.context.initialize_jobs()
        log.dedent()
        self.context.write_state()