1
0
mirror of https://github.com/ARM-software/workload-automation.git synced 2025-01-18 12:06:08 +00:00

core,execution: Add run skipping on job failure

Add a global configuration parameter ``bail_on_job_failure`` that, when
enabled, causes all remaining jobs in a run to be skipped if a job fails
both its initial execution and all of its retries. It is disabled by default.
This commit is contained in:
Jonathan Paynter 2020-08-13 16:44:56 +01:00 committed by Marc Bonnici
parent 66e220d444
commit 971289698b
2 changed files with 15 additions and 1 deletions

View File

@ -725,6 +725,17 @@ class RunConfiguration(Configuration):
failed, but continue attempting to run others. failed, but continue attempting to run others.
''' '''
), ),
ConfigurationPoint(
'bail_on_job_failure',
kind=bool,
default=False,
description='''
When a job fails during its run phase, WA will attempt to retry the
job, then continue with remaining jobs after. Setting this to
``True`` means WA will skip remaining jobs and end the run if a job
has retried the maximum number of times, and still fails.
'''
),
ConfigurationPoint( ConfigurationPoint(
'allow_phone_home', 'allow_phone_home',
kind=bool, default=True, kind=bool, default=True,

View File

@ -25,7 +25,7 @@ from datetime import datetime
import wa.framework.signal as signal import wa.framework.signal as signal
from wa.framework import instrument as instrumentation from wa.framework import instrument as instrumentation
from wa.framework.configuration.core import Status from wa.framework.configuration.core import Status
from wa.framework.exception import TargetError, HostError, WorkloadError from wa.framework.exception import TargetError, HostError, WorkloadError, ExecutionError
from wa.framework.exception import TargetNotRespondingError, TimeoutError # pylint: disable=redefined-builtin from wa.framework.exception import TargetNotRespondingError, TimeoutError # pylint: disable=redefined-builtin
from wa.framework.job import Job from wa.framework.job import Job
from wa.framework.output import init_job_output from wa.framework.output import init_job_output
@ -657,6 +657,9 @@ class Runner(object):
self.logger.error(msg.format(job.id, job.iteration, job.status)) self.logger.error(msg.format(job.id, job.iteration, job.status))
self.context.failed_jobs += 1 self.context.failed_jobs += 1
self.send(signal.JOB_FAILED) self.send(signal.JOB_FAILED)
if rc.bail_on_job_failure:
raise ExecutionError('Job {} failed, bailing.'.format(job.id))
else: # status not in retry_on_status else: # status not in retry_on_status
self.logger.info('Job completed with status {}'.format(job.status)) self.logger.info('Job completed with status {}'.format(job.status))
if job.status != 'ABORTED': if job.status != 'ABORTED':