
Added retries

Failed jobs will now be automatically retried. This is controlled by two
new settings:

retry_on_status - a list of statuses which will be considered failures and
                  result in a retry
max_retries - number of retries before giving up
Sergei Trofimov 2015-06-18 16:36:56 +01:00
parent 51c5ef1520
commit ccea63555c
5 changed files with 70 additions and 3 deletions

View File

@@ -97,6 +97,32 @@ Available Settings

   Added in version 2.1.5.

.. confval:: retry_on_status

   This is a list of statuses on which a job will be considered to have failed and
   will be automatically retried up to ``max_retries`` times. This defaults to
   ``["FAILED", "PARTIAL"]`` if not set. Possible values are:

   ``"OK"``
      This iteration has completed and no errors have been detected.

   ``"PARTIAL"``
      One or more instruments have failed (the iteration may still be running).

   ``"FAILED"``
      The workload itself has failed.

   ``"ABORTED"``
      The user interrupted the workload.

.. confval:: max_retries

   The maximum number of times failed jobs will be retried before giving up. If
   not set, this will default to ``3``.

   .. note:: This number does not include the original attempt.
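For example, both settings could be overridden together in a user's ``config.py``; this is only an illustrative sketch (the setting names come from this commit, the values are made up)::

   # Retry only outright workload failures, allowing up to five retries
   # after the original attempt.
   retry_on_status = ['FAILED']
   max_retries = 5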
.. confval:: instrumentation

   This should be a list of instruments to be enabled during run execution.

View File

@@ -40,6 +40,20 @@ reboot_policy = 'as_needed'
# random: Randomises the order in which specs run. #
execution_order = 'by_iteration'

# This indicates when a job will be re-run.
# Possible values:
#   OK: This iteration has completed and no errors have been detected.
#   PARTIAL: One or more instruments have failed (the iteration may still be running).
#   FAILED: The workload itself has failed.
#   ABORTED: The user interrupted the workload.
#
# If set to an empty list, a job will not be re-run ever.
retry_on_status = ['FAILED', 'PARTIAL']

# How many times a job will be re-run before giving up
max_retries = 3
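As the comment above notes, an empty list disables re-running altogether, so a config that should never retry could simply contain (illustrative only):

# Never re-run a job, whatever its status.
retry_on_status = []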
####################################################################################################
######################################### Device Settings ##########################################
####################################################################################################

View File

@@ -308,6 +308,12 @@ def _combine_ids(*args):
    return '_'.join(args)


class status_list(list):

    def append(self, item):
        list.append(self, str(item).upper())


class RunConfiguration(object):
    """
    Loads and maintains the unified configuration for this run. This includes configuration
@@ -470,6 +476,8 @@ class RunConfiguration(object):
        RunConfigurationItem('reboot_policy', 'scalar', 'replace'),
        RunConfigurationItem('device', 'scalar', 'replace'),
        RunConfigurationItem('flashing_config', 'dict', 'replace'),
        RunConfigurationItem('retry_on_status', 'list', 'replace'),
        RunConfigurationItem('max_retries', 'scalar', 'replace'),
    ]

    # Configuration specified for each workload spec. "workload_parameters"
@@ -523,6 +531,8 @@ class RunConfiguration(object):
        self.workload_specs = []
        self.flashing_config = {}
        self.other_config = {}  # keeps track of used config for extensions other than of the four main kinds.
        self.retry_on_status = status_list(['FAILED', 'PARTIAL'])
        self.max_retries = 3
        self._used_config_items = []
        self._global_instrumentation = []
        self._reboot_policy = None
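The status_list helper only overrides append(), so anything added through it is normalised to upper case; the constructor itself does not touch the initial items, which is why the default above is already written in upper case. A small self-contained sketch of that behaviour, copied from the class in this diff with a hypothetical usage:

class status_list(list):

    def append(self, item):
        list.append(self, str(item).upper())

statuses = status_list()
statuses.append('partial')            # stored as 'PARTIAL'
assert 'PARTIAL' in statuses
assert 'partial' not in statuses      # the original casing is not preserved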

View File

@@ -401,8 +401,9 @@ class RunnerJob(object):
    """

    def __init__(self, spec):
    def __init__(self, spec, retry=0):
        self.spec = spec
        self.retry = retry
        self.iteration = None
        self.result = IterationResult(self.spec)

@@ -423,6 +424,10 @@ class Runner(object):
        """Internal runner error."""
        pass

    @property
    def config(self):
        return self.context.config

    @property
    def current_job(self):
        if self.job_queue:

@@ -623,8 +628,16 @@ class Runner(object):
    def _finalize_job(self):
        self.context.run_result.iteration_results.append(self.current_job.result)
        self.job_queue[0].iteration = self.context.current_iteration
        self.completed_jobs.append(self.job_queue.pop(0))
        job = self.job_queue.pop(0)
        job.iteration = self.context.current_iteration
        if job.result.status in self.config.retry_on_status:
            if job.retry >= self.config.max_retries:
                self.logger.error('Exceeded maximum number of retries. Abandoning job.')
            else:
                self.logger.info('Job status was {}. Retrying...'.format(job.result.status))
                retry_job = RunnerJob(job.spec, job.retry + 1)
                self.job_queue.insert(0, retry_job)
        self.completed_jobs.append(job)
        self.context.end_job()

    def _finalize_run(self):
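The retry decision in _finalize_job boils down to: pop the job that just finished, and if its status is in retry_on_status and its retry count is still below max_retries, push a fresh RunnerJob for the same spec back onto the front of the queue; either way the finished job goes into completed_jobs. The sketch below mirrors that flow in isolation; the Job and Config classes and the 'dhrystone' spec name are stand-ins for illustration, not WA types:

from collections import namedtuple

Config = namedtuple('Config', 'retry_on_status max_retries')

class Job(object):
    def __init__(self, spec, retry=0):
        self.spec = spec
        self.retry = retry
        self.status = None

def finalize(job_queue, completed, config):
    # Pop the job that just finished and decide whether to requeue it.
    job = job_queue.pop(0)
    if job.status in config.retry_on_status:
        if job.retry >= config.max_retries:
            print('Exceeded maximum number of retries. Abandoning job.')
        else:
            print('Job status was {}. Retrying...'.format(job.status))
            job_queue.insert(0, Job(job.spec, job.retry + 1))
    completed.append(job)

config = Config(retry_on_status=['FAILED', 'PARTIAL'], max_retries=1)
queue, done = [Job('dhrystone')], []
queue[0].status = 'FAILED'
finalize(queue, done, config)   # retryable status and retry < max_retries: a retry is queued
queue[0].status = 'FAILED'
finalize(queue, done, config)   # retry == max_retries: the job is abandoned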

View File

@@ -60,6 +60,9 @@ class Mock(object):
    def __call__(self, *args, **kwargs):
        pass

    def __iter__(self):
        return iter([])


class BadDeviceMeta(DeviceMeta):
@@ -161,6 +164,7 @@ class RunnerTest(TestCase):
        context = Mock()
        context.reboot_policy = RebootPolicy(reboot_policy)
        context.config.workload_specs = workloads
        context.config.retry_on_status = []

        instrument = _instantiate(SignalCatcher)
        instrumentation.install(instrument)