From ccea63555c975a19146e2624351e4477eb1050d8 Mon Sep 17 00:00:00 2001 From: Sergei Trofimov Date: Thu, 18 Jun 2015 16:36:56 +0100 Subject: [PATCH] Added retries Failed jobs will now be automatically retried. This is controlled by two new settings: retry_on_status - a list of statuses which will be considered failures and result in a retry max_retries - number of retries before giving up --- doc/source/configuration.rst | 26 ++++++++++++++++++++++++++ wlauto/config_example.py | 14 ++++++++++++++ wlauto/core/configuration.py | 10 ++++++++++ wlauto/core/execution.py | 19 ++++++++++++++++--- wlauto/tests/test_execution.py | 4 ++++ 5 files changed, 70 insertions(+), 3 deletions(-) diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index 4ceee83c..4ffba6b0 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -97,6 +97,32 @@ Available Settings Added in version 2.1.5. + +.. confval:: retry_on_status + + This is a list of statuses on which a job will be considered to have failed and + will be automatically retried up to ``max_retries`` times. This defaults to + ``["FAILED", "PARTIAL"]`` if not set. Possible values are: + + ``"OK"`` + This iteration has completed and no errors have been detected + + ``"PARTIAL"`` + One or more instruments have failed (the iteration may still be running). + + ``"FAILED"`` + The workload itself has failed. + + ``"ABORTED"`` + The user interrupted the workload + +.. confval:: max_retries + + The maximum number of times failed jobs will be retried before giving up. If + not set, this will default to ``3``. + + .. note:: this number does not include the original attempt + .. confval:: instrumentation This should be a list of instruments to be enabled during run execution. 
diff --git a/wlauto/config_example.py b/wlauto/config_example.py index 2ce276c1..f33980c3 100644 --- a/wlauto/config_example.py +++ b/wlauto/config_example.py @@ -40,6 +40,20 @@ reboot_policy = 'as_needed' # random: Randomisizes the order in which specs run. # execution_order = 'by_iteration' + +# This indicates when a job will be re-run. +# Possible values: +# OK: This iteration has completed and no errors have been detected +# PARTIAL: One or more instruments have failed (the iteration may still be running). +# FAILED: The workload itself has failed. +# ABORTED: The user interrupted the workload +# +# If set to an empty list, a job will not be re-run ever. +retry_on_status = ['FAILED', 'PARTIAL'] + +# How many times a job will be re-run before giving up +max_retries = 3 + #################################################################################################### ######################################### Device Settings ########################################## #################################################################################################### diff --git a/wlauto/core/configuration.py b/wlauto/core/configuration.py index 33e55a85..99a57a47 100644 --- a/wlauto/core/configuration.py +++ b/wlauto/core/configuration.py @@ -308,6 +308,12 @@ def _combine_ids(*args): return '_'.join(args) +class status_list(list): + + def append(self, item): + list.append(self, str(item).upper()) + + class RunConfiguration(object): """ Loads and maintains the unified configuration for this run. This includes configuration @@ -470,6 +476,8 @@ class RunConfiguration(object): RunConfigurationItem('reboot_policy', 'scalar', 'replace'), RunConfigurationItem('device', 'scalar', 'replace'), RunConfigurationItem('flashing_config', 'dict', 'replace'), + RunConfigurationItem('retry_on_status', 'list', 'replace'), + RunConfigurationItem('max_retries', 'scalar', 'replace'), ] # Configuration specified for each workload spec. 
"workload_parameters" @@ -523,6 +531,8 @@ class RunConfiguration(object): self.workload_specs = [] self.flashing_config = {} self.other_config = {} # keeps track of used config for extensions other than of the four main kinds. + self.retry_on_status = status_list(['FAILED', 'PARTIAL']) + self.max_retries = 3 self._used_config_items = [] self._global_instrumentation = [] self._reboot_policy = None diff --git a/wlauto/core/execution.py b/wlauto/core/execution.py index a3e0a56f..6fc9ee97 100644 --- a/wlauto/core/execution.py +++ b/wlauto/core/execution.py @@ -401,8 +401,9 @@ class RunnerJob(object): """ - def __init__(self, spec): + def __init__(self, spec, retry=0): self.spec = spec + self.retry = retry self.iteration = None self.result = IterationResult(self.spec) @@ -423,6 +424,10 @@ class Runner(object): """Internal runner error.""" pass + @property + def config(self): + return self.context.config + @property def current_job(self): if self.job_queue: @@ -623,8 +628,16 @@ class Runner(object): def _finalize_job(self): self.context.run_result.iteration_results.append(self.current_job.result) - self.job_queue[0].iteration = self.context.current_iteration - self.completed_jobs.append(self.job_queue.pop(0)) + job = self.job_queue.pop(0) + job.iteration = self.context.current_iteration + if job.result.status in self.config.retry_on_status: + if job.retry >= self.config.max_retries: + self.logger.error('Exceeded maxium number of retries. Abandoning job.') + else: + self.logger.info('Job status was {}. 
Retrying...'.format(job.result.status)) + retry_job = RunnerJob(job.spec, job.retry + 1) + self.job_queue.insert(0, retry_job) + self.completed_jobs.append(job) self.context.end_job() def _finalize_run(self): diff --git a/wlauto/tests/test_execution.py b/wlauto/tests/test_execution.py index 92259352..9482ffaa 100644 --- a/wlauto/tests/test_execution.py +++ b/wlauto/tests/test_execution.py @@ -60,6 +60,9 @@ class Mock(object): def __call__(self, *args, **kwargs): pass + def __iter__(self): + return iter([]) + class BadDeviceMeta(DeviceMeta): @@ -161,6 +164,7 @@ class RunnerTest(TestCase): context = Mock() context.reboot_policy = RebootPolicy(reboot_policy) context.config.workload_specs = workloads + context.config.retry_on_status = [] instrument = _instantiate(SignalCatcher) instrumentation.install(instrument)