From ccea63555c975a19146e2624351e4477eb1050d8 Mon Sep 17 00:00:00 2001
From: Sergei Trofimov <sergei.trofimov@arm.com>
Date: Thu, 18 Jun 2015 16:36:56 +0100
Subject: [PATCH] Added retries

Failed jobs will now be automatically retried. This is controlled by two
new settings:

retry_on_status - a list of statuses which will be considered failures and
                  result in a retry
max_retries - number of retries before giving up
---
 doc/source/configuration.rst   | 26 ++++++++++++++++++++++++++
 wlauto/config_example.py       | 14 ++++++++++++++
 wlauto/core/configuration.py   | 10 ++++++++++
 wlauto/core/execution.py       | 19 ++++++++++++++++---
 wlauto/tests/test_execution.py |  4 ++++
 5 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst
index 4ceee83c..4ffba6b0 100644
--- a/doc/source/configuration.rst
+++ b/doc/source/configuration.rst
@@ -97,6 +97,32 @@ Available Settings
 
    Added in version 2.1.5.
 
+
+.. confval:: retry_on_status
+
+   This is a list of statuses on which a job will be considered to have failed and
+   will be automatically retried up to ``max_retries`` times. This defaults to
+   ``["FAILED", "PARTIAL"]`` if not set. Possible values are:
+
+   ``"OK"``
+   This iteration has completed and no errors have been detected
+
+   ``"PARTIAL"`` 
+   One or more instruments have failed (the iteration may still be running).
+
+   ``"FAILED"`` 
+   The workload itself has failed.
+
+   ``"ABORTED"`` 
+   The user interrupted the workload
+
+.. confval:: max_retries
+
+   The maximum number of times failed jobs will be retried before giving up. If
+   not set, this will default to ``3``. 
+
+   .. note:: this number does not include the original attempt
+
 .. confval:: instrumentation
 
    This should be a list of instruments to be enabled during run execution.
diff --git a/wlauto/config_example.py b/wlauto/config_example.py
index 2ce276c1..f33980c3 100644
--- a/wlauto/config_example.py
+++ b/wlauto/config_example.py
@@ -40,6 +40,20 @@ reboot_policy = 'as_needed'
 #   random:       Randomisizes the order in which specs run.                                       #
 execution_order = 'by_iteration'
 
+
+# This indicates when a job will be re-run.
+# Possible values:
+#     OK: This iteration has completed and no errors have been detected
+#     PARTIAL: One or more instruments have failed (the iteration may still be running).
+#     FAILED: The workload itself has failed.
+#     ABORTED: The user interrupted the workload
+#
+# If set to an empty list, a job will not be re-run ever.
+retry_on_status = ['FAILED', 'PARTIAL']
+
+# How many times a job will be re-run before giving up (the original attempt is not counted)
+max_retries = 3
+
 ####################################################################################################
 ######################################### Device Settings ##########################################
 ####################################################################################################
diff --git a/wlauto/core/configuration.py b/wlauto/core/configuration.py
index 33e55a85..99a57a47 100644
--- a/wlauto/core/configuration.py
+++ b/wlauto/core/configuration.py
@@ -308,6 +308,12 @@ def _combine_ids(*args):
     return '_'.join(args)
 
 
+class status_list(list):
+    """List whose append() stores str(item).upper(); no other method is overridden."""
+    def append(self, item):
+        list.append(self, str(item).upper())
+
+
 class RunConfiguration(object):
     """
     Loads and maintains the unified configuration for this run. This includes configuration
@@ -470,6 +476,8 @@ class RunConfiguration(object):
         RunConfigurationItem('reboot_policy', 'scalar', 'replace'),
         RunConfigurationItem('device', 'scalar', 'replace'),
         RunConfigurationItem('flashing_config', 'dict', 'replace'),
+        RunConfigurationItem('retry_on_status', 'list', 'replace'),
+        RunConfigurationItem('max_retries', 'scalar', 'replace'),
     ]
 
     # Configuration specified for each workload spec. "workload_parameters"
@@ -523,6 +531,8 @@ class RunConfiguration(object):
         self.workload_specs = []
         self.flashing_config = {}
         self.other_config = {}  # keeps track of used config for extensions other than of the four main kinds.
+        self.retry_on_status = status_list(['FAILED', 'PARTIAL'])
+        self.max_retries = 3
         self._used_config_items = []
         self._global_instrumentation = []
         self._reboot_policy = None
diff --git a/wlauto/core/execution.py b/wlauto/core/execution.py
index a3e0a56f..6fc9ee97 100644
--- a/wlauto/core/execution.py
+++ b/wlauto/core/execution.py
@@ -401,8 +401,9 @@ class RunnerJob(object):
 
     """
 
-    def __init__(self, spec):
+    def __init__(self, spec, retry=0):
         self.spec = spec
+        self.retry = retry
         self.iteration = None
         self.result = IterationResult(self.spec)
 
@@ -423,6 +424,10 @@ class Runner(object):
         """Internal runner error."""
         pass
 
+    @property
+    def config(self):
+        return self.context.config  # convenience alias for the context's config object
+
     @property
     def current_job(self):
         if self.job_queue:
@@ -623,8 +628,16 @@ class Runner(object):
 
     def _finalize_job(self):
         self.context.run_result.iteration_results.append(self.current_job.result)
-        self.job_queue[0].iteration = self.context.current_iteration
-        self.completed_jobs.append(self.job_queue.pop(0))
+        job = self.job_queue.pop(0)  # the job that has just finished
+        job.iteration = self.context.current_iteration
+        if job.result.status in self.config.retry_on_status:
+            if job.retry >= self.config.max_retries:  # job.retry is 0 for the original attempt
+                self.logger.error('Exceeded maximum number of retries. Abandoning job.')
+            else:
+                self.logger.info('Job status was {}. Retrying...'.format(job.result.status))
+                retry_job = RunnerJob(job.spec, job.retry + 1)
+                self.job_queue.insert(0, retry_job)  # front of the queue, so the retry runs next
+        self.completed_jobs.append(job)  # every attempt is recorded, including ones that get retried
         self.context.end_job()
 
     def _finalize_run(self):
diff --git a/wlauto/tests/test_execution.py b/wlauto/tests/test_execution.py
index 92259352..9482ffaa 100644
--- a/wlauto/tests/test_execution.py
+++ b/wlauto/tests/test_execution.py
@@ -60,6 +60,9 @@ class Mock(object):
     def __call__(self, *args, **kwargs):
         pass
 
+    def __iter__(self):
+        return iter([])  # iterating a Mock always yields nothing
+
 
 class BadDeviceMeta(DeviceMeta):
 
@@ -161,6 +164,7 @@ class RunnerTest(TestCase):
         context = Mock()
         context.reboot_policy = RebootPolicy(reboot_policy)
         context.config.workload_specs = workloads
+        context.config.retry_on_status = []
 
         instrument = _instantiate(SignalCatcher)
         instrumentation.install(instrument)