1
0
mirror of https://github.com/ARM-software/workload-automation.git synced 2025-09-01 10:52:33 +01:00

fw/execution: Handle unresponsive targets

If a target error occurs, check whether the target has become unresponsive.
If it has, attempt a hard reset where the target supports one; otherwise,
terminate execution gracefully.
This commit is contained in:
Sergei Trofimov
2018-02-28 10:24:56 +00:00
committed by Marc Bonnici
parent fdb872d9cd
commit 6fe31d6cad
4 changed files with 40 additions and 15 deletions

View File

@@ -22,7 +22,8 @@ from datetime import datetime
import wa.framework.signal as signal import wa.framework.signal as signal
from wa.framework import instrument from wa.framework import instrument
from wa.framework.configuration.core import Status from wa.framework.configuration.core import Status
from wa.framework.exception import HostError, WorkloadError from wa.framework.exception import TargetError, HostError, WorkloadError,\
TargetNotRespondingError, TimeoutError
from wa.framework.job import Job from wa.framework.job import Job
from wa.framework.output import init_job_output from wa.framework.output import init_job_output
from wa.framework.output_processor import ProcessorManager from wa.framework.output_processor import ProcessorManager
@@ -375,17 +376,18 @@ class Runner(object):
self.send(signal.RUN_INITIALIZED) self.send(signal.RUN_INITIALIZED)
while self.context.job_queue: while self.context.job_queue:
try: with signal.wrap('JOB_EXECUTION', self, self.context):
with signal.wrap('JOB_EXECUTION', self, self.context): self.run_next_job(self.context)
self.run_next_job(self.context)
except KeyboardInterrupt: except KeyboardInterrupt as e:
self.context.skip_remaining_jobs() log.log_error(e, self.logger)
self.logger.info('Skipping remaining jobs.')
self.context.skip_remaining_jobs()
except Exception as e: except Exception as e:
self.context.add_event(e.message) message = e.message if e.message else str(e)
if (not getattr(e, 'logged', None) and log.log_error(e, self.logger)
not isinstance(e, KeyboardInterrupt)): self.logger.error('Skipping remaining jobs due to "{}".'.format(e))
log.log_error(e, self.logger) self.context.skip_remaining_jobs()
e.logged = True
raise e raise e
finally: finally:
self.finalize_run() self.finalize_run()
@@ -429,6 +431,10 @@ class Runner(object):
if not getattr(e, 'logged', None): if not getattr(e, 'logged', None):
log.log_error(e, self.logger) log.log_error(e, self.logger)
e.logged = True e.logged = True
if isinstance(e, ExecutionError):
raise e
elif isinstance(e, TargetError):
context.tm.verify_target_responsive()
finally: finally:
self.logger.info('Completing job {}'.format(job.id)) self.logger.info('Completing job {}'.format(job.id))
self.send(signal.JOB_COMPLETED) self.send(signal.JOB_COMPLETED)
@@ -467,6 +473,8 @@ class Runner(object):
if not getattr(e, 'logged', None): if not getattr(e, 'logged', None):
log.log_error(e, self.logger) log.log_error(e, self.logger)
e.logged = True e.logged = True
if isinstance(e, TargetError) or isinstance(e, TimeoutError):
context.tm.verify_target_responsive()
raise e raise e
finally: finally:
try: try:
@@ -474,8 +482,10 @@ class Runner(object):
job.process_output(context) job.process_output(context)
self.pm.process_job_output(context) self.pm.process_job_output(context)
self.pm.export_job_output(context) self.pm.export_job_output(context)
except Exception: except Exception as e:
job.set_status(Status.PARTIAL) job.set_status(Status.PARTIAL)
if isinstance(e, TargetError) or isinstance(e, TimeoutError):
context.tm.verify_target_responsive()
raise raise
except KeyboardInterrupt: except KeyboardInterrupt:

View File

@@ -105,7 +105,7 @@ from collections import OrderedDict
from wa.framework import signal from wa.framework import signal
from wa.framework.plugin import Plugin from wa.framework.plugin import Plugin
from wa.framework.exception import (WAError, TargetNotRespondingError, TimeoutError, from wa.framework.exception import (WAError, TargetNotRespondingError, TimeoutError,
WorkloadError) WorkloadError, TargetError)
from wa.utils.log import log_error from wa.utils.log import log_error
from wa.utils.misc import isiterable from wa.utils.misc import isiterable
from wa.utils.types import identifier, enum, level from wa.utils.types import identifier, enum, level
@@ -263,6 +263,9 @@ class ManagedCallback(object):
def __call__(self, context): def __call__(self, context):
if self.instrument.is_enabled: if self.instrument.is_enabled:
try: try:
if not context.tm.is_responsive:
logger.debug("Target unreponsive; skipping callback {}".format(self.callback))
return
self.callback(context) self.callback(context)
except (KeyboardInterrupt, TargetNotRespondingError, TimeoutError): # pylint: disable=W0703 except (KeyboardInterrupt, TargetNotRespondingError, TimeoutError): # pylint: disable=W0703
raise raise
@@ -274,6 +277,8 @@ class ManagedCallback(object):
context.add_event(e.message) context.add_event(e.message)
if isinstance(e, WorkloadError): if isinstance(e, WorkloadError):
context.set_status('FAILED') context.set_status('FAILED')
elif isinstance(e, TargetError) or isinstance(e, TimeoutError):
context.tm.verify_target_responsive()
else: else:
if context.current_job: if context.current_job:
context.set_status('PARTIAL') context.set_status('PARTIAL')

View File

@@ -115,6 +115,9 @@ class Job(object):
self.run_time = datetime.utcnow() - start_time self.run_time = datetime.utcnow() - start_time
def process_output(self, context): def process_output(self, context):
if not context.tm.is_responsive:
self.logger.info('Target unresponsive; not processing job output.')
return
self.logger.info('Processing output for job {} [{}]'.format(self.id, self.iteration)) self.logger.info('Processing output for job {} [{}]'.format(self.id, self.iteration))
if self.status != Status.FAILED: if self.status != Status.FAILED:
with signal.wrap('WORKLOAD_RESULT_EXTRACTION', self, context): with signal.wrap('WORKLOAD_RESULT_EXTRACTION', self, context):
@@ -124,11 +127,17 @@ class Job(object):
self.workload.update_output(context) self.workload.update_output(context)
def teardown(self, context): def teardown(self, context):
if not context.tm.is_responsive:
self.logger.info('Target unresponsive; not tearing down.')
return
self.logger.info('Tearing down job {} [{}]'.format(self.id, self.iteration)) self.logger.info('Tearing down job {} [{}]'.format(self.id, self.iteration))
with signal.wrap('WORKLOAD_TEARDOWN', self, context): with signal.wrap('WORKLOAD_TEARDOWN', self, context):
self.workload.teardown(context) self.workload.teardown(context)
def finalize(self, context): def finalize(self, context):
if not context.tm.is_responsive:
self.logger.info('Target unresponsive; not finalizing.')
return
self.logger.info('Finalizing job {} [{}]'.format(self.id, self.iteration)) self.logger.info('Finalizing job {} [{}]'.format(self.id, self.iteration))
with signal.wrap('WORKLOAD_FINALIZED', self, context): with signal.wrap('WORKLOAD_FINALIZED', self, context):
self.workload.finalize(context) self.workload.finalize(context)

View File

@@ -1,7 +1,7 @@
import logging import logging
from wa.framework import signal from wa.framework import signal
from wa.framework.exception import ExecutionError, TargetError from wa.framework.exception import ExecutionError, TargetError, TargetNotRespondingError
from wa.framework.plugin import Parameter from wa.framework.plugin import Parameter
from wa.framework.target.descriptor import (get_target_description, from wa.framework.target.descriptor import (get_target_description,
instantiate_target, instantiate_target,
@@ -90,8 +90,9 @@ class TargetManager(object):
self.logger.info('Target unresponsive; performing hard reset') self.logger.info('Target unresponsive; performing hard reset')
self.target.reboot(hard=True) self.target.reboot(hard=True)
self.is_responsive = True self.is_responsive = True
raise ExecutionError('Target became unresponsive but was recovered.')
else: else:
raise ExecutionError('Target unresponsive and hard reset not supported; bailing.') raise TargetNotRespondingError('Target unresponsive and hard reset not supported; bailing.')
def _init_target(self): def _init_target(self):
tdesc = get_target_description(self.target_name) tdesc = get_target_description(self.target_name)