mirror of
https://github.com/ARM-software/workload-automation.git
synced 2025-09-01 10:52:33 +01:00
fw/execution: Handle unresponsive targets
If a target error occurs, check whether the target has become unresponsive. If it has, attempt a hard reset when the target supports one; otherwise, terminate execution gracefully.
This commit is contained in:
committed by
Marc Bonnici
parent
fdb872d9cd
commit
6fe31d6cad
@@ -22,7 +22,8 @@ from datetime import datetime
|
|||||||
import wa.framework.signal as signal
|
import wa.framework.signal as signal
|
||||||
from wa.framework import instrument
|
from wa.framework import instrument
|
||||||
from wa.framework.configuration.core import Status
|
from wa.framework.configuration.core import Status
|
||||||
from wa.framework.exception import HostError, WorkloadError
|
from wa.framework.exception import TargetError, HostError, WorkloadError,\
|
||||||
|
TargetNotRespondingError, TimeoutError
|
||||||
from wa.framework.job import Job
|
from wa.framework.job import Job
|
||||||
from wa.framework.output import init_job_output
|
from wa.framework.output import init_job_output
|
||||||
from wa.framework.output_processor import ProcessorManager
|
from wa.framework.output_processor import ProcessorManager
|
||||||
@@ -375,17 +376,18 @@ class Runner(object):
|
|||||||
self.send(signal.RUN_INITIALIZED)
|
self.send(signal.RUN_INITIALIZED)
|
||||||
|
|
||||||
while self.context.job_queue:
|
while self.context.job_queue:
|
||||||
try:
|
with signal.wrap('JOB_EXECUTION', self, self.context):
|
||||||
with signal.wrap('JOB_EXECUTION', self, self.context):
|
self.run_next_job(self.context)
|
||||||
self.run_next_job(self.context)
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt as e:
|
||||||
self.context.skip_remaining_jobs()
|
log.log_error(e, self.logger)
|
||||||
|
self.logger.info('Skipping remaining jobs.')
|
||||||
|
self.context.skip_remaining_jobs()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.context.add_event(e.message)
|
message = e.message if e.message else str(e)
|
||||||
if (not getattr(e, 'logged', None) and
|
log.log_error(e, self.logger)
|
||||||
not isinstance(e, KeyboardInterrupt)):
|
self.logger.error('Skipping remaining jobs due to "{}".'.format(e))
|
||||||
log.log_error(e, self.logger)
|
self.context.skip_remaining_jobs()
|
||||||
e.logged = True
|
|
||||||
raise e
|
raise e
|
||||||
finally:
|
finally:
|
||||||
self.finalize_run()
|
self.finalize_run()
|
||||||
@@ -429,6 +431,10 @@ class Runner(object):
|
|||||||
if not getattr(e, 'logged', None):
|
if not getattr(e, 'logged', None):
|
||||||
log.log_error(e, self.logger)
|
log.log_error(e, self.logger)
|
||||||
e.logged = True
|
e.logged = True
|
||||||
|
if isinstance(e, ExecutionError):
|
||||||
|
raise e
|
||||||
|
elif isinstance(e, TargetError):
|
||||||
|
context.tm.verify_target_responsive()
|
||||||
finally:
|
finally:
|
||||||
self.logger.info('Completing job {}'.format(job.id))
|
self.logger.info('Completing job {}'.format(job.id))
|
||||||
self.send(signal.JOB_COMPLETED)
|
self.send(signal.JOB_COMPLETED)
|
||||||
@@ -467,6 +473,8 @@ class Runner(object):
|
|||||||
if not getattr(e, 'logged', None):
|
if not getattr(e, 'logged', None):
|
||||||
log.log_error(e, self.logger)
|
log.log_error(e, self.logger)
|
||||||
e.logged = True
|
e.logged = True
|
||||||
|
if isinstance(e, TargetError) or isinstance(e, TimeoutError):
|
||||||
|
context.tm.verify_target_responsive()
|
||||||
raise e
|
raise e
|
||||||
finally:
|
finally:
|
||||||
try:
|
try:
|
||||||
@@ -474,8 +482,10 @@ class Runner(object):
|
|||||||
job.process_output(context)
|
job.process_output(context)
|
||||||
self.pm.process_job_output(context)
|
self.pm.process_job_output(context)
|
||||||
self.pm.export_job_output(context)
|
self.pm.export_job_output(context)
|
||||||
except Exception:
|
except Exception as e:
|
||||||
job.set_status(Status.PARTIAL)
|
job.set_status(Status.PARTIAL)
|
||||||
|
if isinstance(e, TargetError) or isinstance(e, TimeoutError):
|
||||||
|
context.tm.verify_target_responsive()
|
||||||
raise
|
raise
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
|
@@ -105,7 +105,7 @@ from collections import OrderedDict
|
|||||||
from wa.framework import signal
|
from wa.framework import signal
|
||||||
from wa.framework.plugin import Plugin
|
from wa.framework.plugin import Plugin
|
||||||
from wa.framework.exception import (WAError, TargetNotRespondingError, TimeoutError,
|
from wa.framework.exception import (WAError, TargetNotRespondingError, TimeoutError,
|
||||||
WorkloadError)
|
WorkloadError, TargetError)
|
||||||
from wa.utils.log import log_error
|
from wa.utils.log import log_error
|
||||||
from wa.utils.misc import isiterable
|
from wa.utils.misc import isiterable
|
||||||
from wa.utils.types import identifier, enum, level
|
from wa.utils.types import identifier, enum, level
|
||||||
@@ -263,6 +263,9 @@ class ManagedCallback(object):
|
|||||||
def __call__(self, context):
|
def __call__(self, context):
|
||||||
if self.instrument.is_enabled:
|
if self.instrument.is_enabled:
|
||||||
try:
|
try:
|
||||||
|
if not context.tm.is_responsive:
|
||||||
|
logger.debug("Target unreponsive; skipping callback {}".format(self.callback))
|
||||||
|
return
|
||||||
self.callback(context)
|
self.callback(context)
|
||||||
except (KeyboardInterrupt, TargetNotRespondingError, TimeoutError): # pylint: disable=W0703
|
except (KeyboardInterrupt, TargetNotRespondingError, TimeoutError): # pylint: disable=W0703
|
||||||
raise
|
raise
|
||||||
@@ -274,6 +277,8 @@ class ManagedCallback(object):
|
|||||||
context.add_event(e.message)
|
context.add_event(e.message)
|
||||||
if isinstance(e, WorkloadError):
|
if isinstance(e, WorkloadError):
|
||||||
context.set_status('FAILED')
|
context.set_status('FAILED')
|
||||||
|
elif isinstance(e, TargetError) or isinstance(e, TimeoutError):
|
||||||
|
context.tm.verify_target_responsive()
|
||||||
else:
|
else:
|
||||||
if context.current_job:
|
if context.current_job:
|
||||||
context.set_status('PARTIAL')
|
context.set_status('PARTIAL')
|
||||||
|
@@ -115,6 +115,9 @@ class Job(object):
|
|||||||
self.run_time = datetime.utcnow() - start_time
|
self.run_time = datetime.utcnow() - start_time
|
||||||
|
|
||||||
def process_output(self, context):
|
def process_output(self, context):
|
||||||
|
if not context.tm.is_responsive:
|
||||||
|
self.logger.info('Target unresponsive; not processing job output.')
|
||||||
|
return
|
||||||
self.logger.info('Processing output for job {} [{}]'.format(self.id, self.iteration))
|
self.logger.info('Processing output for job {} [{}]'.format(self.id, self.iteration))
|
||||||
if self.status != Status.FAILED:
|
if self.status != Status.FAILED:
|
||||||
with signal.wrap('WORKLOAD_RESULT_EXTRACTION', self, context):
|
with signal.wrap('WORKLOAD_RESULT_EXTRACTION', self, context):
|
||||||
@@ -124,11 +127,17 @@ class Job(object):
|
|||||||
self.workload.update_output(context)
|
self.workload.update_output(context)
|
||||||
|
|
||||||
def teardown(self, context):
|
def teardown(self, context):
|
||||||
|
if not context.tm.is_responsive:
|
||||||
|
self.logger.info('Target unresponsive; not tearing down.')
|
||||||
|
return
|
||||||
self.logger.info('Tearing down job {} [{}]'.format(self.id, self.iteration))
|
self.logger.info('Tearing down job {} [{}]'.format(self.id, self.iteration))
|
||||||
with signal.wrap('WORKLOAD_TEARDOWN', self, context):
|
with signal.wrap('WORKLOAD_TEARDOWN', self, context):
|
||||||
self.workload.teardown(context)
|
self.workload.teardown(context)
|
||||||
|
|
||||||
def finalize(self, context):
|
def finalize(self, context):
|
||||||
|
if not context.tm.is_responsive:
|
||||||
|
self.logger.info('Target unresponsive; not finalizing.')
|
||||||
|
return
|
||||||
self.logger.info('Finalizing job {} [{}]'.format(self.id, self.iteration))
|
self.logger.info('Finalizing job {} [{}]'.format(self.id, self.iteration))
|
||||||
with signal.wrap('WORKLOAD_FINALIZED', self, context):
|
with signal.wrap('WORKLOAD_FINALIZED', self, context):
|
||||||
self.workload.finalize(context)
|
self.workload.finalize(context)
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
from wa.framework import signal
|
from wa.framework import signal
|
||||||
from wa.framework.exception import ExecutionError, TargetError
|
from wa.framework.exception import ExecutionError, TargetError, TargetNotRespondingError
|
||||||
from wa.framework.plugin import Parameter
|
from wa.framework.plugin import Parameter
|
||||||
from wa.framework.target.descriptor import (get_target_description,
|
from wa.framework.target.descriptor import (get_target_description,
|
||||||
instantiate_target,
|
instantiate_target,
|
||||||
@@ -90,8 +90,9 @@ class TargetManager(object):
|
|||||||
self.logger.info('Target unresponsive; performing hard reset')
|
self.logger.info('Target unresponsive; performing hard reset')
|
||||||
self.target.reboot(hard=True)
|
self.target.reboot(hard=True)
|
||||||
self.is_responsive = True
|
self.is_responsive = True
|
||||||
|
raise ExecutionError('Target became unresponsive but was recovered.')
|
||||||
else:
|
else:
|
||||||
raise ExecutionError('Target unresponsive and hard reset not supported; bailing.')
|
raise TargetNotRespondingError('Target unresponsive and hard reset not supported; bailing.')
|
||||||
|
|
||||||
def _init_target(self):
|
def _init_target(self):
|
||||||
tdesc = get_target_description(self.target_name)
|
tdesc = get_target_description(self.target_name)
|
||||||
|
Reference in New Issue
Block a user