[autotest] Retry when staging control files fails in run_suite.
This CL also adds some stats for reporting run_suite failures and
sneaks in some autoserv testing changes.
TEST=Ran run_suite against a bad devserver, raised the exception.
BUG=chromium:418928, chromium:359741, chromium:371644
Change-Id: If49a3b96c053432cb26a01a02e160176be37c037
Reviewed-on: https://chromium-review.googlesource.com/220973
Reviewed-by: Prashanth B <[email protected]>
Tested-by: Prashanth B <[email protected]>
Commit-Queue: Prashanth B <[email protected]>
diff --git a/client/common_lib/error.py b/client/common_lib/error.py
index 9270b5f..3428edd 100644
--- a/client/common_lib/error.py
+++ b/client/common_lib/error.py
@@ -540,6 +540,11 @@
"Raised when a repo isn't working in some way"
+class StageControlFileFailure(Exception):
+ """Exceptions encountered staging control files."""
+ pass
+
+
class CrosDynamicSuiteException(Exception):
"""
Base class for exceptions coming from dynamic suite code in
diff --git a/frontend/afe/site_rpc_interface.py b/frontend/afe/site_rpc_interface.py
index 50a43c3..618841e 100644
--- a/frontend/afe/site_rpc_interface.py
+++ b/frontend/afe/site_rpc_interface.py
@@ -83,21 +83,22 @@
@param build image we want to stage.
- @raises StageBuildFailure: if the dev server throws 500 while staging
- build.
+ @raises StageControlFileFailure: if the dev server throws 500 while staging
+ suite control files.
@return: dev_server.ImageServer instance to use with this build.
@return: timings dictionary containing staging start/end times.
"""
timings = {}
- # Set synchronous to False to allow other components to be downloaded in
- # the background.
+ # Ensure components of |build| necessary for installing images are staged
+ # on the dev server. However set synchronous to False to allow other
+ # components to be downloaded in the background.
ds = dev_server.ImageServer.resolve(build)
timings[constants.DOWNLOAD_STARTED_TIME] = formatted_now()
try:
ds.stage_artifacts(build, ['test_suites'])
except dev_server.DevServerException as e:
- raise error.StageBuildFailure(
+ raise error.StageControlFileFailure(
"Failed to stage %s: %s" % (build, e))
timings[constants.PAYLOAD_FINISHED_TIME] = formatted_now()
return (ds, timings)
@@ -140,7 +141,8 @@
@raises ControlFileNotFound: if a unique suite control file doesn't exist.
@raises NoControlFileList: if we can't list the control files at all.
- @raises StageBuildFailure: if the dev server throws 500 while staging build.
+ @raises StageControlFileFailure: If the dev server throws 500 while
+ staging test_suites.
@raises ControlFileEmpty: if the control file exists on the server, but
can't be read.
@@ -152,7 +154,6 @@
if num == 0:
logging.warning("Can't run on 0 hosts; using default.")
num = None
-
(ds, timings) = _stage_build_artifacts(build)
if not control_file:
diff --git a/frontend/afe/site_rpc_interface_unittest.py b/frontend/afe/site_rpc_interface_unittest.py
index 30821f4..164cad0 100644
--- a/frontend/afe/site_rpc_interface_unittest.py
+++ b/frontend/afe/site_rpc_interface_unittest.py
@@ -115,7 +115,7 @@
self._BUILD, ['test_suites']).AndRaise(
dev_server.DevServerException())
self.mox.ReplayAll()
- self.assertRaises(error.StageBuildFailure,
+ self.assertRaises(error.StageControlFileFailure,
site_rpc_interface.create_suite_job,
self._NAME,
self._BOARD,
diff --git a/server/autoserv b/server/autoserv
index b627c44..c0fb4d6 100755
--- a/server/autoserv
+++ b/server/autoserv
@@ -17,7 +17,7 @@
# Number of seconds to wait before returning if testing mode is enabled
-TESTING_MODE_SLEEP_SECS = 10
+TESTING_MODE_SLEEP_SECS = 1
try:
@@ -314,14 +314,26 @@
except control_data.ControlVariableException as e:
logging.error(str(e))
exit_code = 0
+ # TODO(beeps): Extend this to cover different failure modes.
+ # Testing exceptions are matched against labels sent to autoserv. Eg,
+ # to allow only the hostless job to run, specify
+ # testing_exceptions: test_suite in the shadow_config. To allow both
+ # the hostless job and dummy_Pass to run, specify
+ # testing_exceptions: test_suite,dummy_Pass. You can figure out
+ # what label autoserv is invoked with by looking through the logs of a test
+ # for the autoserv command's -l option.
+ testing_exceptions = global_config.global_config.get_config_value(
+ 'AUTOSERV', 'testing_exceptions', type=list, default=[])
+ test_mode = global_config.global_config.get_config_value(
+ 'AUTOSERV', 'testing_mode', type=bool, default=False)
+ test_mode = test_mode and not any([ex in parser.options.label
+ for ex in testing_exceptions])
try:
try:
- if not global_config.global_config.get_config_value(
- 'AUTOSERV', 'testing_mode', type=bool, default=False):
- run_autoserv(pid_file_manager, results, parser)
- else:
- # TODO(beeps): Extend this to cover different failure modes.
+ if test_mode:
time.sleep(TESTING_MODE_SLEEP_SECS)
+ else:
+ run_autoserv(pid_file_manager, results, parser)
except SystemExit as e:
exit_code = e.code
if exit_code:
diff --git a/site_utils/run_suite.py b/site_utils/run_suite.py
index 2435cae..e0e75e2 100755
--- a/site_utils/run_suite.py
+++ b/site_utils/run_suite.py
@@ -50,6 +50,7 @@
from autotest_lib.client.common_lib import priorities
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib.cros.graphite import stats
+from autotest_lib.client.common_lib.cros import retry
from autotest_lib.frontend.afe.json_rpc import proxy
from autotest_lib.server import utils
from autotest_lib.server.cros.dynamic_suite import constants
@@ -1194,6 +1195,40 @@
self._compute_return_code()
[email protected](error.StageControlFileFailure, timeout_min=10)
+def create_suite(afe, options):
+ """Create a suite with retries.
+
+ @param afe: The afe object to insert the new suite job into.
+ @param options: The options to use in creating the suite.
+
+ @return: The afe_job_id of the new suite job.
+ """
+ wait = options.no_wait == 'False'
+ file_bugs = options.file_bugs == 'True'
+ retry = options.retry == 'True'
+ try:
+ priority = int(options.priority)
+ except ValueError:
+ try:
+ priority = priorities.Priority.get_value(options.priority)
+ except AttributeError:
+ print 'Unknown priority level %s. Try one of %s.' % (
+ options.priority, ', '.join(priorities.Priority.names))
+ raise
+ logging.info('%s Submitted create_suite_job rpc',
+ diagnosis_utils.JobTimer.format_time(datetime.now()))
+ return afe.run('create_suite_job', name=options.name,
+ board=options.board, build=options.build,
+ check_hosts=wait, pool=options.pool,
+ num=options.num,
+ file_bugs=file_bugs, priority=priority,
+ suite_args=options.suite_args,
+ wait_for_results=wait,
+ timeout_mins=options.timeout_mins,
+ job_retry=retry)
+
+
def main_without_exception_handling():
"""
Entry point for run_suite script without exception handling.
@@ -1212,15 +1247,6 @@
log_name = os.path.join(log_dir, log_name)
setup_logging(logfile=log_name)
- try:
- priority = int(options.priority)
- except ValueError:
- try:
- priority = priorities.Priority.get_value(options.priority)
- except AttributeError:
- print 'Unknown priority level %s. Try one of %s.' % (
- options.priority, ', '.join(priorities.Priority.names))
- return RETURN_CODES.INVALID_OPTIONS
if not options.bypass_labstatus:
utils.check_lab_status(options.build)
@@ -1234,29 +1260,17 @@
rpc_helper = diagnosis_utils.RPCHelper(afe)
rpc_helper.check_dut_availability(options.board, options.pool,
options.minimum_duts)
-
- wait = options.no_wait == 'False'
- file_bugs = options.file_bugs == 'True'
- retry = options.retry == 'True'
- logging.info('%s Submitted create_suite_job rpc',
- diagnosis_utils.JobTimer.format_time(datetime.now()))
if options.mock_job_id:
job_id = int(options.mock_job_id)
else:
try:
- job_id = afe.run('create_suite_job', name=options.name,
- board=options.board, build=options.build,
- check_hosts=wait, pool=options.pool,
- num=options.num,
- file_bugs=file_bugs, priority=priority,
- suite_args=options.suite_args,
- wait_for_results=wait,
- timeout_mins=options.timeout_mins,
- job_retry=retry)
+ job_id = create_suite(afe, options)
except (error.CrosDynamicSuiteException,
error.RPCException, proxy.JSONRPCException) as e:
logging.warning('Error Message: %s', e)
return RETURN_CODES.INFRA_FAILURE
+ except AttributeError:
+ return RETURN_CODES.INVALID_OPTIONS
job_timer = diagnosis_utils.JobTimer(
time.time(), float(options.timeout_mins))
@@ -1270,7 +1284,7 @@
timeout_min=options.afe_timeout_mins,
delay_sec=options.delay_sec)
code = RETURN_CODES.OK
-
+ wait = options.no_wait == 'False'
if wait:
while not afe.get_jobs(id=job_id, finished=True):
# Note that this call logs output, preventing buildbot's
@@ -1334,7 +1348,6 @@
RETURN_CODES.get_string(code))
if is_suite_timeout:
logging.info('\nAttempting to diagnose pool: %s', options.pool)
- stats.Counter('run_suite_timeouts').increment()
try:
# Add some jitter to make up for any latency in
# aborting the suite or checking for results.
@@ -1346,8 +1359,6 @@
logging.warning('Unable to diagnose suite abort.')
# And output return message.
- code_str = RETURN_CODES.get_string(code)
- logging.info('Will return from run_suite with status: %s', code_str)
if return_message:
logging.info('Reason: %s', return_message)
@@ -1366,7 +1377,7 @@
"""Entry point."""
code = RETURN_CODES.OK
try:
- return main_without_exception_handling()
+ code = main_without_exception_handling()
except diagnosis_utils.BoardNotAvailableError as e:
logging.warning('Can not run suite: %s', e)
code = RETURN_CODES.BOARD_NOT_AVAILABLE
@@ -1379,6 +1390,7 @@
logging.info('Will return from run_suite with status: %s',
RETURN_CODES.get_string(code))
+ stats.Counter('run_suite.%s' % RETURN_CODES.get_string(code)).increment()
return code