[autotest] Retry when staging control files fails in run_suite.

This CL also adds stats for reporting run_suite failures and
sneaks in some autoserv testing changes.

TEST=Ran run_suite against a bad devserver, raised the exception.
BUG=chromium:418928, chromium:359741, chromium:371644

Change-Id: If49a3b96c053432cb26a01a02e160176be37c037
Reviewed-on: https://chromium-review.googlesource.com/220973
Reviewed-by: Prashanth B <[email protected]>
Tested-by: Prashanth B <[email protected]>
Commit-Queue: Prashanth B <[email protected]>
diff --git a/client/common_lib/error.py b/client/common_lib/error.py
index 9270b5f..3428edd 100644
--- a/client/common_lib/error.py
+++ b/client/common_lib/error.py
@@ -540,6 +540,11 @@
     "Raised when a repo isn't working in some way"
 
 
+class StageControlFileFailure(Exception):
+    """Exceptions encountered staging control files."""
+    pass
+
+
 class CrosDynamicSuiteException(Exception):
     """
     Base class for exceptions coming from dynamic suite code in
diff --git a/frontend/afe/site_rpc_interface.py b/frontend/afe/site_rpc_interface.py
index 50a43c3..618841e 100644
--- a/frontend/afe/site_rpc_interface.py
+++ b/frontend/afe/site_rpc_interface.py
@@ -83,21 +83,22 @@
 
     @param build image we want to stage.
 
-    @raises StageBuildFailure: if the dev server throws 500 while staging
-        build.
+    @raises StageControlFileFailure: if the dev server throws 500 while staging
+        suite control files.
 
     @return: dev_server.ImageServer instance to use with this build.
     @return: timings dictionary containing staging start/end times.
     """
     timings = {}
-    # Set synchronous to False to allow other components to be downloaded in
-    # the background.
+    # Ensure components of |build| necessary for installing images are staged
+    # on the dev server. However set synchronous to False to allow other
+    # components to be downloaded in the background.
     ds = dev_server.ImageServer.resolve(build)
     timings[constants.DOWNLOAD_STARTED_TIME] = formatted_now()
     try:
         ds.stage_artifacts(build, ['test_suites'])
     except dev_server.DevServerException as e:
-        raise error.StageBuildFailure(
+        raise error.StageControlFileFailure(
                 "Failed to stage %s: %s" % (build, e))
     timings[constants.PAYLOAD_FINISHED_TIME] = formatted_now()
     return (ds, timings)
@@ -140,7 +141,8 @@
 
     @raises ControlFileNotFound: if a unique suite control file doesn't exist.
     @raises NoControlFileList: if we can't list the control files at all.
-    @raises StageBuildFailure: if the dev server throws 500 while staging build.
+    @raises StageControlFileFailure: If the dev server throws 500 while
+                                     staging test_suites.
     @raises ControlFileEmpty: if the control file exists on the server, but
                               can't be read.
 
@@ -152,7 +154,6 @@
     if num == 0:
         logging.warning("Can't run on 0 hosts; using default.")
         num = None
-
     (ds, timings) = _stage_build_artifacts(build)
 
     if not control_file:
diff --git a/frontend/afe/site_rpc_interface_unittest.py b/frontend/afe/site_rpc_interface_unittest.py
index 30821f4..164cad0 100644
--- a/frontend/afe/site_rpc_interface_unittest.py
+++ b/frontend/afe/site_rpc_interface_unittest.py
@@ -115,7 +115,7 @@
             self._BUILD, ['test_suites']).AndRaise(
                 dev_server.DevServerException())
         self.mox.ReplayAll()
-        self.assertRaises(error.StageBuildFailure,
+        self.assertRaises(error.StageControlFileFailure,
                           site_rpc_interface.create_suite_job,
                           self._NAME,
                           self._BOARD,
diff --git a/server/autoserv b/server/autoserv
index b627c44..c0fb4d6 100755
--- a/server/autoserv
+++ b/server/autoserv
@@ -17,7 +17,7 @@
 
 
 # Number of seconds to wait before returning if testing mode is enabled
-TESTING_MODE_SLEEP_SECS = 10
+TESTING_MODE_SLEEP_SECS = 1
 
 
 try:
@@ -314,14 +314,26 @@
     except control_data.ControlVariableException as e:
         logging.error(str(e))
     exit_code = 0
+    # TODO(beeps): Extend this to cover different failure modes.
+    # Testing exceptions are matched against labels sent to autoserv. E.g.,
+    # to allow only the hostless job to run, specify
+    # testing_exceptions: test_suite in the shadow_config. To allow both
+    # the hostless job and dummy_Pass to run, specify
+    # testing_exceptions: test_suite,dummy_Pass. You can figure out
+    # what label autoserv is invoked with by looking through the logs of a test
+    # for the autoserv command's -l option.
+    testing_exceptions = global_config.global_config.get_config_value(
+            'AUTOSERV', 'testing_exceptions', type=list, default=[])
+    test_mode = global_config.global_config.get_config_value(
+            'AUTOSERV', 'testing_mode', type=bool, default=False)
+    test_mode = test_mode and not any([ex in parser.options.label
+                                       for ex in testing_exceptions])
     try:
         try:
-            if not global_config.global_config.get_config_value(
-                        'AUTOSERV', 'testing_mode', type=bool, default=False):
-                run_autoserv(pid_file_manager, results, parser)
-            else:
-                # TODO(beeps): Extend this to cover different failure modes.
+            if test_mode:
                 time.sleep(TESTING_MODE_SLEEP_SECS)
+            else:
+                run_autoserv(pid_file_manager, results, parser)
         except SystemExit as e:
             exit_code = e.code
             if exit_code:
diff --git a/site_utils/run_suite.py b/site_utils/run_suite.py
index 2435cae..e0e75e2 100755
--- a/site_utils/run_suite.py
+++ b/site_utils/run_suite.py
@@ -50,6 +50,7 @@
 from autotest_lib.client.common_lib import priorities
 from autotest_lib.client.common_lib import time_utils
 from autotest_lib.client.common_lib.cros.graphite import stats
+from autotest_lib.client.common_lib.cros import retry
 from autotest_lib.frontend.afe.json_rpc import proxy
 from autotest_lib.server import utils
 from autotest_lib.server.cros.dynamic_suite import constants
@@ -1194,6 +1195,40 @@
         self._compute_return_code()
 
 
[email protected](error.StageControlFileFailure, timeout_min=10)
+def create_suite(afe, options):
+    """Create a suite with retries.
+
+    @param afe: The afe object to insert the new suite job into.
+    @param options: The options to use in creating the suite.
+
+    @return: The afe_job_id of the new suite job.
+    """
+    wait = options.no_wait == 'False'
+    file_bugs = options.file_bugs == 'True'
+    retry = options.retry == 'True'
+    try:
+        priority = int(options.priority)
+    except ValueError:
+        try:
+            priority = priorities.Priority.get_value(options.priority)
+        except AttributeError:
+            print 'Unknown priority level %s.  Try one of %s.' % (
+                  options.priority, ', '.join(priorities.Priority.names))
+            raise
+    logging.info('%s Submitted create_suite_job rpc',
+                 diagnosis_utils.JobTimer.format_time(datetime.now()))
+    return afe.run('create_suite_job', name=options.name,
+                   board=options.board, build=options.build,
+                   check_hosts=wait, pool=options.pool,
+                   num=options.num,
+                   file_bugs=file_bugs, priority=priority,
+                   suite_args=options.suite_args,
+                   wait_for_results=wait,
+                   timeout_mins=options.timeout_mins,
+                   job_retry=retry)
+
+
 def main_without_exception_handling():
     """
     Entry point for run_suite script without exception handling.
@@ -1212,15 +1247,6 @@
             log_name = os.path.join(log_dir, log_name)
 
     setup_logging(logfile=log_name)
-    try:
-        priority = int(options.priority)
-    except ValueError:
-        try:
-            priority = priorities.Priority.get_value(options.priority)
-        except AttributeError:
-            print 'Unknown priority level %s.  Try one of %s.' % (
-                  options.priority, ', '.join(priorities.Priority.names))
-            return RETURN_CODES.INVALID_OPTIONS
 
     if not options.bypass_labstatus:
         utils.check_lab_status(options.build)
@@ -1234,29 +1260,17 @@
     rpc_helper = diagnosis_utils.RPCHelper(afe)
     rpc_helper.check_dut_availability(options.board, options.pool,
                                       options.minimum_duts)
-
-    wait = options.no_wait == 'False'
-    file_bugs = options.file_bugs == 'True'
-    retry = options.retry == 'True'
-    logging.info('%s Submitted create_suite_job rpc',
-                 diagnosis_utils.JobTimer.format_time(datetime.now()))
     if options.mock_job_id:
         job_id = int(options.mock_job_id)
     else:
         try:
-            job_id = afe.run('create_suite_job', name=options.name,
-                             board=options.board, build=options.build,
-                             check_hosts=wait, pool=options.pool,
-                             num=options.num,
-                             file_bugs=file_bugs, priority=priority,
-                             suite_args=options.suite_args,
-                             wait_for_results=wait,
-                             timeout_mins=options.timeout_mins,
-                             job_retry=retry)
+            job_id = create_suite(afe, options)
         except (error.CrosDynamicSuiteException,
                 error.RPCException, proxy.JSONRPCException) as e:
             logging.warning('Error Message: %s', e)
             return RETURN_CODES.INFRA_FAILURE
+        except AttributeError:
+            return RETURN_CODES.INVALID_OPTIONS
 
     job_timer = diagnosis_utils.JobTimer(
             time.time(), float(options.timeout_mins))
@@ -1270,7 +1284,7 @@
                                         timeout_min=options.afe_timeout_mins,
                                         delay_sec=options.delay_sec)
     code = RETURN_CODES.OK
-
+    wait = options.no_wait == 'False'
     if wait:
         while not afe.get_jobs(id=job_id, finished=True):
             # Note that this call logs output, preventing buildbot's
@@ -1334,7 +1348,6 @@
                                  RETURN_CODES.get_string(code))
             if is_suite_timeout:
                 logging.info('\nAttempting to diagnose pool: %s', options.pool)
-                stats.Counter('run_suite_timeouts').increment()
                 try:
                     # Add some jitter to make up for any latency in
                     # aborting the suite or checking for results.
@@ -1346,8 +1359,6 @@
                     logging.warning('Unable to diagnose suite abort.')
 
         # And output return message.
-        code_str = RETURN_CODES.get_string(code)
-        logging.info('Will return from run_suite with status: %s', code_str)
         if return_message:
             logging.info('Reason: %s', return_message)
 
@@ -1366,7 +1377,7 @@
     """Entry point."""
     code = RETURN_CODES.OK
     try:
-        return main_without_exception_handling()
+        code = main_without_exception_handling()
     except diagnosis_utils.BoardNotAvailableError as e:
         logging.warning('Can not run suite: %s', e)
         code = RETURN_CODES.BOARD_NOT_AVAILABLE
@@ -1379,6 +1390,7 @@
 
     logging.info('Will return from run_suite with status: %s',
                   RETURN_CODES.get_string(code))
+    stats.Counter('run_suite.%s' % RETURN_CODES.get_string(code)).increment()
     return code