Catch all errors when provisioning & make start_master_ssh multiprocess safe

Currently autoupdating.py only catches AutoservRunErrors when
updating the rootfs and stateful, so other autotest infrastructure
errors are not caught properly.

Also updates start_master_ssh to take a lock, so that multiple
processes cannot clean up the old connection and create a new one
in parallel.

BUG=chromium:468062
TEST=provision still works on moblab.

Change-Id: I5a2c31ead5b15ca66c1f8035df27e9c3420b41a8
Reviewed-on: https://chromium-review.googlesource.com/260664
Trybot-Ready: Simran Basi <[email protected]>
Tested-by: Simran Basi <[email protected]>
Reviewed-by: Don Garrett <[email protected]>
Commit-Queue: Simran Basi <[email protected]>
diff --git a/server/hosts/abstract_ssh.py b/server/hosts/abstract_ssh.py
index ddd0d0a..af38b25 100644
--- a/server/hosts/abstract_ssh.py
+++ b/server/hosts/abstract_ssh.py
@@ -1,4 +1,5 @@
 import os, time, socket, shutil, glob, logging, traceback, tempfile
+from multiprocessing import Lock
 from autotest_lib.client.common_lib import autotemp, error
 from autotest_lib.server import utils, autotest
 from autotest_lib.server.hosts import remote
@@ -41,6 +42,9 @@
         self.master_ssh_tempdir = None
         self.master_ssh_option = ''
 
+        # Create a Lock to protect against race conditions.
+        self._lock = Lock()
+
 
     @property
     def ip(self):
@@ -640,42 +644,49 @@
         if not enable_master_ssh:
             return
 
-        # If a previously started master SSH connection is not running
-        # anymore, it needs to be cleaned up and then restarted.
-        if self.master_ssh_job is not None:
-            socket_path = os.path.join(self.master_ssh_tempdir.name, 'socket')
-            if (not os.path.exists(socket_path) or
-                    self.master_ssh_job.sp.poll() is not None):
-                logging.info("Master ssh connection to %s is down.",
-                             self.hostname)
-                self._cleanup_master_ssh()
+        # Multiple processes might try in parallel to clean up the old master
+        # ssh connection and create a new one, therefore use a lock to protect
+        # against race conditions.
+        with self._lock:
+            # If a previously started master SSH connection is not running
+            # anymore, it needs to be cleaned up and then restarted.
+            if self.master_ssh_job is not None:
+                socket_path = os.path.join(self.master_ssh_tempdir.name,
+                                           'socket')
+                if (not os.path.exists(socket_path) or
+                        self.master_ssh_job.sp.poll() is not None):
+                    logging.info("Master ssh connection to %s is down.",
+                                 self.hostname)
+                    self._cleanup_master_ssh()
 
-        # Start a new master SSH connection.
-        if self.master_ssh_job is None:
-            # Create a shared socket in a temp location.
-            self.master_ssh_tempdir = autotemp.tempdir(unique_id='ssh-master')
-            self.master_ssh_option = ("-o ControlPath=%s/socket" %
-                                      self.master_ssh_tempdir.name)
+            # Start a new master SSH connection.
+            if self.master_ssh_job is None:
+                # Create a shared socket in a temp location.
+                self.master_ssh_tempdir = autotemp.tempdir(
+                        unique_id='ssh-master')
+                self.master_ssh_option = ("-o ControlPath=%s/socket" %
+                                          self.master_ssh_tempdir.name)
 
-            # Start the master SSH connection in the background.
-            master_cmd = self.ssh_command(options="-N -o ControlMaster=yes")
-            logging.info("Starting master ssh connection '%s'", master_cmd)
-            self.master_ssh_job = utils.BgJob(master_cmd,
-                                              nickname='master-ssh',
-                                              no_pipes=True)
-            # To prevent a race between the the master ssh connection startup
-            # and its first attempted use, wait for socket file to exist before
-            # returning.
-            end_time = time.time() + timeout
-            socket_file_path = os.path.join(self.master_ssh_tempdir.name,
-                                            'socket')
-            while time.time() < end_time:
-                if os.path.exists(socket_file_path):
-                    break
-                time.sleep(.2)
-            else:
-                logging.info('Timed out waiting for master-ssh connection '
-                             'to be established.')
+                # Start the master SSH connection in the background.
+                master_cmd = self.ssh_command(
+                        options="-N -o ControlMaster=yes")
+                logging.info("Starting master ssh connection '%s'", master_cmd)
+                self.master_ssh_job = utils.BgJob(master_cmd,
+                                                  nickname='master-ssh',
+                                                  no_pipes=True)
+                # To prevent a race between the master ssh connection
+                # startup and its first attempted use, wait for socket file to
+                # exist before returning.
+                end_time = time.time() + timeout
+                socket_file_path = os.path.join(self.master_ssh_tempdir.name,
+                                                'socket')
+                while time.time() < end_time:
+                    if os.path.exists(socket_file_path):
+                        break
+                    time.sleep(.2)
+                else:
+                    logging.info('Timed out waiting for master-ssh connection '
+                                 'to be established.')
 
 
     def clear_known_hosts(self):