Catch all errors when provisioning & make start_master_ssh multiprocess safe
Currently autoupdating.py only catches AutoservRunErrors when
updating the rootfs and stateful, so other autotest
infrastructure errors are not caught properly.
This change also updates start_master_ssh to take a lock, so that
multiple processes cannot clean up the old connection and create
a new one in parallel.
BUG=chromium:468062
TEST=provision still works on moblab.
Change-Id: I5a2c31ead5b15ca66c1f8035df27e9c3420b41a8
Reviewed-on: https://chromium-review.googlesource.com/260664
Trybot-Ready: Simran Basi <[email protected]>
Tested-by: Simran Basi <[email protected]>
Reviewed-by: Don Garrett <[email protected]>
Commit-Queue: Simran Basi <[email protected]>
diff --git a/server/hosts/abstract_ssh.py b/server/hosts/abstract_ssh.py
index ddd0d0a..af38b25 100644
--- a/server/hosts/abstract_ssh.py
+++ b/server/hosts/abstract_ssh.py
@@ -1,4 +1,5 @@
import os, time, socket, shutil, glob, logging, traceback, tempfile
+from multiprocessing import Lock
from autotest_lib.client.common_lib import autotemp, error
from autotest_lib.server import utils, autotest
from autotest_lib.server.hosts import remote
@@ -41,6 +42,9 @@
self.master_ssh_tempdir = None
self.master_ssh_option = ''
+ # Create a Lock to protect against race conditions.
+ self._lock = Lock()
+
@property
def ip(self):
@@ -640,42 +644,49 @@
if not enable_master_ssh:
return
- # If a previously started master SSH connection is not running
- # anymore, it needs to be cleaned up and then restarted.
- if self.master_ssh_job is not None:
- socket_path = os.path.join(self.master_ssh_tempdir.name, 'socket')
- if (not os.path.exists(socket_path) or
- self.master_ssh_job.sp.poll() is not None):
- logging.info("Master ssh connection to %s is down.",
- self.hostname)
- self._cleanup_master_ssh()
+ # Multiple processes might try in parallel to clean up the old master
+ # ssh connection and create a new one, therefore use a lock to protect
+ # against race conditions.
+ with self._lock:
+ # If a previously started master SSH connection is not running
+ # anymore, it needs to be cleaned up and then restarted.
+ if self.master_ssh_job is not None:
+ socket_path = os.path.join(self.master_ssh_tempdir.name,
+ 'socket')
+ if (not os.path.exists(socket_path) or
+ self.master_ssh_job.sp.poll() is not None):
+ logging.info("Master ssh connection to %s is down.",
+ self.hostname)
+ self._cleanup_master_ssh()
- # Start a new master SSH connection.
- if self.master_ssh_job is None:
- # Create a shared socket in a temp location.
- self.master_ssh_tempdir = autotemp.tempdir(unique_id='ssh-master')
- self.master_ssh_option = ("-o ControlPath=%s/socket" %
- self.master_ssh_tempdir.name)
+ # Start a new master SSH connection.
+ if self.master_ssh_job is None:
+ # Create a shared socket in a temp location.
+ self.master_ssh_tempdir = autotemp.tempdir(
+ unique_id='ssh-master')
+ self.master_ssh_option = ("-o ControlPath=%s/socket" %
+ self.master_ssh_tempdir.name)
- # Start the master SSH connection in the background.
- master_cmd = self.ssh_command(options="-N -o ControlMaster=yes")
- logging.info("Starting master ssh connection '%s'", master_cmd)
- self.master_ssh_job = utils.BgJob(master_cmd,
- nickname='master-ssh',
- no_pipes=True)
- # To prevent a race between the the master ssh connection startup
- # and its first attempted use, wait for socket file to exist before
- # returning.
- end_time = time.time() + timeout
- socket_file_path = os.path.join(self.master_ssh_tempdir.name,
- 'socket')
- while time.time() < end_time:
- if os.path.exists(socket_file_path):
- break
- time.sleep(.2)
- else:
- logging.info('Timed out waiting for master-ssh connection '
- 'to be established.')
+ # Start the master SSH connection in the background.
+ master_cmd = self.ssh_command(
+ options="-N -o ControlMaster=yes")
+ logging.info("Starting master ssh connection '%s'", master_cmd)
+ self.master_ssh_job = utils.BgJob(master_cmd,
+ nickname='master-ssh',
+ no_pipes=True)
+ # To prevent a race between the master ssh connection
+ # startup and its first attempted use, wait for socket file to
+ # exist before returning.
+ end_time = time.time() + timeout
+ socket_file_path = os.path.join(self.master_ssh_tempdir.name,
+ 'socket')
+ while time.time() < end_time:
+ if os.path.exists(socket_file_path):
+ break
+ time.sleep(.2)
+ else:
+ logging.info('Timed out waiting for master-ssh connection '
+ 'to be established.')
def clear_known_hosts(self):