blob: 896c03546c29e665a4b80543963fb60b7e68ab1d [file] [log] [blame]
#
# Copyright 2007 Google Inc. Released under the GPL v2
"""
This module defines the base classes for the Host hierarchy.
Implementation details:
You should import the "hosts" package instead of importing each type of host.
Host: a machine on which you can run programs
RemoteHost: a remote machine on which you can run programs
"""
__author__ = """
[email protected] (Martin J. Bligh),
[email protected] (Benjamin Poirier),
[email protected] (Ryan Stutsman)
"""
import os, re, time, cStringIO, sys, logging
from autotest_lib.client.common_lib import global_config, error
from autotest_lib.client.common_lib import host_protections
from autotest_lib.client.bin import partition
from autotest_lib.server import utils
from autotest_lib.server.hosts import bootloader
class Host(object):
"""
This class represents a machine on which you can run programs.
It may be a local machine, the one autoserv is running on, a remote
machine or a virtual machine.
Implementation details:
This is an abstract class, leaf subclasses must implement the methods
listed here. You must not instantiate this class but should
instantiate one of those leaf subclasses.
When overriding methods that raise NotImplementedError, the leaf class
is fully responsible for the implementation and should not chain calls
to super. When overriding methods that are a NOP in Host, the subclass
should chain calls to super(). The criteria for fitting a new method into
one category or the other should be:
1. If two separate generic implementations could reasonably be
concatenated, then the abstract implementation should pass and
subclasses should chain calls to super.
2. If only one class could reasonably perform the stated function
(e.g. two separate run() implementations cannot both be executed)
then the method should raise NotImplementedError in Host, and
the implementor should NOT chain calls to super, to ensure that
only one implementation ever gets executed.
"""
bootloader = None
job = None
DEFAULT_REBOOT_TIMEOUT = 1800
WAIT_DOWN_REBOOT_TIMEOUT = 840
WAIT_DOWN_REBOOT_WARNING = 540
HOURS_TO_WAIT_FOR_RECOVERY = 2.5
def __init__(self, *args, **dargs):
self._initialize(*args, **dargs)
self.start_loggers()
if self.job:
self.job.hosts.add(self)
def _initialize(self, target_file_owner=None,
*args, **dargs):
self.serverdir = utils.get_server_dir()
self.monitordir = os.path.join(os.path.dirname(__file__), "monitors")
self.bootloader = bootloader.Bootloader(self)
self.env = {}
self.target_file_owner = target_file_owner
self._already_repaired = []
self._removed_files = False
def close(self):
if self.job:
self.job.hosts.discard(self)
def setup(self):
pass
def run(self, command):
raise NotImplementedError('Run not implemented!')
def run_output(self, command, *args, **dargs):
return self.run(command, *args, **dargs).stdout.rstrip()
def reboot(self):
raise NotImplementedError('Reboot not implemented!')
def sysrq_reboot(self):
raise NotImplementedError('Sysrq reboot not implemented!')
def reboot_setup(self, *args, **dargs):
pass
def reboot_followup(self, *args, **dargs):
pass
def get_file(self, source, dest, delete_dest=False):
raise NotImplementedError('Get file not implemented!')
def send_file(self, source, dest, delete_dest=False):
raise NotImplementedError('Send file not implemented!')
def get_tmp_dir(self):
raise NotImplementedError('Get temp dir not implemented!')
def is_up(self):
raise NotImplementedError('Is up not implemented!')
def is_shutting_down(self):
""" Indicates is a machine is currently shutting down. """
runlevel = int(self.run("runlevel").stdout.strip().split()[1])
return runlevel in (0, 6)
def get_wait_up_processes(self):
""" Gets the list of local processes to wait for in wait_up. """
get_config = global_config.global_config.get_config_value
proc_list = get_config("HOSTS", "wait_up_processes",
default="").strip()
processes = set(p.strip() for p in proc_list.split(","))
processes.discard("")
return processes
def wait_up(self, timeout=None):
raise NotImplementedError('Wait up not implemented!')
def wait_down(self, timeout=None, warning_timer=None):
raise NotImplementedError('Wait down not implemented!')
def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT, **dargs):
""" Wait for the host to come back from a reboot. This is a generic
implementation based entirely on wait_up and wait_down. """
if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT,
warning_timer=self.WAIT_DOWN_REBOOT_WARNING):
self.record("ABORT", None, "reboot.verify", "shut down failed")
raise error.AutoservShutdownError("Host did not shut down")
self.wait_up(timeout)
time.sleep(2) # this is needed for complete reliability
if self.wait_up(timeout):
self.record("GOOD", None, "reboot.verify")
self.reboot_followup(**dargs)
else:
self.record("ABORT", None, "reboot.verify",
"Host did not return from reboot")
raise error.AutoservRebootError("Host did not return from reboot")
def verify(self):
pass
def verify_hardware(self):
pass
def verify_software(self):
pass
def check_diskspace(self, path, gb):
logging.info('Checking for >= %s GB of space under %s on machine %s',
gb, path, self.hostname)
df = self.run('df -mP %s | tail -1' % path).stdout.split()
free_space_gb = int(df[3])/1000.0
if free_space_gb < gb:
raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
else:
logging.info('Found %s GB >= %s GB of space under %s on machine %s',
free_space_gb, gb, path, self.hostname)
def get_open_func(self, use_cache=True):
"""
Defines and returns a function that may be used instead of built-in
open() to open and read files. The returned function is implemented
by using self.run('cat <file>') and may cache the results for the same
filename.
@param use_cache Cache results of self.run('cat <filename>') for the
same filename
@return a function that can be used instead of built-in open()
"""
cached_files = {}
def open_func(filename):
if not use_cache or filename not in cached_files:
output = self.run('cat \'%s\'' % filename,
stdout_tee=open('/dev/null', 'w')).stdout
fd = cStringIO.StringIO(output)
if not use_cache:
return fd
cached_files[filename] = fd
else:
cached_files[filename].seek(0)
return cached_files[filename]
return open_func
def check_partitions(self, root_part, filter_func=None):
""" Compare the contents of /proc/partitions with those of
/proc/mounts and raise exception in case unmounted partitions are found
root_part: in Linux /proc/mounts will never directly mention the root
partition as being mounted on / instead it will say that /dev/root is
mounted on /. Thus require this argument to filter out the root_part
from the ones checked to be mounted
filter_func: unnary predicate for additional filtering out of
partitions required to be mounted
Raise: error.AutoservHostError if unfiltered unmounted partition found
"""
print 'Checking if non-swap partitions are mounted...'
unmounted = partition.get_unmounted_partition_list(root_part,
filter_func=filter_func, open_func=self.get_open_func())
if unmounted:
raise error.AutoservNotMountedHostError(
'Found unmounted partitions: %s' %
[part.device for part in unmounted])
def _repair_wait_for_reboot(self):
TIMEOUT = int(self.HOURS_TO_WAIT_FOR_RECOVERY * 3600)
if self.is_shutting_down():
logging.info('Host is shutting down, waiting for a restart')
self.wait_for_restart(TIMEOUT)
else:
self.wait_up(TIMEOUT)
def _get_mountpoint(self, path):
"""Given a "path" get the mount point of the filesystem containing
that path."""
code = ('import os\n'
# sanitize the path and resolve symlinks
'path = os.path.realpath(%r)\n'
"while path != '/' and not os.path.ismount(path):\n"
' path, _ = os.path.split(path)\n'
'print path\n') % path
return self.run('python2.4 -c "%s"' % code,
stdout_tee=open(os.devnull, 'w')).stdout.rstrip()
def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
"""Empty a given directory path contents."""
rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout)
self._removed_files = True
def repair_full_disk(self, mountpoint):
# it's safe to remove /tmp and /var/tmp, site specific overrides may
# want to remove some other places too
if mountpoint == self._get_mountpoint('/tmp'):
self.erase_dir_contents('/tmp')
if mountpoint == self._get_mountpoint('/var/tmp'):
self.erase_dir_contents('/var/tmp')
def _call_repair_func(self, err, func, *args, **dargs):
for old_call in self._already_repaired:
if old_call == (func, args, dargs):
# re-raising the original exception because surrounding
# error handling may want to try other ways to fix it
logging.warn('Already done this (%s) repair procedure, '
're-raising the original exception.', func)
raise err
try:
func(*args, **dargs)
except error.AutoservHardwareRepairRequestedError:
# let this special exception propagate
raise
except error.AutoservError:
logging.exception('Repair failed but continuing in case it managed'
' to repair enough')
self._already_repaired.append((func, args, dargs))
def repair_filesystem_only(self):
"""perform file system repairs only"""
while True:
# try to repair specific problems
try:
logging.info('Running verify to find failures to repair...')
self.verify()
if self._removed_files:
logging.info('Removed files, rebooting to release the'
' inodes')
self.reboot()
return # verify succeeded, then repair succeeded
except error.AutoservHostIsShuttingDownError, err:
logging.exception('verify failed')
self._call_repair_func(err, self._repair_wait_for_reboot)
except error.AutoservDiskFullHostError, err:
logging.exception('verify failed')
self._call_repair_func(err, self.repair_full_disk,
self._get_mountpoint(err.path))
def repair_software_only(self):
"""perform software repairs only"""
while True:
try:
self.repair_filesystem_only()
break
except (error.AutoservSshPingHostError, error.AutoservSSHTimeout,
error.AutoservSshPermissionDeniedError,
error.AutoservDiskFullHostError), err:
logging.exception('verify failed')
logging.info('Trying to reinstall the machine')
self._call_repair_func(err, self.machine_install)
def repair_full(self):
while True:
try:
self.repair_software_only()
break
except error.AutoservHardwareHostError, err:
logging.exception('verify failed')
# software repair failed, try hardware repair
logging.info('Hardware problem found, '
'requesting hardware repairs')
self._call_repair_func(err, self.request_hardware_repair)
def cleanup(self):
pass
def machine_install(self):
raise NotImplementedError('Machine install not implemented!')
def install(self, installableObject):
installableObject.install(self)
def get_autodir(self):
raise NotImplementedError('Get autodir not implemented!')
def set_autodir(self):
raise NotImplementedError('Set autodir not implemented!')
def start_loggers(self):
""" Called to start continuous host logging. """
pass
def stop_loggers(self):
""" Called to stop continuous host logging. """
pass
# some extra methods simplify the retrieval of information about the
# Host machine, with generic implementations based on run(). subclasses
# should feel free to override these if they can provide better
# implementations for their specific Host types
def get_num_cpu(self):
""" Get the number of CPUs in the host according to /proc/cpuinfo. """
proc_cpuinfo = self.run("cat /proc/cpuinfo",
stdout_tee=open('/dev/null', 'w')).stdout
cpus = 0
for line in proc_cpuinfo.splitlines():
if line.startswith('processor'):
cpus += 1
return cpus
def get_arch(self):
""" Get the hardware architecture of the remote machine. """
arch = self.run('/bin/uname -m').stdout.rstrip()
if re.match(r'i\d86$', arch):
arch = 'i386'
return arch
def get_kernel_ver(self):
""" Get the kernel version of the remote machine. """
return self.run('/bin/uname -r').stdout.rstrip()
def get_cmdline(self):
""" Get the kernel command line of the remote machine. """
return self.run('cat /proc/cmdline').stdout.rstrip()
def path_exists(self, path):
""" Determine if path exists on the remote machine. """
result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path),
ignore_status=True)
return result.exit_status == 0
# some extra helpers for doing job-related operations
def record(self, *args, **dargs):
""" Helper method for recording status logs against Host.job that
silently becomes a NOP if Host.job is not available. The args and
dargs are passed on to Host.job.record unchanged. """
if self.job:
self.job.record(*args, **dargs)
def log_kernel(self):
""" Helper method for logging kernel information into the status logs.
Intended for cases where the "current" kernel is not really defined
and we want to explicitly log it. Does nothing if this host isn't
actually associated with a job. """
if self.job:
kernel = self.get_kernel_ver()
self.job.record("INFO", None, None,
optional_fields={"kernel": kernel})
def log_reboot(self, reboot_func):
""" Decorator for wrapping a reboot in a group for status
logging purposes. The reboot_func parameter should be an actual
function that carries out the reboot.
"""
if self.job and not hasattr(self, "RUNNING_LOG_REBOOT"):
self.RUNNING_LOG_REBOOT = True
try:
self.job.run_reboot(reboot_func, self.get_kernel_ver)
finally:
del self.RUNNING_LOG_REBOOT
else:
reboot_func()
def request_hardware_repair(self):
""" Should somehow request (send a mail?) for hardware repairs on
this machine. The implementation can either return by raising the
special error.AutoservHardwareRepairRequestedError exception or can
try to wait until the machine is repaired and then return normally.
"""
raise NotImplementedError("request_hardware_repair not implemented")