base_classes.py revision f83a5a2a3e536e18ae4a43ed515fbde94aeba375
1# Copyright 2009 Google Inc. Released under the GPL v2 2 3""" 4This module defines the base classes for the Host hierarchy. 5 6Implementation details: 7You should import the "hosts" package instead of importing each type of host. 8 9 Host: a machine on which you can run programs 10""" 11 12__author__ = """ 13mbligh@google.com (Martin J. Bligh), 14poirier@google.com (Benjamin Poirier), 15stutsman@google.com (Ryan Stutsman) 16""" 17 18import os, re, time, cStringIO, logging 19 20from autotest_lib.client.common_lib import global_config, error, utils 21from autotest_lib.client.bin import partition 22 23 24class Host(object): 25 """ 26 This class represents a machine on which you can run programs. 27 28 It may be a local machine, the one autoserv is running on, a remote 29 machine or a virtual machine. 30 31 Implementation details: 32 This is an abstract class, leaf subclasses must implement the methods 33 listed here. You must not instantiate this class but should 34 instantiate one of those leaf subclasses. 35 36 When overriding methods that raise NotImplementedError, the leaf class 37 is fully responsible for the implementation and should not chain calls 38 to super. When overriding methods that are a NOP in Host, the subclass 39 should chain calls to super(). The criteria for fitting a new method into 40 one category or the other should be: 41 1. If two separate generic implementations could reasonably be 42 concatenated, then the abstract implementation should pass and 43 subclasses should chain calls to super. 44 2. If only one class could reasonably perform the stated function 45 (e.g. two separate run() implementations cannot both be executed) 46 then the method should raise NotImplementedError in Host, and 47 the implementor should NOT chain calls to super, to ensure that 48 only one implementation ever gets executed. 49 """ 50 51 job = None 52 DEFAULT_REBOOT_TIMEOUT = 1800 53 WAIT_DOWN_REBOOT_TIMEOUT = 840 54 WAIT_DOWN_REBOOT_WARNING = 540 55 HOURS_TO_WAIT_FOR_RECOVERY = 2.5 56 57 58 def __init__(self, *args, **dargs): 59 self._initialize(*args, **dargs) 60 61 62 def _initialize(self, *args, **dargs): 63 self._already_repaired = [] 64 self._removed_files = False 65 66 67 def close(self): 68 pass 69 70 71 def setup(self): 72 pass 73 74 75 def run(self, command, timeout=3600, ignore_status=False, 76 stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS, 77 stdin=None): 78 """ 79 Run a command on this host. 80 81 @param command: the command line string 82 @param timeout: time limit in seconds before attempting to 83 kill the running process. The run() function 84 will take a few seconds longer than 'timeout' 85 to complete if it has to kill the process. 86 @param ignore_status: do not raise an exception, no matter 87 what the exit code of the command is. 88 @param stdout_tee/stderr_tee: where to tee the stdout/stderr 89 @param stdin: stdin to pass to the executed process 90 91 @return a utils.CmdResult object 92 93 @raises AutotestHostRunError: the exit code of the command execution 94 was not 0 and ignore_status was not enabled 95 """ 96 raise NotImplementedError('Run not implemented!') 97 98 99 def run_output(self, command, *args, **dargs): 100 return self.run(command, *args, **dargs).stdout.rstrip() 101 102 103 def reboot(self): 104 raise NotImplementedError('Reboot not implemented!') 105 106 107 def sysrq_reboot(self): 108 raise NotImplementedError('Sysrq reboot not implemented!') 109 110 111 def reboot_setup(self, *args, **dargs): 112 pass 113 114 115 def reboot_followup(self, *args, **dargs): 116 pass 117 118 119 def get_file(self, source, dest, delete_dest=False): 120 raise NotImplementedError('Get file not implemented!') 121 122 123 def send_file(self, source, dest, delete_dest=False): 124 raise NotImplementedError('Send file not implemented!') 125 126 127 def get_tmp_dir(self): 128 raise NotImplementedError('Get temp dir not implemented!') 129 130 131 def is_up(self): 132 raise NotImplementedError('Is up not implemented!') 133 134 135 def is_shutting_down(self): 136 """ Indicates is a machine is currently shutting down. """ 137 runlevel = int(self.run("runlevel").stdout.strip().split()[1]) 138 return runlevel in (0, 6) 139 140 141 def get_wait_up_processes(self): 142 """ Gets the list of local processes to wait for in wait_up. """ 143 get_config = global_config.global_config.get_config_value 144 proc_list = get_config("HOSTS", "wait_up_processes", 145 default="").strip() 146 processes = set(p.strip() for p in proc_list.split(",")) 147 processes.discard("") 148 return processes 149 150 151 def wait_up(self, timeout=None): 152 raise NotImplementedError('Wait up not implemented!') 153 154 155 def wait_down(self, timeout=None, warning_timer=None): 156 raise NotImplementedError('Wait down not implemented!') 157 158 159 def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT, **dargs): 160 """ Wait for the host to come back from a reboot. This is a generic 161 implementation based entirely on wait_up and wait_down. """ 162 if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT, 163 warning_timer=self.WAIT_DOWN_REBOOT_WARNING): 164 self.record("ABORT", None, "reboot.verify", "shut down failed") 165 raise error.AutoservShutdownError("Host did not shut down") 166 167 self.wait_up(timeout) 168 time.sleep(2) # this is needed for complete reliability 169 if self.wait_up(timeout): 170 self.record("GOOD", None, "reboot.verify") 171 self.reboot_followup(**dargs) 172 else: 173 self.record("ABORT", None, "reboot.verify", 174 "Host did not return from reboot") 175 raise error.AutoservRebootError("Host did not return from reboot") 176 177 178 def verify(self): 179 pass 180 181 182 def verify_hardware(self): 183 pass 184 185 186 def verify_software(self): 187 pass 188 189 190 def check_diskspace(self, path, gb): 191 logging.info('Checking for >= %s GB of space under %s on machine %s', 192 gb, path, self.hostname) 193 df = self.run('df -mP %s | tail -1' % path).stdout.split() 194 free_space_gb = int(df[3])/1000.0 195 if free_space_gb < gb: 196 raise error.AutoservDiskFullHostError(path, gb, free_space_gb) 197 else: 198 logging.info('Found %s GB >= %s GB of space under %s on machine %s', 199 free_space_gb, gb, path, self.hostname) 200 201 202 def get_open_func(self, use_cache=True): 203 """ 204 Defines and returns a function that may be used instead of built-in 205 open() to open and read files. The returned function is implemented 206 by using self.run('cat <file>') and may cache the results for the same 207 filename. 208 209 @param use_cache Cache results of self.run('cat <filename>') for the 210 same filename 211 212 @return a function that can be used instead of built-in open() 213 """ 214 cached_files = {} 215 216 def open_func(filename): 217 if not use_cache or filename not in cached_files: 218 output = self.run('cat \'%s\'' % filename, 219 stdout_tee=open('/dev/null', 'w')).stdout 220 fd = cStringIO.StringIO(output) 221 222 if not use_cache: 223 return fd 224 225 cached_files[filename] = fd 226 else: 227 cached_files[filename].seek(0) 228 229 return cached_files[filename] 230 231 return open_func 232 233 234 def check_partitions(self, root_part, filter_func=None): 235 """ Compare the contents of /proc/partitions with those of 236 /proc/mounts and raise exception in case unmounted partitions are found 237 238 root_part: in Linux /proc/mounts will never directly mention the root 239 partition as being mounted on / instead it will say that /dev/root is 240 mounted on /. Thus require this argument to filter out the root_part 241 from the ones checked to be mounted 242 243 filter_func: unnary predicate for additional filtering out of 244 partitions required to be mounted 245 246 Raise: error.AutoservHostError if unfiltered unmounted partition found 247 """ 248 249 print 'Checking if non-swap partitions are mounted...' 250 251 unmounted = partition.get_unmounted_partition_list(root_part, 252 filter_func=filter_func, open_func=self.get_open_func()) 253 if unmounted: 254 raise error.AutoservNotMountedHostError( 255 'Found unmounted partitions: %s' % 256 [part.device for part in unmounted]) 257 258 259 def _repair_wait_for_reboot(self): 260 TIMEOUT = int(self.HOURS_TO_WAIT_FOR_RECOVERY * 3600) 261 if self.is_shutting_down(): 262 logging.info('Host is shutting down, waiting for a restart') 263 self.wait_for_restart(TIMEOUT) 264 else: 265 self.wait_up(TIMEOUT) 266 267 268 def _get_mountpoint(self, path): 269 """Given a "path" get the mount point of the filesystem containing 270 that path.""" 271 code = ('import os\n' 272 # sanitize the path and resolve symlinks 273 'path = os.path.realpath(%r)\n' 274 "while path != '/' and not os.path.ismount(path):\n" 275 ' path, _ = os.path.split(path)\n' 276 'print path\n') % path 277 return self.run('python2.4 -c "%s"' % code, 278 stdout_tee=open(os.devnull, 'w')).stdout.rstrip() 279 280 281 def erase_dir_contents(self, path, ignore_status=True, timeout=3600): 282 """Empty a given directory path contents.""" 283 rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf' 284 self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout) 285 self._removed_files = True 286 287 288 def repair_full_disk(self, mountpoint): 289 # it's safe to remove /tmp and /var/tmp, site specific overrides may 290 # want to remove some other places too 291 if mountpoint == self._get_mountpoint('/tmp'): 292 self.erase_dir_contents('/tmp') 293 294 if mountpoint == self._get_mountpoint('/var/tmp'): 295 self.erase_dir_contents('/var/tmp') 296 297 298 def _call_repair_func(self, err, func, *args, **dargs): 299 for old_call in self._already_repaired: 300 if old_call == (func, args, dargs): 301 # re-raising the original exception because surrounding 302 # error handling may want to try other ways to fix it 303 logging.warn('Already done this (%s) repair procedure, ' 304 're-raising the original exception.', func) 305 raise err 306 307 try: 308 func(*args, **dargs) 309 except error.AutoservHardwareRepairRequestedError: 310 # let this special exception propagate 311 raise 312 except error.AutoservError: 313 logging.exception('Repair failed but continuing in case it managed' 314 ' to repair enough') 315 316 self._already_repaired.append((func, args, dargs)) 317 318 319 def repair_filesystem_only(self): 320 """perform file system repairs only""" 321 while True: 322 # try to repair specific problems 323 try: 324 logging.info('Running verify to find failures to repair...') 325 self.verify() 326 if self._removed_files: 327 logging.info('Removed files, rebooting to release the' 328 ' inodes') 329 self.reboot() 330 return # verify succeeded, then repair succeeded 331 except error.AutoservHostIsShuttingDownError, err: 332 logging.exception('verify failed') 333 self._call_repair_func(err, self._repair_wait_for_reboot) 334 except error.AutoservDiskFullHostError, err: 335 logging.exception('verify failed') 336 self._call_repair_func(err, self.repair_full_disk, 337 self._get_mountpoint(err.path)) 338 339 340 def repair_software_only(self): 341 """perform software repairs only""" 342 while True: 343 try: 344 self.repair_filesystem_only() 345 break 346 except (error.AutoservSshPingHostError, error.AutoservSSHTimeout, 347 error.AutoservSshPermissionDeniedError, 348 error.AutoservDiskFullHostError), err: 349 logging.exception('verify failed') 350 logging.info('Trying to reinstall the machine') 351 self._call_repair_func(err, self.machine_install) 352 353 354 def repair_full(self): 355 while True: 356 try: 357 self.repair_software_only() 358 break 359 except error.AutoservHardwareHostError, err: 360 logging.exception('verify failed') 361 # software repair failed, try hardware repair 362 logging.info('Hardware problem found, ' 363 'requesting hardware repairs') 364 self._call_repair_func(err, self.request_hardware_repair) 365 366 367 def cleanup(self): 368 pass 369 370 371 def machine_install(self): 372 raise NotImplementedError('Machine install not implemented!') 373 374 375 def install(self, installableObject): 376 installableObject.install(self) 377 378 379 def get_autodir(self): 380 raise NotImplementedError('Get autodir not implemented!') 381 382 383 def set_autodir(self): 384 raise NotImplementedError('Set autodir not implemented!') 385 386 387 def start_loggers(self): 388 """ Called to start continuous host logging. """ 389 pass 390 391 392 def stop_loggers(self): 393 """ Called to stop continuous host logging. """ 394 pass 395 396 397 # some extra methods simplify the retrieval of information about the 398 # Host machine, with generic implementations based on run(). subclasses 399 # should feel free to override these if they can provide better 400 # implementations for their specific Host types 401 402 def get_num_cpu(self): 403 """ Get the number of CPUs in the host according to /proc/cpuinfo. """ 404 proc_cpuinfo = self.run('cat /proc/cpuinfo', 405 stdout_tee=open(os.devnull, 'w')).stdout 406 cpus = 0 407 for line in proc_cpuinfo.splitlines(): 408 if line.startswith('processor'): 409 cpus += 1 410 return cpus 411 412 413 def get_arch(self): 414 """ Get the hardware architecture of the remote machine. """ 415 arch = self.run('/bin/uname -m').stdout.rstrip() 416 if re.match(r'i\d86$', arch): 417 arch = 'i386' 418 return arch 419 420 421 def get_kernel_ver(self): 422 """ Get the kernel version of the remote machine. """ 423 return self.run('/bin/uname -r').stdout.rstrip() 424 425 426 def get_cmdline(self): 427 """ Get the kernel command line of the remote machine. """ 428 return self.run('cat /proc/cmdline').stdout.rstrip() 429 430 431 def path_exists(self, path): 432 """ Determine if path exists on the remote machine. """ 433 result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path), 434 ignore_status=True) 435 return result.exit_status == 0 436 437 438 # some extra helpers for doing job-related operations 439 440 def record(self, *args, **dargs): 441 """ Helper method for recording status logs against Host.job that 442 silently becomes a NOP if Host.job is not available. The args and 443 dargs are passed on to Host.job.record unchanged. """ 444 if self.job: 445 self.job.record(*args, **dargs) 446 447 448 def log_kernel(self): 449 """ Helper method for logging kernel information into the status logs. 450 Intended for cases where the "current" kernel is not really defined 451 and we want to explicitly log it. Does nothing if this host isn't 452 actually associated with a job. """ 453 if self.job: 454 kernel = self.get_kernel_ver() 455 self.job.record("INFO", None, None, 456 optional_fields={"kernel": kernel}) 457 458 459 def log_reboot(self, reboot_func): 460 """ Decorator for wrapping a reboot in a group for status 461 logging purposes. The reboot_func parameter should be an actual 462 function that carries out the reboot. 463 """ 464 if self.job and not hasattr(self, "RUNNING_LOG_REBOOT"): 465 self.RUNNING_LOG_REBOOT = True 466 try: 467 self.job.run_reboot(reboot_func, self.get_kernel_ver) 468 finally: 469 del self.RUNNING_LOG_REBOOT 470 else: 471 reboot_func() 472 473 474 def request_hardware_repair(self): 475 """ Should somehow request (send a mail?) for hardware repairs on 476 this machine. The implementation can either return by raising the 477 special error.AutoservHardwareRepairRequestedError exception or can 478 try to wait until the machine is repaired and then return normally. 479 """ 480 raise NotImplementedError("request_hardware_repair not implemented") 481