base_classes.py revision 3649141cea5fda511978483ddf8709221f4c57fd
1# Copyright 2009 Google Inc. Released under the GPL v2 2 3""" 4This module defines the base classes for the Host hierarchy. 5 6Implementation details: 7You should import the "hosts" package instead of importing each type of host. 8 9 Host: a machine on which you can run programs 10""" 11 12__author__ = """ 13mbligh@google.com (Martin J. Bligh), 14poirier@google.com (Benjamin Poirier), 15stutsman@google.com (Ryan Stutsman) 16""" 17 18import cPickle, cStringIO, logging, os, re, time 19 20from autotest_lib.client.common_lib import global_config, error, utils 21from autotest_lib.client.common_lib import host_protections 22from autotest_lib.client.bin import partition 23 24 25class Host(object): 26 """ 27 This class represents a machine on which you can run programs. 28 29 It may be a local machine, the one autoserv is running on, a remote 30 machine or a virtual machine. 31 32 Implementation details: 33 This is an abstract class, leaf subclasses must implement the methods 34 listed here. You must not instantiate this class but should 35 instantiate one of those leaf subclasses. 36 37 When overriding methods that raise NotImplementedError, the leaf class 38 is fully responsible for the implementation and should not chain calls 39 to super. When overriding methods that are a NOP in Host, the subclass 40 should chain calls to super(). The criteria for fitting a new method into 41 one category or the other should be: 42 1. If two separate generic implementations could reasonably be 43 concatenated, then the abstract implementation should pass and 44 subclasses should chain calls to super. 45 2. If only one class could reasonably perform the stated function 46 (e.g. two separate run() implementations cannot both be executed) 47 then the method should raise NotImplementedError in Host, and 48 the implementor should NOT chain calls to super, to ensure that 49 only one implementation ever gets executed. 50 """ 51 52 job = None 53 DEFAULT_REBOOT_TIMEOUT = 1800 54 WAIT_DOWN_REBOOT_TIMEOUT = 840 55 WAIT_DOWN_REBOOT_WARNING = 540 56 HOURS_TO_WAIT_FOR_RECOVERY = 2.5 57 58 59 def __init__(self, *args, **dargs): 60 self._initialize(*args, **dargs) 61 62 63 def _initialize(self, *args, **dargs): 64 self._already_repaired = [] 65 self._removed_files = False 66 67 68 def close(self): 69 pass 70 71 72 def setup(self): 73 pass 74 75 76 def run(self, command, timeout=3600, ignore_status=False, 77 stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS, 78 stdin=None, args=()): 79 """ 80 Run a command on this host. 81 82 @param command: the command line string 83 @param timeout: time limit in seconds before attempting to 84 kill the running process. The run() function 85 will take a few seconds longer than 'timeout' 86 to complete if it has to kill the process. 87 @param ignore_status: do not raise an exception, no matter 88 what the exit code of the command is. 89 @param stdout_tee/stderr_tee: where to tee the stdout/stderr 90 @param stdin: stdin to pass (a string) to the executed command 91 @param args: sequence of strings to pass as arguments to command by 92 quoting them in " and escaping their contents if necessary 93 94 @return a utils.CmdResult object 95 96 @raises AutotestHostRunError: the exit code of the command execution 97 was not 0 and ignore_status was not enabled 98 """ 99 raise NotImplementedError('Run not implemented!') 100 101 102 def run_output(self, command, *args, **dargs): 103 return self.run(command, *args, **dargs).stdout.rstrip() 104 105 106 def reboot(self): 107 raise NotImplementedError('Reboot not implemented!') 108 109 110 def sysrq_reboot(self): 111 raise NotImplementedError('Sysrq reboot not implemented!') 112 113 114 def reboot_setup(self, *args, **dargs): 115 pass 116 117 118 def reboot_followup(self, *args, **dargs): 119 pass 120 121 122 def get_file(self, source, dest, delete_dest=False): 123 raise NotImplementedError('Get file not implemented!') 124 125 126 def send_file(self, source, dest, delete_dest=False): 127 raise NotImplementedError('Send file not implemented!') 128 129 130 def get_tmp_dir(self): 131 raise NotImplementedError('Get temp dir not implemented!') 132 133 134 def is_up(self): 135 raise NotImplementedError('Is up not implemented!') 136 137 138 def is_shutting_down(self): 139 """ Indicates is a machine is currently shutting down. """ 140 # runlevel() may not be available, so wrap it in try block. 141 try: 142 runlevel = int(self.run("runlevel").stdout.strip().split()[1]) 143 return runlevel in (0, 6) 144 except: 145 return False 146 147 148 def get_wait_up_processes(self): 149 """ Gets the list of local processes to wait for in wait_up. """ 150 get_config = global_config.global_config.get_config_value 151 proc_list = get_config("HOSTS", "wait_up_processes", 152 default="").strip() 153 processes = set(p.strip() for p in proc_list.split(",")) 154 processes.discard("") 155 return processes 156 157 158 def get_boot_id(self, timeout=60): 159 """ Get a unique ID associated with the current boot. 160 161 Should return a string with the semantics such that two separate 162 calls to Host.get_boot_id() return the same string if the host did 163 not reboot between the two calls, and two different strings if it 164 has rebooted at least once between the two calls. 165 166 @param timeout The number of seconds to wait before timing out. 167 168 @return A string unique to this boot.""" 169 return self.run('cat /proc/sys/kernel/random/boot_id', 170 timeout=timeout).stdout.strip() 171 172 173 def wait_up(self, timeout=None): 174 raise NotImplementedError('Wait up not implemented!') 175 176 177 def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None): 178 raise NotImplementedError('Wait down not implemented!') 179 180 181 def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT, 182 log_failure=True, old_boot_id=None, **dargs): 183 """ Wait for the host to come back from a reboot. This is a generic 184 implementation based entirely on wait_up and wait_down. """ 185 if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT, 186 warning_timer=self.WAIT_DOWN_REBOOT_WARNING, 187 old_boot_id=old_boot_id): 188 if log_failure: 189 self.record("ABORT", None, "reboot.verify", "shut down failed") 190 raise error.AutoservShutdownError("Host did not shut down") 191 192 self.wait_up(timeout) 193 time.sleep(2) # this is needed for complete reliability 194 if self.wait_up(timeout): 195 self.record("GOOD", None, "reboot.verify") 196 self.reboot_followup(**dargs) 197 else: 198 self.record("ABORT", None, "reboot.verify", 199 "Host did not return from reboot") 200 raise error.AutoservRebootError("Host did not return from reboot") 201 202 203 def verify(self): 204 self.verify_hardware() 205 self.verify_connectivity() 206 self.verify_software() 207 208 209 def verify_hardware(self): 210 pass 211 212 213 def verify_connectivity(self): 214 pass 215 216 217 def verify_software(self): 218 pass 219 220 221 def check_diskspace(self, path, gb): 222 logging.info('Checking for >= %s GB of space under %s on machine %s', 223 gb, path, self.hostname) 224 df = self.run('df -mP %s | tail -1' % path).stdout.split() 225 free_space_gb = int(df[3])/1000.0 226 if free_space_gb < gb: 227 raise error.AutoservDiskFullHostError(path, gb, free_space_gb) 228 else: 229 logging.info('Found %s GB >= %s GB of space under %s on machine %s', 230 free_space_gb, gb, path, self.hostname) 231 232 233 def get_open_func(self, use_cache=True): 234 """ 235 Defines and returns a function that may be used instead of built-in 236 open() to open and read files. The returned function is implemented 237 by using self.run('cat <file>') and may cache the results for the same 238 filename. 239 240 @param use_cache Cache results of self.run('cat <filename>') for the 241 same filename 242 243 @return a function that can be used instead of built-in open() 244 """ 245 cached_files = {} 246 247 def open_func(filename): 248 if not use_cache or filename not in cached_files: 249 output = self.run('cat \'%s\'' % filename, 250 stdout_tee=open('/dev/null', 'w')).stdout 251 fd = cStringIO.StringIO(output) 252 253 if not use_cache: 254 return fd 255 256 cached_files[filename] = fd 257 else: 258 cached_files[filename].seek(0) 259 260 return cached_files[filename] 261 262 return open_func 263 264 265 def check_partitions(self, root_part, filter_func=None): 266 """ Compare the contents of /proc/partitions with those of 267 /proc/mounts and raise exception in case unmounted partitions are found 268 269 root_part: in Linux /proc/mounts will never directly mention the root 270 partition as being mounted on / instead it will say that /dev/root is 271 mounted on /. Thus require this argument to filter out the root_part 272 from the ones checked to be mounted 273 274 filter_func: unnary predicate for additional filtering out of 275 partitions required to be mounted 276 277 Raise: error.AutoservHostError if unfiltered unmounted partition found 278 """ 279 280 print 'Checking if non-swap partitions are mounted...' 281 282 unmounted = partition.get_unmounted_partition_list(root_part, 283 filter_func=filter_func, open_func=self.get_open_func()) 284 if unmounted: 285 raise error.AutoservNotMountedHostError( 286 'Found unmounted partitions: %s' % 287 [part.device for part in unmounted]) 288 289 290 def _repair_wait_for_reboot(self): 291 TIMEOUT = int(self.HOURS_TO_WAIT_FOR_RECOVERY * 3600) 292 if self.is_shutting_down(): 293 logging.info('Host is shutting down, waiting for a restart') 294 self.wait_for_restart(TIMEOUT) 295 else: 296 self.wait_up(TIMEOUT) 297 298 299 def _get_mountpoint(self, path): 300 """Given a "path" get the mount point of the filesystem containing 301 that path.""" 302 code = ('import os\n' 303 # sanitize the path and resolve symlinks 304 'path = os.path.realpath(%r)\n' 305 "while path != '/' and not os.path.ismount(path):\n" 306 ' path, _ = os.path.split(path)\n' 307 'print path\n') % path 308 return self.run('python -c "%s"' % code, 309 stdout_tee=open(os.devnull, 'w')).stdout.rstrip() 310 311 312 def erase_dir_contents(self, path, ignore_status=True, timeout=3600): 313 """Empty a given directory path contents.""" 314 rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf' 315 self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout) 316 self._removed_files = True 317 318 319 def repair_full_disk(self, mountpoint): 320 # it's safe to remove /tmp and /var/tmp, site specific overrides may 321 # want to remove some other places too 322 if mountpoint == self._get_mountpoint('/tmp'): 323 self.erase_dir_contents('/tmp') 324 325 if mountpoint == self._get_mountpoint('/var/tmp'): 326 self.erase_dir_contents('/var/tmp') 327 328 329 def _call_repair_func(self, err, func, *args, **dargs): 330 for old_call in self._already_repaired: 331 if old_call == (func, args, dargs): 332 # re-raising the original exception because surrounding 333 # error handling may want to try other ways to fix it 334 logging.warn('Already done this (%s) repair procedure, ' 335 're-raising the original exception.', func) 336 raise err 337 338 try: 339 func(*args, **dargs) 340 except error.AutoservHardwareRepairRequestedError: 341 # let this special exception propagate 342 raise 343 except error.AutoservError: 344 logging.exception('Repair failed but continuing in case it managed' 345 ' to repair enough') 346 347 self._already_repaired.append((func, args, dargs)) 348 349 350 def repair_filesystem_only(self): 351 """perform file system repairs only""" 352 while True: 353 # try to repair specific problems 354 try: 355 logging.info('Running verify to find failures to repair...') 356 self.verify() 357 if self._removed_files: 358 logging.info('Removed files, rebooting to release the' 359 ' inodes') 360 self.reboot() 361 return # verify succeeded, then repair succeeded 362 except error.AutoservHostIsShuttingDownError, err: 363 logging.exception('verify failed') 364 self._call_repair_func(err, self._repair_wait_for_reboot) 365 except error.AutoservDiskFullHostError, err: 366 logging.exception('verify failed') 367 self._call_repair_func(err, self.repair_full_disk, 368 self._get_mountpoint(err.path)) 369 370 371 def repair_software_only(self): 372 """perform software repairs only""" 373 while True: 374 try: 375 self.repair_filesystem_only() 376 break 377 except (error.AutoservSshPingHostError, error.AutoservSSHTimeout, 378 error.AutoservSshPermissionDeniedError, 379 error.AutoservDiskFullHostError), err: 380 logging.exception('verify failed') 381 logging.info('Trying to reinstall the machine') 382 self._call_repair_func(err, self.machine_install) 383 384 385 def repair_full(self): 386 while True: 387 try: 388 self.repair_software_only() 389 break 390 except error.AutoservHardwareHostError, err: 391 logging.exception('verify failed') 392 # software repair failed, try hardware repair 393 logging.info('Hardware problem found, ' 394 'requesting hardware repairs') 395 self._call_repair_func(err, self.request_hardware_repair) 396 397 398 def repair_with_protection(self, protection_level): 399 """Perform the maximal amount of repair within the specified 400 protection level. 401 402 @param protection_level: the protection level to use for limiting 403 repairs, a host_protections.Protection 404 """ 405 protection = host_protections.Protection 406 if protection_level == protection.DO_NOT_REPAIR: 407 logging.info('Protection is "Do not repair" so just verifying') 408 self.verify() 409 elif protection_level == protection.REPAIR_FILESYSTEM_ONLY: 410 logging.info('Attempting filesystem-only repair') 411 self.repair_filesystem_only() 412 elif protection_level == protection.REPAIR_SOFTWARE_ONLY: 413 logging.info('Attempting software repair only') 414 self.repair_software_only() 415 elif protection_level == protection.NO_PROTECTION: 416 logging.info('Attempting full repair') 417 self.repair_full() 418 else: 419 raise NotImplementedError('Unknown host protection level %s' 420 % protection_level) 421 422 423 def disable_ipfilters(self): 424 """Allow all network packets in and out of the host.""" 425 self.run('iptables-save > /tmp/iptable-rules') 426 self.run('iptables -P INPUT ACCEPT') 427 self.run('iptables -P FORWARD ACCEPT') 428 self.run('iptables -P OUTPUT ACCEPT') 429 430 431 def enable_ipfilters(self): 432 """Re-enable the IP filters disabled from disable_ipfilters()""" 433 if os.path.isfile('/tmp/iptable-rules'): 434 self.run('iptables-restore < /tmp/iptable-rules') 435 436 437 def cleanup(self): 438 pass 439 440 441 def machine_install(self): 442 raise NotImplementedError('Machine install not implemented!') 443 444 445 def install(self, installableObject): 446 installableObject.install(self) 447 448 449 def get_autodir(self): 450 raise NotImplementedError('Get autodir not implemented!') 451 452 453 def set_autodir(self): 454 raise NotImplementedError('Set autodir not implemented!') 455 456 457 def start_loggers(self): 458 """ Called to start continuous host logging. """ 459 pass 460 461 462 def stop_loggers(self): 463 """ Called to stop continuous host logging. """ 464 pass 465 466 467 # some extra methods simplify the retrieval of information about the 468 # Host machine, with generic implementations based on run(). subclasses 469 # should feel free to override these if they can provide better 470 # implementations for their specific Host types 471 472 def get_num_cpu(self): 473 """ Get the number of CPUs in the host according to /proc/cpuinfo. """ 474 proc_cpuinfo = self.run('cat /proc/cpuinfo', 475 stdout_tee=open(os.devnull, 'w')).stdout 476 cpus = 0 477 for line in proc_cpuinfo.splitlines(): 478 if line.startswith('processor'): 479 cpus += 1 480 return cpus 481 482 483 def get_arch(self): 484 """ Get the hardware architecture of the remote machine. """ 485 arch = self.run('/bin/uname -m').stdout.rstrip() 486 if re.match(r'i\d86$', arch): 487 arch = 'i386' 488 return arch 489 490 491 def get_kernel_ver(self): 492 """ Get the kernel version of the remote machine. """ 493 return self.run('/bin/uname -r').stdout.rstrip() 494 495 496 def get_cmdline(self): 497 """ Get the kernel command line of the remote machine. """ 498 return self.run('cat /proc/cmdline').stdout.rstrip() 499 500 501 def path_exists(self, path): 502 """ Determine if path exists on the remote machine. """ 503 result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path), 504 ignore_status=True) 505 return result.exit_status == 0 506 507 508 # some extra helpers for doing job-related operations 509 510 def record(self, *args, **dargs): 511 """ Helper method for recording status logs against Host.job that 512 silently becomes a NOP if Host.job is not available. The args and 513 dargs are passed on to Host.job.record unchanged. """ 514 if self.job: 515 self.job.record(*args, **dargs) 516 517 518 def log_kernel(self): 519 """ Helper method for logging kernel information into the status logs. 520 Intended for cases where the "current" kernel is not really defined 521 and we want to explicitly log it. Does nothing if this host isn't 522 actually associated with a job. """ 523 if self.job: 524 kernel = self.get_kernel_ver() 525 self.job.record("INFO", None, None, 526 optional_fields={"kernel": kernel}) 527 528 529 def log_reboot(self, reboot_func): 530 """ Decorator for wrapping a reboot in a group for status 531 logging purposes. The reboot_func parameter should be an actual 532 function that carries out the reboot. 533 """ 534 if self.job and not hasattr(self, "RUNNING_LOG_REBOOT"): 535 self.RUNNING_LOG_REBOOT = True 536 try: 537 self.job.run_reboot(reboot_func, self.get_kernel_ver) 538 finally: 539 del self.RUNNING_LOG_REBOOT 540 else: 541 reboot_func() 542 543 544 def request_hardware_repair(self): 545 """ Should somehow request (send a mail?) for hardware repairs on 546 this machine. The implementation can either return by raising the 547 special error.AutoservHardwareRepairRequestedError exception or can 548 try to wait until the machine is repaired and then return normally. 549 """ 550 raise NotImplementedError("request_hardware_repair not implemented") 551 552 553 def list_files_glob(self, glob): 554 """ 555 Get a list of files on a remote host given a glob pattern path. 556 """ 557 SCRIPT = ("python -c 'import cPickle, glob, sys;" 558 "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'") 559 output = self.run(SCRIPT, args=(glob,), stdout_tee=None, 560 timeout=60).stdout 561 return cPickle.loads(output) 562 563 564 def symlink_closure(self, paths): 565 """ 566 Given a sequence of path strings, return the set of all paths that 567 can be reached from the initial set by following symlinks. 568 569 @param paths: sequence of path strings. 570 @return: a sequence of path strings that are all the unique paths that 571 can be reached from the given ones after following symlinks. 572 """ 573 SCRIPT = ("python -c 'import cPickle, os, sys\n" 574 "paths = cPickle.load(sys.stdin)\n" 575 "closure = {}\n" 576 "while paths:\n" 577 " path = paths.keys()[0]\n" 578 " del paths[path]\n" 579 " if not os.path.exists(path):\n" 580 " continue\n" 581 " closure[path] = None\n" 582 " if os.path.islink(path):\n" 583 " link_to = os.path.join(os.path.dirname(path),\n" 584 " os.readlink(path))\n" 585 " if link_to not in closure.keys():\n" 586 " paths[link_to] = None\n" 587 "cPickle.dump(closure.keys(), sys.stdout, 0)'") 588 input_data = cPickle.dumps(dict((path, None) for path in paths), 0) 589 output = self.run(SCRIPT, stdout_tee=None, stdin=input_data, 590 timeout=60).stdout 591 return cPickle.loads(output) 592 593 594 def cleanup_kernels(self, boot_dir='/boot'): 595 """ 596 Remove any kernel image and associated files (vmlinux, system.map, 597 modules) for any image found in the boot directory that is not 598 referenced by entries in the bootloader configuration. 599 600 @param boot_dir: boot directory path string, default '/boot' 601 """ 602 # find all the vmlinuz images referenced by the bootloader 603 vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-') 604 boot_info = self.bootloader.get_entries() 605 used_kernver = [boot['kernel'][len(vmlinuz_prefix):] 606 for boot in boot_info.itervalues()] 607 608 # find all the unused vmlinuz images in /boot 609 all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*') 610 used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver 611 for kernver in used_kernver) 612 unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz) 613 614 # find all the unused vmlinux images in /boot 615 vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-') 616 all_vmlinux = self.list_files_glob(vmlinux_prefix + '*') 617 used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver 618 for kernver in used_kernver) 619 unused_vmlinux = set(all_vmlinux) - set(used_vmlinux) 620 621 # find all the unused System.map files in /boot 622 systemmap_prefix = os.path.join(boot_dir, 'System.map-') 623 all_system_map = self.list_files_glob(systemmap_prefix + '*') 624 used_system_map = self.symlink_closure( 625 systemmap_prefix + kernver for kernver in used_kernver) 626 unused_system_map = set(all_system_map) - set(used_system_map) 627 628 # find all the module directories associated with unused kernels 629 modules_prefix = '/lib/modules/' 630 all_moddirs = [dir for dir in self.list_files_glob(modules_prefix + '*') 631 if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', dir)] 632 used_moddirs = self.symlink_closure(modules_prefix + kernver 633 for kernver in used_kernver) 634 unused_moddirs = set(all_moddirs) - set(used_moddirs) 635 636 # remove all the vmlinuz files we don't use 637 # TODO: if needed this should become package manager agnostic 638 for vmlinuz in unused_vmlinuz: 639 # try and get an rpm package name 640 rpm = self.run('rpm -qf', args=(vmlinuz,), 641 ignore_status=True, timeout=120) 642 if rpm.exit_status == 0: 643 packages = set(line.strip() for line in 644 rpm.stdout.splitlines()) 645 # if we found some package names, try to remove them 646 for package in packages: 647 self.run('rpm -e', args=(package,), 648 ignore_status=True, timeout=120) 649 # remove the image files anyway, even if rpm didn't 650 self.run('rm -f', args=(vmlinuz,), 651 ignore_status=True, timeout=120) 652 653 # remove all the vmlinux and System.map files left over 654 for f in (unused_vmlinux | unused_system_map): 655 self.run('rm -f', args=(f,), 656 ignore_status=True, timeout=120) 657 658 # remove all unused module directories 659 # the regex match should keep us safe from removing the wrong files 660 for moddir in unused_moddirs: 661 self.run('rm -fr', args=(moddir,), ignore_status=True) 662