base_classes.py revision c035491ba24efea9e4343982f0a7c4b92e0a8c72
# Copyright 2009 Google Inc. Released under the GPL v2

"""
This module defines the base classes for the Host hierarchy.

Implementation details:
You should import the "hosts" package instead of importing each type of host.

        Host: a machine on which you can run programs
"""

__author__ = """
mbligh@google.com (Martin J. Bligh),
poirier@google.com (Benjamin Poirier),
stutsman@google.com (Ryan Stutsman)
"""

import cPickle, cStringIO, logging, os, re, time

from autotest_lib.client.common_lib import global_config, error, utils
from autotest_lib.client.common_lib import host_protections
from autotest_lib.client.bin import partition


class Host(object):
    """
    This class represents a machine on which you can run programs.

    It may be a local machine, the one autoserv is running on, a remote
    machine or a virtual machine.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here. You must not instantiate this class but should
    instantiate one of those leaf subclasses.

    When overriding methods that raise NotImplementedError, the leaf class
    is fully responsible for the implementation and should not chain calls
    to super. When overriding methods that are a NOP in Host, the subclass
    should chain calls to super(). The criteria for fitting a new method into
    one category or the other should be:
        1. If two separate generic implementations could reasonably be
           concatenated, then the abstract implementation should pass and
           subclasses should chain calls to super.
        2. If only one class could reasonably perform the stated function
           (e.g. two separate run() implementations cannot both be executed)
           then the method should raise NotImplementedError in Host, and
           the implementor should NOT chain calls to super, to ensure that
           only one implementation ever gets executed.
    """

    # job is assigned externally by the server framework; while it is None
    # the status-logging helpers (record, log_kernel, log_reboot) are NOPs
    job = None
    DEFAULT_REBOOT_TIMEOUT = 1800       # seconds to wait for a full reboot
    WAIT_DOWN_REBOOT_TIMEOUT = 840      # seconds to wait for shutdown phase
    WAIT_DOWN_REBOOT_WARNING = 540      # warn if shutdown exceeds this
    HOURS_TO_WAIT_FOR_RECOVERY = 2.5    # repair-time wait for host recovery


    def __init__(self, *args, **dargs):
        # all real construction is delegated to _initialize so that
        # cooperative multiple-inheritance subclasses can chain it cleanly
        self._initialize(*args, **dargs)


    def _initialize(self, *args, **dargs):
        # (func, args, dargs) tuples of repair actions already attempted;
        # used by _call_repair_func to avoid retrying a failed repair
        self._already_repaired = []
        # set by erase_dir_contents; repair_filesystem_only reboots when
        # True so that the freed inodes are actually released
        self._removed_files = False


    def close(self):
        """ Release any resources held open for this host. NOP here;
        subclasses chain calls to super(). """
        pass


    def setup(self):
        """ Perform any one-time preparation of the host. NOP here;
        subclasses chain calls to super(). """
        pass


    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee/stderr_tee: where to tee the stdout/stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        """
        raise NotImplementedError('Run not implemented!')


    def run_output(self, command, *args, **dargs):
        """ Run a command and return its stdout with trailing whitespace
        stripped. Extra args/dargs are forwarded to run(). """
        return self.run(command, *args, **dargs).stdout.rstrip()


    def reboot(self):
        """ Reboot the host. Abstract; leaf subclasses must implement. """
        raise NotImplementedError('Reboot not implemented!')


    def sysrq_reboot(self):
        """ Force a reboot (e.g. via magic sysrq). Abstract. """
        raise NotImplementedError('Sysrq reboot not implemented!')


    def reboot_setup(self, *args, **dargs):
        """ Hook called before a reboot. NOP here; subclasses chain to
        super(). """
        pass


    def reboot_followup(self, *args, **dargs):
        """ Hook called after a successful reboot. NOP here; subclasses
        chain to super(). """
        pass


    def get_file(self, source, dest, delete_dest=False):
        """ Copy a file from the host to the local machine. Abstract. """
        raise NotImplementedError('Get file not implemented!')


    def send_file(self, source, dest, delete_dest=False):
        """ Copy a file from the local machine to the host. Abstract. """
        raise NotImplementedError('Send file not implemented!')


    def get_tmp_dir(self):
        """ Create and return a temporary directory on the host. Abstract. """
        raise NotImplementedError('Get temp dir not implemented!')


    def is_up(self):
        """ Indicate whether the host is up and usable. Abstract. """
        raise NotImplementedError('Is up not implemented!')


    def is_shutting_down(self):
        """ Indicates is a machine is currently shutting down. """
        # 'runlevel' prints "<previous> <current>"; take the current one.
        # Runlevels 0 (halt) and 6 (reboot) mean a shutdown is in progress.
        runlevel = int(self.run("runlevel").stdout.strip().split()[1])
        return runlevel in (0, 6)


    def get_wait_up_processes(self):
        """ Gets the list of local processes to wait for in wait_up. """
        get_config = global_config.global_config.get_config_value
        proc_list = get_config("HOSTS", "wait_up_processes",
                               default="").strip()
        # comma-separated config value; discard empty entries produced by
        # an empty or trailing-comma setting
        processes = set(p.strip() for p in proc_list.split(","))
        processes.discard("")
        return processes


    def get_boot_id(self, timeout=60):
        """ Get a unique ID associated with the current boot.

        Should return a string with the semantics such that two separate
        calls to Host.get_boot_id() return the same string if the host did
        not reboot between the two calls, and two different strings if it
        has rebooted at least once between the two calls.

        @param timeout The number of seconds to wait before timing out.

        @return A string unique to this boot."""
        # the kernel regenerates this UUID on every boot
        return self.run('cat /proc/sys/kernel/random/boot_id',
                        timeout=timeout).stdout.strip()


    def wait_up(self, timeout=None):
        """ Wait for the host to come up. Abstract. """
        raise NotImplementedError('Wait up not implemented!')


    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """ Wait for the host to go down. Abstract. """
        raise NotImplementedError('Wait down not implemented!')


    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
                         log_failure=True, old_boot_id=None, **dargs):
        """ Wait for the host to come back from a reboot. This is a generic
        implementation based entirely on wait_up and wait_down.

        @param timeout: seconds to wait for the host to come back up
        @param log_failure: if True, record ABORT status lines on failure
        @param old_boot_id: boot id from before the reboot, passed on to
                wait_down so it can detect the reboot via a boot-id change

        @raises AutoservShutdownError: the host never went down
        @raises AutoservRebootError: the host went down but never came back
        """
        if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT,
                              warning_timer=self.WAIT_DOWN_REBOOT_WARNING,
                              old_boot_id=old_boot_id):
            if log_failure:
                self.record("ABORT", None, "reboot.verify", "shut down failed")
            raise error.AutoservShutdownError("Host did not shut down")

        # wait for the host to come up, let it settle briefly, then verify
        # it is (still) up before declaring the reboot good
        self.wait_up(timeout)
        time.sleep(2)    # this is needed for complete reliability
        if self.wait_up(timeout):
            self.record("GOOD", None, "reboot.verify")
            self.reboot_followup(**dargs)
        else:
            self.record("ABORT", None, "reboot.verify",
                        "Host did not return from reboot")
            raise error.AutoservRebootError("Host did not return from reboot")


    def verify(self):
        """ Verify the host is healthy: hardware, connectivity, software. """
        self.verify_hardware()
        self.verify_connectivity()
        self.verify_software()


    def verify_hardware(self):
        """ Hardware health checks. NOP here; subclasses chain to super(). """
        pass


    def verify_connectivity(self):
        """ Connectivity checks. NOP here; subclasses chain to super(). """
        pass


    def verify_software(self):
        """ Software health checks. NOP here; subclasses chain to super(). """
        pass


    def check_diskspace(self, path, gb):
        """ Raise an error if the filesystem under path has less than gb
        gigabytes of free space.

        @param path: path on the host whose filesystem is checked
        @param gb: minimum required free space, in GB

        @raises AutoservDiskFullHostError: free space is below gb
        """
        # NOTE(review): self.hostname is supplied by remote subclasses,
        # not defined in this base class — confirm against callers
        logging.info('Checking for >= %s GB of space under %s on machine %s',
                     gb, path, self.hostname)
        # df -mP reports 1MB blocks; column 3 is the available space.
        # Conversion to GB uses 1000, not 1024 (decimal gigabytes).
        df = self.run('df -mP %s | tail -1' % path).stdout.split()
        free_space_gb = int(df[3])/1000.0
        if free_space_gb < gb:
            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
        else:
            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
                         free_space_gb, gb, path, self.hostname)


    def get_open_func(self, use_cache=True):
        """
        Defines and returns a function that may be used instead of built-in
        open() to open and read files. The returned function is implemented
        by using self.run('cat <file>') and may cache the results for the same
        filename.

        @param use_cache Cache results of self.run('cat <filename>') for the
            same filename

        @return a function that can be used instead of built-in open()
        """
        cached_files = {}

        def open_func(filename):
            if not use_cache or filename not in cached_files:
                # discard the tee'd stdout copy; only the captured string
                # is wanted here
                output = self.run('cat \'%s\'' % filename,
                                  stdout_tee=open('/dev/null', 'w')).stdout
                fd = cStringIO.StringIO(output)

                if not use_cache:
                    return fd

                cached_files[filename] = fd
            else:
                # rewind so a cached file can be re-read from the start
                cached_files[filename].seek(0)

            return cached_files[filename]

        return open_func


    def check_partitions(self, root_part, filter_func=None):
        """ Compare the contents of /proc/partitions with those of
        /proc/mounts and raise exception in case unmounted partitions are found

        root_part: in Linux /proc/mounts will never directly mention the root
        partition as being mounted on / instead it will say that /dev/root is
        mounted on /. Thus require this argument to filter out the root_part
        from the ones checked to be mounted

        filter_func: unnary predicate for additional filtering out of
        partitions required to be mounted

        Raise: error.AutoservHostError if unfiltered unmounted partition found
        """

        print 'Checking if non-swap partitions are mounted...'

        # partition info is read from the host via get_open_func(), so this
        # works for remote hosts as well as the local one
        unmounted = partition.get_unmounted_partition_list(root_part,
            filter_func=filter_func, open_func=self.get_open_func())
        if unmounted:
            raise error.AutoservNotMountedHostError(
                'Found unmounted partitions: %s' %
                [part.device for part in unmounted])


    def _repair_wait_for_reboot(self):
        """ Repair helper: wait out an in-progress shutdown/reboot, or just
        wait for the host to come up, within the recovery time budget. """
        TIMEOUT = int(self.HOURS_TO_WAIT_FOR_RECOVERY * 3600)
        if self.is_shutting_down():
            logging.info('Host is shutting down, waiting for a restart')
            self.wait_for_restart(TIMEOUT)
        else:
            self.wait_up(TIMEOUT)


    def _get_mountpoint(self, path):
        """Given a "path" get the mount point of the filesystem containing
        that path."""
        # run a small python snippet on the host: walk up from the path
        # until a mount point (or /) is reached
        code = ('import os\n'
                # sanitize the path and resolve symlinks
                'path = os.path.realpath(%r)\n'
                "while path != '/' and not os.path.ismount(path):\n"
                '    path, _ = os.path.split(path)\n'
                'print path\n') % path
        return self.run('python -c "%s"' % code,
                        stdout_tee=open(os.devnull, 'w')).stdout.rstrip()


    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
        """Empty a given directory path contents."""
        # -mindepth/-maxdepth 1 removes the entries but keeps the directory
        # itself; -print0/-0 keeps filenames with spaces safe
        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
        self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout)
        # remember that files were removed so repair can reboot to release
        # the inodes
        self._removed_files = True


    def repair_full_disk(self, mountpoint):
        """ Repair helper for a full filesystem: clear out the standard
        temporary directories that live on the given mountpoint. """
        # it's safe to remove /tmp and /var/tmp, site specific overrides may
        # want to remove some other places too
        if mountpoint == self._get_mountpoint('/tmp'):
            self.erase_dir_contents('/tmp')

        if mountpoint == self._get_mountpoint('/var/tmp'):
            self.erase_dir_contents('/var/tmp')


    def _call_repair_func(self, err, func, *args, **dargs):
        """ Invoke a repair function at most once per (func, args, dargs)
        combination; if the same repair was already attempted, re-raise the
        original triggering exception instead of looping forever.

        @param err: the exception that triggered this repair attempt
        @param func: the repair callable
        """
        for old_call in self._already_repaired:
            if old_call == (func, args, dargs):
                # re-raising the original exception because surrounding
                # error handling may want to try other ways to fix it
                logging.warn('Already done this (%s) repair procedure, '
                             're-raising the original exception.', func)
                raise err

        try:
            func(*args, **dargs)
        except error.AutoservHardwareRepairRequestedError:
            # let this special exception propagate
            raise
        except error.AutoservError:
            logging.exception('Repair failed but continuing in case it managed'
                              ' to repair enough')

        self._already_repaired.append((func, args, dargs))


    def repair_filesystem_only(self):
        """perform file system repairs only"""
        while True:
            # try to repair specific problems
            try:
                logging.info('Running verify to find failures to repair...')
                self.verify()
                if self._removed_files:
                    logging.info('Removed files, rebooting to release the'
                                 ' inodes')
                    self.reboot()
                return # verify succeeded, then repair succeeded
            except error.AutoservHostIsShuttingDownError, err:
                logging.exception('verify failed')
                self._call_repair_func(err, self._repair_wait_for_reboot)
            except error.AutoservDiskFullHostError, err:
                logging.exception('verify failed')
                self._call_repair_func(err, self.repair_full_disk,
                                       self._get_mountpoint(err.path))


    def repair_software_only(self):
        """perform software repairs only"""
        while True:
            try:
                self.repair_filesystem_only()
                break
            except (error.AutoservSshPingHostError, error.AutoservSSHTimeout,
                    error.AutoservSshPermissionDeniedError,
                    error.AutoservDiskFullHostError), err:
                # filesystem repair could not reach/fix the host; escalate
                # to a full reinstall (machine_install is abstract here)
                logging.exception('verify failed')
                logging.info('Trying to reinstall the machine')
                self._call_repair_func(err, self.machine_install)


    def repair_full(self):
        """ Perform all software repairs and, if those fail with a hardware
        error, escalate to a hardware repair request. """
        while True:
            try:
                self.repair_software_only()
                break
            except error.AutoservHardwareHostError, err:
                logging.exception('verify failed')
                # software repair failed, try hardware repair
                logging.info('Hardware problem found, '
                             'requesting hardware repairs')
                self._call_repair_func(err, self.request_hardware_repair)


    def repair_with_protection(self, protection_level):
        """Perform the maximal amount of repair within the specified
        protection level.

        @param protection_level: the protection level to use for limiting
                                 repairs, a host_protections.Protection
        """
        protection = host_protections.Protection
        if protection_level == protection.DO_NOT_REPAIR:
            logging.info('Protection is "Do not repair" so just verifying')
            self.verify()
        elif protection_level == protection.REPAIR_FILESYSTEM_ONLY:
            logging.info('Attempting filesystem-only repair')
            self.repair_filesystem_only()
        elif protection_level == protection.REPAIR_SOFTWARE_ONLY:
            logging.info('Attempting software repair only')
            self.repair_software_only()
        elif protection_level == protection.NO_PROTECTION:
            logging.info('Attempting full repair')
            self.repair_full()
        else:
            raise NotImplementedError('Unknown host protection level %s'
                                      % protection_level)


    def cleanup(self):
        """ Restore the host to a pristine state between jobs. NOP here;
        subclasses chain calls to super(). """
        pass


    def machine_install(self):
        """ Reinstall the machine's software. Abstract. """
        raise NotImplementedError('Machine install not implemented!')


    def install(self, installableObject):
        """ Install an object implementing the install(host) protocol onto
        this host (double-dispatch). """
        installableObject.install(self)


    def get_autodir(self):
        """ Return the autotest install directory on the host. Abstract. """
        raise NotImplementedError('Get autodir not implemented!')


    def set_autodir(self):
        """ Set the autotest install directory on the host. Abstract. """
        raise NotImplementedError('Set autodir not implemented!')


    def start_loggers(self):
        """ Called to start continuous host logging. """
        pass


    def stop_loggers(self):
        """ Called to stop continuous host logging. """
        pass


    # some extra methods simplify the retrieval of information about the
    # Host machine, with generic implementations based on run(). subclasses
    # should feel free to override these if they can provide better
    # implementations for their specific Host types

    def get_num_cpu(self):
        """ Get the number of CPUs in the host according to /proc/cpuinfo. """
        proc_cpuinfo = self.run('cat /proc/cpuinfo',
                                stdout_tee=open(os.devnull, 'w')).stdout
        # one "processor" stanza per logical CPU
        cpus = 0
        for line in proc_cpuinfo.splitlines():
            if line.startswith('processor'):
                cpus += 1
        return cpus


    def get_arch(self):
        """ Get the hardware architecture of the remote machine. """
        arch = self.run('/bin/uname -m').stdout.rstrip()
        # normalize i486/i586/i686 etc. to plain i386
        if re.match(r'i\d86$', arch):
            arch = 'i386'
        return arch


    def get_kernel_ver(self):
        """ Get the kernel version of the remote machine. """
        return self.run('/bin/uname -r').stdout.rstrip()


    def get_cmdline(self):
        """ Get the kernel command line of the remote machine. """
        return self.run('cat /proc/cmdline').stdout.rstrip()


    def path_exists(self, path):
        """ Determine if path exists on the remote machine. """
        # ls exits non-zero when the path does not exist
        result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path),
                          ignore_status=True)
        return result.exit_status == 0


    # some extra helpers for doing job-related operations

    def record(self, *args, **dargs):
        """ Helper method for recording status logs against Host.job that
        silently becomes a NOP if Host.job is not available. The args and
        dargs are passed on to Host.job.record unchanged. """
        if self.job:
            self.job.record(*args, **dargs)


    def log_kernel(self):
        """ Helper method for logging kernel information into the status logs.
        Intended for cases where the "current" kernel is not really defined
        and we want to explicitly log it. Does nothing if this host isn't
        actually associated with a job. """
        if self.job:
            kernel = self.get_kernel_ver()
            self.job.record("INFO", None, None,
                            optional_fields={"kernel": kernel})


    def log_reboot(self, reboot_func):
        """ Decorator for wrapping a reboot in a group for status
        logging purposes. The reboot_func parameter should be an actual
        function that carries out the reboot.
        """
        # RUNNING_LOG_REBOOT guards against re-entrant wrapping when a
        # reboot performed inside the group triggers another log_reboot
        if self.job and not hasattr(self, "RUNNING_LOG_REBOOT"):
            self.RUNNING_LOG_REBOOT = True
            try:
                self.job.run_reboot(reboot_func, self.get_kernel_ver)
            finally:
                del self.RUNNING_LOG_REBOOT
        else:
            reboot_func()


    def request_hardware_repair(self):
        """ Should somehow request (send a mail?) for hardware repairs on
        this machine. The implementation can either return by raising the
        special error.AutoservHardwareRepairRequestedError exception or can
        try to wait until the machine is repaired and then return normally.
        """
        raise NotImplementedError("request_hardware_repair not implemented")


    def list_files_glob(self, glob):
        """
        Get a list of files on a remote host given a glob pattern path.
        """
        # run glob.glob remotely and ship the result back as a pickle
        SCRIPT = ("python -c 'import cPickle, glob, sys;"
                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
                          timeout=60).stdout
        return cPickle.loads(output)


    def symlink_closure(self, paths):
        """
        Given a sequence of path strings, return the set of all paths that
        can be reached from the initial set by following symlinks.

        @param paths: sequence of path strings.
        @return: a sequence of path strings that are all the unique paths that
                can be reached from the given ones after following symlinks.
        """
        # the worklist/closure dicts are used as ordered sets; the script
        # runs remotely, with input/output exchanged as pickles over
        # stdin/stdout
        SCRIPT = ("python -c 'import cPickle, os, sys\n"
                  "paths = cPickle.load(sys.stdin)\n"
                  "closure = {}\n"
                  "while paths:\n"
                  "    path = paths.keys()[0]\n"
                  "    del paths[path]\n"
                  "    if not os.path.exists(path):\n"
                  "        continue\n"
                  "    closure[path] = None\n"
                  "    if os.path.islink(path):\n"
                  "        link_to = os.path.join(os.path.dirname(path),\n"
                  "                               os.readlink(path))\n"
                  "        if link_to not in closure.keys():\n"
                  "            paths[link_to] = None\n"
                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
                          timeout=60).stdout
        return cPickle.loads(output)


    def cleanup_kernels(self, boot_dir='/boot'):
        """
        Remove any kernel image and associated files (vmlinux, system.map,
        modules) for any image found in the boot directory that is not
        referenced by entries in the bootloader configuration.

        @param boot_dir: boot directory path string, default '/boot'
        """
        # find all the vmlinuz images referenced by the bootloader
        # NOTE(review): self.bootloader is provided by subclasses/framework,
        # not defined in this base class — confirm against callers
        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
        boot_info = self.bootloader.get_entries()
        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
                        for boot in boot_info.itervalues()]

        # find all the unused vmlinuz images in /boot
        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)

        # find all the unused vmlinux images in /boot
        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)

        # find all the unused System.map files in /boot
        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
        all_system_map = self.list_files_glob(systemmap_prefix + '*')
        used_system_map = self.symlink_closure(
            systemmap_prefix + kernver for kernver in used_kernver)
        unused_system_map = set(all_system_map) - set(used_system_map)

        # find all the module directories associated with unused kernels
        modules_prefix = '/lib/modules/'
        all_moddirs = [dir for dir in self.list_files_glob(modules_prefix + '*')
                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', dir)]
        used_moddirs = self.symlink_closure(modules_prefix + kernver
                                            for kernver in used_kernver)
        unused_moddirs = set(all_moddirs) - set(used_moddirs)

        # remove all the vmlinuz files we don't use
        # TODO: if needed this should become package manager agnostic
        for vmlinuz in unused_vmlinuz:
            # try and get an rpm package name
            rpm = self.run('rpm -qf', args=(vmlinuz,),
                           ignore_status=True, timeout=120)
            if rpm.exit_status == 0:
                packages = set(line.strip() for line in
                               rpm.stdout.splitlines())
                # if we found some package names, try to remove them
                for package in packages:
                    self.run('rpm -e', args=(package,),
                             ignore_status=True, timeout=120)
            # remove the image files anyway, even if rpm didn't
            self.run('rm -f', args=(vmlinuz,),
                     ignore_status=True, timeout=120)

        # remove all the vmlinux and System.map files left over
        for f in (unused_vmlinux | unused_system_map):
            self.run('rm -f', args=(f,),
                     ignore_status=True, timeout=120)

        # remove all unused module directories
        # the regex match should keep us safe from removing the wrong files
        for moddir in unused_moddirs:
            self.run('rm -fr', args=(moddir,), ignore_status=True)