base_classes.py revision e4256c81ced9c519f2256bb37119f801edbb9ff1
# Copyright 2009 Google Inc. Released under the GPL v2

"""
This module defines the base classes for the Host hierarchy.

Implementation details:
You should import the "hosts" package instead of importing each type of host.

        Host: a machine on which you can run programs
"""

__author__ = """
mbligh@google.com (Martin J. Bligh),
poirier@google.com (Benjamin Poirier),
stutsman@google.com (Ryan Stutsman)
"""

import cPickle, logging, os, re, time

from autotest_lib.client.common_lib import global_config, error, utils
from autotest_lib.client.common_lib.cros import path_utils
from autotest_lib.client.common_lib.cros.graphite import autotest_stats


class Host(object):
    """
    This class represents a machine on which you can run programs.

    It may be a local machine, the one autoserv is running on, a remote
    machine or a virtual machine.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here. You must not instantiate this class but should
    instantiate one of those leaf subclasses.

    When overriding methods that raise NotImplementedError, the leaf class
    is fully responsible for the implementation and should not chain calls
    to super. When overriding methods that are a NOP in Host, the subclass
    should chain calls to super(). The criteria for fitting a new method into
    one category or the other should be:
        1. If two separate generic implementations could reasonably be
           concatenated, then the abstract implementation should pass and
           subclasses should chain calls to super.
        2. If only one class could reasonably perform the stated function
           (e.g. two separate run() implementations cannot both be executed)
           then the method should raise NotImplementedError in Host, and
           the implementor should NOT chain calls to super, to ensure that
           only one implementation ever gets executed.
    """

    # Status-logging job object; set externally, record() is a NOP while
    # this stays None.
    job = None
    DEFAULT_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "default_reboot_timeout", type=int, default=1800)
    WAIT_DOWN_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_timeout", type=int, default=840)
    WAIT_DOWN_REBOOT_WARNING = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_warning", type=int, default=540)
    HOURS_TO_WAIT_FOR_RECOVERY = global_config.global_config.get_config_value(
        "HOSTS", "hours_to_wait_for_recovery", type=float, default=2.5)
    # the number of hardware repair requests that need to happen before we
    # actually send machines to hardware repair
    HARDWARE_REPAIR_REQUEST_THRESHOLD = 4
    OP_REBOOT = 'reboot'
    OP_SUSPEND = 'suspend'
    PWR_OPERATION = [OP_REBOOT, OP_SUSPEND]


    def __init__(self, *args, **dargs):
        # All construction is delegated to _initialize() so subclasses can
        # cooperatively extend initialization by overriding it.
        self._initialize(*args, **dargs)


    def _initialize(self, *args, **dargs):
        # NOP in Host; subclasses chain calls to super() per the class
        # docstring convention.
        pass


    @property
    def job_repo_url_attribute(self):
        """Get the host attribute name for job_repo_url.
        """
        return 'job_repo_url'


    def close(self):
        """Release any resources held by this host. NOP in Host."""
        pass


    def setup(self):
        """Perform one-time host setup. NOP in Host."""
        pass


    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee/stderr_tee: where to tee the stdout/stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        """
        raise NotImplementedError('Run not implemented!')


    def run_output(self, command, *args, **dargs):
        """Run a command and return its stdout with trailing whitespace
        stripped. Arguments are forwarded to run() unchanged."""
        return self.run(command, *args, **dargs).stdout.rstrip()


    def reboot(self):
        raise NotImplementedError('Reboot not implemented!')


    def suspend(self):
        raise NotImplementedError('Suspend not implemented!')


    def sysrq_reboot(self):
        raise NotImplementedError('Sysrq reboot not implemented!')


    def reboot_setup(self, *args, **dargs):
        # Hook invoked before a reboot; NOP in Host, subclasses chain to
        # super().
        pass


    def reboot_followup(self, *args, **dargs):
        # Hook invoked after a successful reboot; NOP in Host, subclasses
        # chain to super().
        pass


    def get_file(self, source, dest, delete_dest=False):
        raise NotImplementedError('Get file not implemented!')


    def send_file(self, source, dest, delete_dest=False):
        raise NotImplementedError('Send file not implemented!')


    def get_tmp_dir(self):
        raise NotImplementedError('Get temp dir not implemented!')


    def is_up(self):
        raise NotImplementedError('Is up not implemented!')


    def is_shutting_down(self):
        """ Indicates is a machine is currently shutting down. """
        return False


    def get_wait_up_processes(self):
        """ Gets the list of local processes to wait for in wait_up.
        """
        get_config = global_config.global_config.get_config_value
        proc_list = get_config("HOSTS", "wait_up_processes",
                               default="").strip()
        processes = set(p.strip() for p in proc_list.split(","))
        # An empty config value yields an empty string entry; drop it.
        processes.discard("")
        return processes


    def get_boot_id(self, timeout=60):
        """ Get a unique ID associated with the current boot.

        Should return a string with the semantics such that two separate
        calls to Host.get_boot_id() return the same string if the host did
        not reboot between the two calls, and two different strings if it
        has rebooted at least once between the two calls.

        @param timeout The number of seconds to wait before timing out.

        @return A string unique to this boot or None if not available."""
        BOOT_ID_FILE = '/proc/sys/kernel/random/boot_id'
        NO_ID_MSG = 'no boot_id available'
        # Emit a sentinel message instead of failing when the boot_id file
        # does not exist on the remote system.
        cmd = 'if [ -f %r ]; then cat %r; else echo %r; fi' % (
                BOOT_ID_FILE, BOOT_ID_FILE, NO_ID_MSG)
        boot_id = self.run(cmd, timeout=timeout).stdout.strip()
        if boot_id == NO_ID_MSG:
            return None
        return boot_id


    def wait_up(self, timeout=None):
        raise NotImplementedError('Wait up not implemented!')


    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        raise NotImplementedError('Wait down not implemented!')


    def _construct_host_metadata(self, type_str):
        """Returns dict of metadata with type_str, hostname, time_recorded.

        @param type_str: String representing _type field in es db.
                         For example: type_str='reboot_total'.
        """
        metadata = {
            'hostname': self.hostname,
            'time_recorded': time.time(),
            '_type': type_str,
        }
        return metadata


    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
                         down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
                         down_warning=WAIT_DOWN_REBOOT_WARNING,
                         log_failure=True, old_boot_id=None, **dargs):
        """ Wait for the host to come back from a reboot. This is a generic
        implementation based entirely on wait_up and wait_down.

        @param timeout: seconds to wait for the host to come back up.
        @param down_timeout: seconds to wait for the host to go down.
        @param down_warning: seconds before emitting a warning while waiting
                for shutdown.
        @param log_failure: if True, record an ABORT status entry when the
                host fails to shut down.
        @param old_boot_id: boot id from before the reboot, passed through
                to wait_down to detect the restart.
        @param dargs: forwarded to reboot_followup(); 'board' (if present)
                is also used to key the graphite timers.

        @raises AutoservShutdownError: the host never went down.
        @raises AutoservRebootError: the host went down but did not return.
        """
        key_string = 'Reboot.%s' % dargs.get('board')

        total_reboot_timer = autotest_stats.Timer('%s.total' % key_string,
                metadata=self._construct_host_metadata('reboot_total'))
        wait_down_timer = autotest_stats.Timer('%s.wait_down' % key_string,
                metadata=self._construct_host_metadata('reboot_down'))

        total_reboot_timer.start()
        wait_down_timer.start()
        if not self.wait_down(timeout=down_timeout,
                              warning_timer=down_warning,
                              old_boot_id=old_boot_id):
            if log_failure:
                self.record("ABORT", None, "reboot.verify", "shut down failed")
            raise error.AutoservShutdownError("Host did not shut down")
        wait_down_timer.stop()
        wait_up_timer = autotest_stats.Timer('%s.wait_up' % key_string,
                metadata=self._construct_host_metadata('reboot_up'))
        wait_up_timer.start()
        if self.wait_up(timeout):
            self.record("GOOD", None, "reboot.verify")
            self.reboot_followup(**dargs)
            # NOTE(review): timers are only stopped on the success path;
            # on failure the stats are simply not reported.
            wait_up_timer.stop()
            total_reboot_timer.stop()
        else:
            self.record("ABORT", None, "reboot.verify",
                        "Host did not return from reboot")
            raise error.AutoservRebootError("Host did not return from reboot")


    def verify(self):
        """Check the host is in a working state; hardware first, then
        connectivity, then software."""
        self.verify_hardware()
        self.verify_connectivity()
        self.verify_software()


    def verify_hardware(self):
        # NOP in Host; subclasses chain calls to super().
        pass


    def verify_connectivity(self):
        # NOP in Host; subclasses chain calls to super().
        pass


    def verify_software(self):
        # NOP in Host; subclasses chain calls to super().
        pass


    def check_diskspace(self, path, gb):
        """Raises an error if path does not have at least gb GB free.

        @param path The path to check for free disk space.
        @param gb A floating point number to compare with a granularity
            of 1 MB.

        1000 based SI units are used.

        @raises AutoservDiskFullHostError if path has less than gb GB free.
        """
        one_mb = 10 ** 6  # Bytes (SI unit).
        mb_per_gb = 1000.0
        logging.info('Checking for >= %s GB of space under %s on machine %s',
                     gb, path, self.hostname)
        # df -PB reports in 1 MB blocks; column 3 (0-based) is "Available".
        df = self.run('df -PB %d %s | tail -1' % (one_mb, path)).stdout.split()
        free_space_gb = int(df[3]) / mb_per_gb
        if free_space_gb < gb:
            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
        else:
            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
                         free_space_gb, gb, path, self.hostname)


    def check_inodes(self, path, min_kilo_inodes):
        """Raises an error if a file system is short on i-nodes.

        @param path The path to check for free i-nodes.
        @param min_kilo_inodes Minimum number of i-nodes required,
                               in units of 1000 i-nodes.

        @raises AutoservNoFreeInodesError If the minimum required
                                  i-node count isn't available.
        """
        min_inodes = 1000 * min_kilo_inodes
        logging.info('Checking for >= %d i-nodes under %s '
                     'on machine %s', min_inodes, path, self.hostname)
        # df -Pi column 3 (0-based) is the free i-node count.
        df = self.run('df -Pi %s | tail -1' % path).stdout.split()
        free_inodes = int(df[3])
        if free_inodes < min_inodes:
            raise error.AutoservNoFreeInodesError(path, min_inodes,
                                                  free_inodes)
        else:
            logging.info('Found %d >= %d i-nodes under %s on '
                         'machine %s', free_inodes, min_inodes,
                         path, self.hostname)


    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
        """Empty a given directory path contents.

        @param path: directory whose contents (but not the directory
                itself) should be removed.
        @param ignore_status: if True (the default), do not raise when the
                remote command fails.
        @param timeout: time limit in seconds for the remote command.
        """
        # Escape the path the same way path_exists() does, so paths
        # containing shell metacharacters cannot break out of the quotes.
        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
        self.run(rm_cmd % utils.sh_escape(path),
                 ignore_status=ignore_status, timeout=timeout)


    def repair(self):
        """Try and get the host to pass `self.verify()`."""
        self.verify()


    def disable_ipfilters(self):
        """Allow all network packets in and out of the host."""
        # Save the current rules first so enable_ipfilters() can restore
        # them later.
        self.run('iptables-save > /tmp/iptable-rules')
        self.run('iptables -P INPUT ACCEPT')
        self.run('iptables -P FORWARD ACCEPT')
        self.run('iptables -P OUTPUT ACCEPT')


    def enable_ipfilters(self):
        """Re-enable the IP filters disabled from disable_ipfilters()"""
        # Only restore if disable_ipfilters() actually saved a rules file.
        if self.path_exists('/tmp/iptable-rules'):
            self.run('iptables-restore < /tmp/iptable-rules')


    def cleanup(self):
        # NOP in Host; subclasses chain calls to super() per the class
        # docstring convention.
        pass


    def machine_install(self):
        raise NotImplementedError('Machine install not implemented!')


    def install(self, installableObject):
        """Install the given object onto this host via double-dispatch:
        the installable object decides how to install itself here."""
        installableObject.install(self)


    def get_autodir(self):
        raise NotImplementedError('Get autodir not implemented!')


    def set_autodir(self):
        raise NotImplementedError('Set autodir not implemented!')


    def start_loggers(self):
        """ Called to start continuous host logging. """
        pass


    def stop_loggers(self):
        """ Called to stop continuous host logging. """
        pass


    # some extra methods simplify the retrieval of information about the
    # Host machine, with generic implementations based on run(). subclasses
    # should feel free to override these if they can provide better
    # implementations for their specific Host types

    def get_num_cpu(self):
        """ Get the number of CPUs in the host according to /proc/cpuinfo. """
        # Suppress tee-ing the (potentially large) cpuinfo dump to the logs.
        proc_cpuinfo = self.run('cat /proc/cpuinfo',
                                stdout_tee=open(os.devnull, 'w')).stdout
        cpus = 0
        for line in proc_cpuinfo.splitlines():
            # Each logical CPU contributes one 'processor' stanza.
            if line.startswith('processor'):
                cpus += 1
        return cpus


    def get_arch(self):
        """ Get the hardware architecture of the remote machine. """
        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
        arch = self.run('%s -m' % cmd_uname).stdout.rstrip()
        # Normalize i386/i486/i586/i686 variants to plain 'i386'.
        if re.match(r'i\d86$', arch):
            arch = 'i386'
        return arch


    def get_kernel_ver(self):
        """ Get the kernel version of the remote machine.
        """
        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
        return self.run('%s -r' % cmd_uname).stdout.rstrip()


    def get_cmdline(self):
        """ Get the kernel command line of the remote machine. """
        return self.run('cat /proc/cmdline').stdout.rstrip()


    def get_meminfo(self):
        """ Get the kernel memory info (/proc/meminfo) of the remote machine
        and return a dictionary mapping the various statistics. """
        meminfo_dict = {}
        meminfo = self.run('cat /proc/meminfo').stdout.splitlines()
        # Each line is 'Key:   value'; split only on the first colon so
        # values containing colons survive intact.
        for key, val in (line.split(':', 1) for line in meminfo):
            meminfo_dict[key.strip()] = val.strip()
        return meminfo_dict


    def path_exists(self, path):
        """ Determine if path exists on the remote machine. """
        result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path),
                          ignore_status=True)
        return result.exit_status == 0


    # some extra helpers for doing job-related operations

    def record(self, *args, **dargs):
        """ Helper method for recording status logs against Host.job that
        silently becomes a NOP if Host.job is not available. The args and
        dargs are passed on to Host.job.record unchanged. """
        if self.job:
            self.job.record(*args, **dargs)


    def log_kernel(self):
        """ Helper method for logging kernel information into the status logs.
        Intended for cases where the "current" kernel is not really defined
        and we want to explicitly log it. Does nothing if this host isn't
        actually associated with a job. """
        if self.job:
            kernel = self.get_kernel_ver()
            self.job.record("INFO", None, None,
                            optional_fields={"kernel": kernel})


    def log_op(self, op, op_func):
        """ Decorator for wrapping a management operation in a group for status
        logging purposes.

        @param op: name of the operation.
        @param op_func: a function that carries out the operation
                        (reboot, suspend)
        """
        # RUNNING_LOG_OP guards against re-entrant grouping: a nested call
        # (while the attribute exists) just runs op_func directly.
        if self.job and not hasattr(self, "RUNNING_LOG_OP"):
            self.RUNNING_LOG_OP = True
            try:
                self.job.run_op(op, op_func, self.get_kernel_ver)
            finally:
                del self.RUNNING_LOG_OP
        else:
            op_func()


    def list_files_glob(self, glob):
        """
        Get a list of files on a remote host given a glob pattern path.

        The pattern is expanded by a small python script executed on the
        remote host, and the resulting list is pickled back over stdout.
        """
        # NOTE(review): assumes the remote host has a python 2 interpreter
        # with cPickle on PATH — TODO confirm for non-linux hosts.
        SCRIPT = ("python -c 'import cPickle, glob, sys;"
                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
                          timeout=60).stdout
        return cPickle.loads(output)


    def symlink_closure(self, paths):
        """
        Given a sequence of path strings, return the set of all paths that
        can be reached from the initial set by following symlinks.

        @param paths: sequence of path strings.
        @return: a sequence of path strings that are all the unique paths that
                can be reached from the given ones after following symlinks.
        """
        # The worklist ('paths' dict) and result ('closure' dict) are sent
        # to and from the remote python via pickle over stdin/stdout.
        SCRIPT = ("python -c 'import cPickle, os, sys\n"
                  "paths = cPickle.load(sys.stdin)\n"
                  "closure = {}\n"
                  "while paths:\n"
                  "    path = paths.keys()[0]\n"
                  "    del paths[path]\n"
                  "    if not os.path.exists(path):\n"
                  "        continue\n"
                  "    closure[path] = None\n"
                  "    if os.path.islink(path):\n"
                  "        link_to = os.path.join(os.path.dirname(path),\n"
                  "                               os.readlink(path))\n"
                  "        if link_to not in closure.keys():\n"
                  "            paths[link_to] = None\n"
                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
                          timeout=60).stdout
        return cPickle.loads(output)


    def cleanup_kernels(self, boot_dir='/boot'):
        """
        Remove any kernel image and associated files (vmlinux, system.map,
        modules) for any image found in the boot directory that is not
        referenced by entries in the bootloader configuration.

        @param boot_dir: boot directory path string, default '/boot'
        """
        # find all the vmlinuz images referenced by the bootloader
        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
        boot_info = self.bootloader.get_entries()
        # Strip the path prefix to leave just the kernel version strings.
        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
                        for boot in boot_info.itervalues()]

        # find all the unused vmlinuz images in /boot
        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)

        # find all the unused vmlinux images in /boot
        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)

        # find all the unused System.map files in /boot
        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
        all_system_map = self.list_files_glob(systemmap_prefix + '*')
        used_system_map = self.symlink_closure(
            systemmap_prefix + kernver for kernver in used_kernver)
        unused_system_map = set(all_system_map) - set(used_system_map)

        # find all the module directories associated with unused kernels
        modules_prefix = '/lib/modules/'
        # The version-number regex keeps non-kernel directories out of the
        # removal candidates.
        all_moddirs = [dir for dir in self.list_files_glob(modules_prefix + '*')
                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', dir)]
        used_moddirs = self.symlink_closure(modules_prefix + kernver
                                            for kernver in used_kernver)
        unused_moddirs = set(all_moddirs) - set(used_moddirs)

        # remove all the vmlinuz files we don't use
        # TODO: if needed this should become package manager agnostic
        for vmlinuz in unused_vmlinuz:
            # try and get an rpm package name
            rpm = self.run('rpm -qf', args=(vmlinuz,),
                           ignore_status=True, timeout=120)
            if rpm.exit_status == 0:
                packages = set(line.strip() for line in
                               rpm.stdout.splitlines())
                # if we found some package names, try to remove them
                for package in packages:
                    self.run('rpm -e', args=(package,),
                             ignore_status=True, timeout=120)
            # remove the image files anyway, even if rpm didn't
            self.run('rm -f', args=(vmlinuz,),
                     ignore_status=True, timeout=120)

        # remove all the vmlinux and System.map files left over
        for f in (unused_vmlinux | unused_system_map):
            self.run('rm -f', args=(f,),
                     ignore_status=True, timeout=120)

        # remove all unused module directories
        # the regex match should keep us safe from removing the wrong files
        for moddir in unused_moddirs:
            self.run('rm -fr', args=(moddir,), ignore_status=True)


    def get_attributes_to_clear_before_provision(self):
        """Get a list of attributes to be cleared before machine_install starts.

        If provision runs in a lab environment, it is necessary to clear certain
        host attributes for the host in afe_host_attributes table. For example,
        `job_repo_url` is a devserver url pointed to autotest packages for
        CrosHost, it needs to be removed before provision starts for tests to
        run reliably.
        For ADBHost, the job repo url has a different format, i.e., appended by
        adb_serial, so this method should be overridden in ADBHost.
        """
        return ['job_repo_url']