# Copyright 2009 Google Inc. Released under the GPL v2

"""
This module defines the base classes for the Host hierarchy.

Implementation details:
You should import the "hosts" package instead of importing each type of host.

    Host: a machine on which you can run programs
"""

__author__ = """
mbligh@google.com (Martin J. Bligh),
poirier@google.com (Benjamin Poirier),
stutsman@google.com (Ryan Stutsman)
"""

import cPickle, cStringIO, logging, os, re, time

from autotest_lib.client.common_lib import global_config, error, utils
from autotest_lib.client.common_lib.cros import path_utils
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.client.bin import partition


class Host(object):
    """
    This class represents a machine on which you can run programs.

    It may be a local machine, the one autoserv is running on, a remote
    machine or a virtual machine.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here. You must not instantiate this class but should
    instantiate one of those leaf subclasses.

    When overriding methods that raise NotImplementedError, the leaf class
    is fully responsible for the implementation and should not chain calls
    to super. When overriding methods that are a NOP in Host, the subclass
    should chain calls to super(). The criteria for fitting a new method into
    one category or the other should be:
        1. If two separate generic implementations could reasonably be
           concatenated, then the abstract implementation should pass and
           subclasses should chain calls to super.
        2. If only one class could reasonably perform the stated function
           (e.g. two separate run() implementations cannot both be executed)
           then the method should raise NotImplementedError in Host, and
           the implementor should NOT chain calls to super, to ensure that
           only one implementation ever gets executed.
    """

    # Status-log object for the currently-running job; record() and
    # log_kernel() silently become NOPs while this is None.
    job = None
    # Timeouts (seconds) and thresholds, overridable via the global config
    # file's [HOSTS] section.
    DEFAULT_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "default_reboot_timeout", type=int, default=1800)
    WAIT_DOWN_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_timeout", type=int, default=840)
    WAIT_DOWN_REBOOT_WARNING = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_warning", type=int, default=540)
    HOURS_TO_WAIT_FOR_RECOVERY = global_config.global_config.get_config_value(
        "HOSTS", "hours_to_wait_for_recovery", type=float, default=2.5)
    # the number of hardware repair requests that need to happen before we
    # actually send machines to hardware repair
    HARDWARE_REPAIR_REQUEST_THRESHOLD = 4
    # Names of the power operations that log_op-style wrappers may group.
    OP_REBOOT = 'reboot'
    OP_SUSPEND = 'suspend'
    PWR_OPERATION = [OP_REBOOT, OP_SUSPEND]


    def __init__(self, *args, **dargs):
        # All construction is delegated to _initialize() so subclasses can
        # cooperatively chain initialization without overriding __init__.
        self._initialize(*args, **dargs)


    def _initialize(self, *args, **dargs):
        """Subclass hook for construction; intentionally a NOP here."""
        pass


    def close(self):
        """Release any resources held by this host; NOP in the base class."""
        pass


    def setup(self):
        """Prepare the host for use; NOP in the base class."""
        pass


    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee/stderr_tee: where to tee the stdout/stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        """
        raise NotImplementedError('Run not implemented!')


    def run_output(self, command, *args, **dargs):
        """Run a command and return its stdout with the trailing newline
        stripped. Arguments are forwarded unchanged to run()."""
        return self.run(command, *args, **dargs).stdout.rstrip()


    def reboot(self):
        """Reboot the host; must be implemented by a leaf subclass."""
        raise NotImplementedError('Reboot not implemented!')


    def suspend(self):
        """Suspend the host; must be implemented by a leaf subclass."""
        raise NotImplementedError('Suspend not implemented!')


    def sysrq_reboot(self):
        """Force a reboot via the kernel sysrq mechanism; must be
        implemented by a leaf subclass."""
        raise NotImplementedError('Sysrq reboot not implemented!')


    def reboot_setup(self, *args, **dargs):
        """Hook invoked before a reboot; NOP here, subclasses chain super()."""
        pass


    def reboot_followup(self, *args, **dargs):
        """Hook invoked after a successful reboot (see wait_for_restart);
        NOP here, subclasses chain super()."""
        pass


    def get_file(self, source, dest, delete_dest=False):
        """Copy a file from the host to the local machine; abstract."""
        raise NotImplementedError('Get file not implemented!')


    def send_file(self, source, dest, delete_dest=False):
        """Copy a file from the local machine to the host; abstract."""
        raise NotImplementedError('Send file not implemented!')


    def get_tmp_dir(self):
        """Return a temporary directory path on the host; abstract."""
        raise NotImplementedError('Get temp dir not implemented!')


    def is_up(self):
        """Return whether the host is up and usable; abstract."""
        raise NotImplementedError('Is up not implemented!')


    def is_shutting_down(self):
        """ Indicates is a machine is currently shutting down. """
        # Base class cannot tell; subclasses with real insight override this.
        return False


    def get_wait_up_processes(self):
        """ Gets the list of local processes to wait for in wait_up. """
        # Read a comma-separated list from the [HOSTS] wait_up_processes
        # config entry; an empty entry yields an empty set.
        get_config = global_config.global_config.get_config_value
        proc_list = get_config("HOSTS", "wait_up_processes",
                               default="").strip()
        processes = set(p.strip() for p in proc_list.split(","))
        processes.discard("")
        return processes


    def get_boot_id(self, timeout=60):
        """ Get a unique ID associated with the current boot.

        Should return a string with the semantics such that two separate
        calls to Host.get_boot_id() return the same string if the host did
        not reboot between the two calls, and two different strings if it
        has rebooted at least once between the two calls.

        @param timeout The number of seconds to wait before timing out.

        @return A string unique to this boot or None if not available."""
        BOOT_ID_FILE = '/proc/sys/kernel/random/boot_id'
        NO_ID_MSG = 'no boot_id available'
        # Echo a sentinel instead of failing when the proc file is missing
        # (e.g. on kernels/platforms that do not provide boot_id).
        cmd = 'if [ -f %r ]; then cat %r; else echo %r; fi' % (
                BOOT_ID_FILE, BOOT_ID_FILE, NO_ID_MSG)
        boot_id = self.run(cmd, timeout=timeout).stdout.strip()
        if boot_id == NO_ID_MSG:
            return None
        return boot_id


    def wait_up(self, timeout=None):
        """Wait for the host to come up; abstract, see wait_for_restart."""
        raise NotImplementedError('Wait up not implemented!')


    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """Wait for the host to go down; abstract, see wait_for_restart."""
        raise NotImplementedError('Wait down not implemented!')


    def _construct_host_metadata(self, type_str):
        """Returns dict of metadata with type_str, hostname, time_recorded.

        @param type_str: String representing _type field in es db.
                         For example: type_str='reboot_total'.
        """
        # NOTE(review): self.hostname is assumed to be set by a subclass's
        # _initialize(); it is not defined in this base class.
        metadata = {
            'hostname': self.hostname,
            'time_recorded': time.time(),
            '_type': type_str,
        }
        return metadata


    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
                         down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
                         down_warning=WAIT_DOWN_REBOOT_WARNING,
                         log_failure=True, old_boot_id=None, **dargs):
        """ Wait for the host to come back from a reboot. This is a generic
        implementation based entirely on wait_up and wait_down.

        Reports reboot timing to graphite under Reboot.<board>.* and records
        the outcome in the job status log. Raises AutoservShutdownError if
        the host never goes down, AutoservRebootError if it never returns;
        the stats timers are only stopped on the success path. """
        key_string = 'Reboot.%s' % dargs.get('board')

        total_reboot_timer = autotest_stats.Timer('%s.total' % key_string,
                metadata=self._construct_host_metadata('reboot_total'))
        wait_down_timer = autotest_stats.Timer('%s.wait_down' % key_string,
                metadata=self._construct_host_metadata('reboot_down'))

        total_reboot_timer.start()
        wait_down_timer.start()
        if not self.wait_down(timeout=down_timeout,
                              warning_timer=down_warning,
                              old_boot_id=old_boot_id):
            if log_failure:
                self.record("ABORT", None, "reboot.verify", "shut down failed")
            raise error.AutoservShutdownError("Host did not shut down")
        wait_down_timer.stop()
        wait_up_timer = autotest_stats.Timer('%s.wait_up' % key_string,
                metadata=self._construct_host_metadata('reboot_up'))
        wait_up_timer.start()
        if self.wait_up(timeout):
            self.record("GOOD", None, "reboot.verify")
            self.reboot_followup(**dargs)
            wait_up_timer.stop()
            total_reboot_timer.stop()
        else:
            self.record("ABORT", None, "reboot.verify",
                        "Host did not return from reboot")
            raise error.AutoservRebootError("Host did not return from reboot")


    def verify(self):
        """Verify the host is healthy: hardware, connectivity, software."""
        self.verify_hardware()
        self.verify_connectivity()
        self.verify_software()


    def verify_hardware(self):
        """Hardware verification hook; NOP here, subclasses chain super()."""
        pass


    def verify_connectivity(self):
        """Connectivity verification hook; NOP here, subclasses chain
        super()."""
        pass


    def verify_software(self):
        """Software verification hook; NOP here, subclasses chain super()."""
        pass


    def check_diskspace(self, path, gb):
        """Raises an error if path does not have at least gb GB free.

        @param path The path to check for free disk space.
        @param gb A floating point number to compare with a granularity
            of 1 MB.

        1000 based SI units are used.

        @raises AutoservDiskFullHostError if path has less than gb GB free.
        """
        one_mb = 10 ** 6  # Bytes (SI unit).
        mb_per_gb = 1000.0
        logging.info('Checking for >= %s GB of space under %s on machine %s',
                     gb, path, self.hostname)
        # 'df -PB <bytes>' reports in 1 MB blocks; field 3 of the last line
        # is the available-space column in POSIX (-P) output format.
        df = self.run('df -PB %d %s | tail -1' % (one_mb, path)).stdout.split()
        free_space_gb = int(df[3]) / mb_per_gb
        if free_space_gb < gb:
            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
        else:
            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
                         free_space_gb, gb, path, self.hostname)


    def check_inodes(self, path, min_kilo_inodes):
        """Raises an error if a file system is short on i-nodes.

        @param path The path to check for free i-nodes.
        @param min_kilo_inodes Minimum number of i-nodes required,
                               in units of 1000 i-nodes.

        @raises AutoservNoFreeInodesError If the minimum required
                                  i-node count isn't available.
        """
        min_inodes = 1000 * min_kilo_inodes
        logging.info('Checking for >= %d i-nodes under %s '
                     'on machine %s', min_inodes, path, self.hostname)
        # Field 3 of 'df -Pi' output is the free-inode column.
        df = self.run('df -Pi %s | tail -1' % path).stdout.split()
        free_inodes = int(df[3])
        if free_inodes < min_inodes:
            raise error.AutoservNoFreeInodesError(path, min_inodes,
                                                  free_inodes)
        else:
            logging.info('Found %d >= %d i-nodes under %s on '
                         'machine %s', free_inodes, min_inodes,
                         path, self.hostname)


    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
        """Empty a given directory path contents."""
        # -mindepth/-maxdepth 1 removes only the direct children, keeping
        # the directory itself; -print0/xargs -0 is safe for odd filenames.
        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
        self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout)


    def repair(self):
        """Try and get the host to pass `self.verify()`."""
        # Base implementation just re-verifies; subclasses add real repair.
        self.verify()


    def disable_ipfilters(self):
        """Allow all network packets in and out of the host."""
        # Save the current rules so enable_ipfilters() can restore them.
        self.run('iptables-save > /tmp/iptable-rules')
        self.run('iptables -P INPUT ACCEPT')
        self.run('iptables -P FORWARD ACCEPT')
        self.run('iptables -P OUTPUT ACCEPT')


    def enable_ipfilters(self):
        """Re-enable the IP filters disabled from disable_ipfilters()"""
        # Restores only if a saved rule set exists; otherwise a silent NOP.
        if self.path_exists('/tmp/iptable-rules'):
            self.run('iptables-restore < /tmp/iptable-rules')


    def cleanup(self):
        """Post-job cleanup hook; NOP here, subclasses chain super()."""
        pass


    def machine_install(self):
        """(Re)install the machine's OS; abstract."""
        raise NotImplementedError('Machine install not implemented!')


    def install(self, installableObject):
        """Install an object onto this host by delegating to the object's
        own install() method (double-dispatch)."""
        installableObject.install(self)


    def get_autodir(self):
        """Return the autotest directory on the host; abstract."""
        raise NotImplementedError('Get autodir not implemented!')


    def set_autodir(self):
        """Set the autotest directory on the host; abstract."""
        raise NotImplementedError('Set autodir not implemented!')


    def start_loggers(self):
        """ Called to start continuous host logging. """
        pass


    def stop_loggers(self):
        """ Called to stop continuous host logging. """
        pass


    # some extra methods simplify the retrieval of information about the
    # Host machine, with generic implementations based on run(). subclasses
    # should feel free to override these if they can provide better
    # implementations for their specific Host types

    def get_num_cpu(self):
        """ Get the number of CPUs in the host according to /proc/cpuinfo. """
        # Tee stdout to /dev/null so the (possibly large) cpuinfo dump does
        # not pollute the logs.
        proc_cpuinfo = self.run('cat /proc/cpuinfo',
                                stdout_tee=open(os.devnull, 'w')).stdout
        cpus = 0
        for line in proc_cpuinfo.splitlines():
            if line.startswith('processor'):
                cpus += 1
        return cpus


    def get_arch(self):
        """ Get the hardware architecture of the remote machine. """
        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
        arch = self.run('%s -m' % cmd_uname).stdout.rstrip()
        # Normalize i386/i486/i586/i686 etc. to plain 'i386'.
        if re.match(r'i\d86$', arch):
            arch = 'i386'
        return arch


    def get_kernel_ver(self):
        """ Get the kernel version of the remote machine. """
        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
        return self.run('%s -r' % cmd_uname).stdout.rstrip()


    def get_cmdline(self):
        """ Get the kernel command line of the remote machine. """
        return self.run('cat /proc/cmdline').stdout.rstrip()


    def get_meminfo(self):
        """ Get the kernel memory info (/proc/meminfo) of the remote machine
        and return a dictionary mapping the various statistics. """
        meminfo_dict = {}
        meminfo = self.run('cat /proc/meminfo').stdout.splitlines()
        # Each /proc/meminfo line is 'Key:   value'; keep values as strings
        # (including their unit suffix, e.g. 'kB').
        for key, val in (line.split(':', 1) for line in meminfo):
            meminfo_dict[key.strip()] = val.strip()
        return meminfo_dict


    def path_exists(self, path):
        """ Determine if path exists on the remote machine. """
        result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path),
                          ignore_status=True)
        return result.exit_status == 0


    # some extra helpers for doing job-related operations

    def record(self, *args, **dargs):
        """ Helper method for recording status logs against Host.job that
        silently becomes a NOP if Host.job is not available. The args and
        dargs are passed on to Host.job.record unchanged. """
        if self.job:
            self.job.record(*args, **dargs)


    def log_kernel(self):
        """ Helper method for logging kernel information into the status logs.
        Intended for cases where the "current" kernel is not really defined
        and we want to explicitly log it. Does nothing if this host isn't
        actually associated with a job. """
        if self.job:
            kernel = self.get_kernel_ver()
            self.job.record("INFO", None, None,
                            optional_fields={"kernel": kernel})


    def log_op(self, op, op_func):
        """ Decorator for wrapping a management operation in a group for status
        logging purposes.

        @param op: name of the operation.
        @param op_func: a function that carries out the operation
                        (reboot, suspend)
        """
        # RUNNING_LOG_OP acts as a reentrancy guard: a nested log_op call
        # (or one made without a job) executes op_func without grouping.
        if self.job and not hasattr(self, "RUNNING_LOG_OP"):
            self.RUNNING_LOG_OP = True
            try:
                self.job.run_op(op, op_func, self.get_kernel_ver)
            finally:
                del self.RUNNING_LOG_OP
        else:
            op_func()


    def list_files_glob(self, glob):
        """
        Get a list of files on a remote host given a glob pattern path.
        """
        # Run the glob remotely and ship the result back as a pickle so
        # filenames with whitespace/newlines survive the round trip.
        SCRIPT = ("python -c 'import cPickle, glob, sys;"
                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
                          timeout=60).stdout
        return cPickle.loads(output)


    def symlink_closure(self, paths):
        """
        Given a sequence of path strings, return the set of all paths that
        can be reached from the initial set by following symlinks.

        @param paths: sequence of path strings.
        @return: a sequence of path strings that are all the unique paths that
                can be reached from the given ones after following symlinks.
        """
        # The worklist (paths) and result (closure) are dicts used as sets;
        # the whole traversal runs remotely, with input and output pickled
        # over stdin/stdout.
        SCRIPT = ("python -c 'import cPickle, os, sys\n"
                  "paths = cPickle.load(sys.stdin)\n"
                  "closure = {}\n"
                  "while paths:\n"
                  "    path = paths.keys()[0]\n"
                  "    del paths[path]\n"
                  "    if not os.path.exists(path):\n"
                  "        continue\n"
                  "    closure[path] = None\n"
                  "    if os.path.islink(path):\n"
                  "        link_to = os.path.join(os.path.dirname(path),\n"
                  "                               os.readlink(path))\n"
                  "        if link_to not in closure.keys():\n"
                  "            paths[link_to] = None\n"
                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
                          timeout=60).stdout
        return cPickle.loads(output)


    def cleanup_kernels(self, boot_dir='/boot'):
        """
        Remove any kernel image and associated files (vmlinux, system.map,
        modules) for any image found in the boot directory that is not
        referenced by entries in the bootloader configuration.

        @param boot_dir: boot directory path string, default '/boot'
        """
        # NOTE(review): self.bootloader is assumed to be provided by a
        # subclass; it is not defined in this base class.
        # find all the vmlinuz images referenced by the bootloader
        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
        boot_info = self.bootloader.get_entries()
        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
                        for boot in boot_info.itervalues()]

        # find all the unused vmlinuz images in /boot
        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)

        # find all the unused vmlinux images in /boot
        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)

        # find all the unused System.map files in /boot
        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
        all_system_map = self.list_files_glob(systemmap_prefix + '*')
        used_system_map = self.symlink_closure(
            systemmap_prefix + kernver for kernver in used_kernver)
        unused_system_map = set(all_system_map) - set(used_system_map)

        # find all the module directories associated with unused kernels
        modules_prefix = '/lib/modules/'
        all_moddirs = [dir for dir in self.list_files_glob(modules_prefix + '*')
                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', dir)]
        used_moddirs = self.symlink_closure(modules_prefix + kernver
                                            for kernver in used_kernver)
        unused_moddirs = set(all_moddirs) - set(used_moddirs)

        # remove all the vmlinuz files we don't use
        # TODO: if needed this should become package manager agnostic
        for vmlinuz in unused_vmlinuz:
            # try and get an rpm package name
            rpm = self.run('rpm -qf', args=(vmlinuz,),
                           ignore_status=True, timeout=120)
            if rpm.exit_status == 0:
                packages = set(line.strip() for line in
                               rpm.stdout.splitlines())
                # if we found some package names, try to remove them
                for package in packages:
                    self.run('rpm -e', args=(package,),
                             ignore_status=True, timeout=120)
            # remove the image files anyway, even if rpm didn't
            self.run('rm -f', args=(vmlinuz,),
                     ignore_status=True, timeout=120)

        # remove all the vmlinux and System.map files left over
        for f in (unused_vmlinux | unused_system_map):
            self.run('rm -f', args=(f,),
                     ignore_status=True, timeout=120)

        # remove all unused module directories
        # the regex match should keep us safe from removing the wrong files
        for moddir in unused_moddirs:
            self.run('rm -fr', args=(moddir,), ignore_status=True)