base_classes.py revision e4256c81ced9c519f2256bb37119f801edbb9ff1
1# Copyright 2009 Google Inc. Released under the GPL v2
2
3"""
4This module defines the base classes for the Host hierarchy.
5
6Implementation details:
7You should import the "hosts" package instead of importing each type of host.
8
9        Host: a machine on which you can run programs
10"""
11
12__author__ = """
13mbligh@google.com (Martin J. Bligh),
14poirier@google.com (Benjamin Poirier),
15stutsman@google.com (Ryan Stutsman)
16"""
17
18import cPickle, logging, os, re, time
19
20from autotest_lib.client.common_lib import global_config, error, utils
21from autotest_lib.client.common_lib.cros import path_utils
22from autotest_lib.client.common_lib.cros.graphite import autotest_stats
23
24
class Host(object):
    """
    This class represents a machine on which you can run programs.

    It may be a local machine, the one autoserv is running on, a remote
    machine or a virtual machine.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here. You must not instantiate this class but should
    instantiate one of those leaf subclasses.

    When overriding methods that raise NotImplementedError, the leaf class
    is fully responsible for the implementation and should not chain calls
    to super. When overriding methods that are a NOP in Host, the subclass
    should chain calls to super(). The criteria for fitting a new method into
    one category or the other should be:
        1. If two separate generic implementations could reasonably be
           concatenated, then the abstract implementation should pass and
           subclasses should chain calls to super.
        2. If only one class could reasonably perform the stated function
           (e.g. two separate run() implementations cannot both be executed)
           then the method should raise NotImplementedError in Host, and
           the implementor should NOT chain calls to super, to ensure that
           only one implementation ever gets executed.
    """

    # The job this host is associated with, if any; checked for truthiness
    # by record()/log_kernel()/log_op() so those become NOPs without a job.
    job = None
    # Timeouts (in seconds) used by wait_for_restart(); each is overridable
    # through the [HOSTS] section of the global config.
    DEFAULT_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "default_reboot_timeout", type=int, default=1800)
    WAIT_DOWN_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_timeout", type=int, default=840)
    WAIT_DOWN_REBOOT_WARNING = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_warning", type=int, default=540)
    HOURS_TO_WAIT_FOR_RECOVERY = global_config.global_config.get_config_value(
        "HOSTS", "hours_to_wait_for_recovery", type=float, default=2.5)
    # the number of hardware repair requests that need to happen before we
    # actually send machines to hardware repair
    HARDWARE_REPAIR_REQUEST_THRESHOLD = 4
    # Names of the power operations that log_op() may wrap.
    OP_REBOOT = 'reboot'
    OP_SUSPEND = 'suspend'
    PWR_OPERATION = [OP_REBOOT, OP_SUSPEND]
67
68
    def __init__(self, *args, **dargs):
        """Construct a Host, forwarding all arguments to _initialize()."""
        self._initialize(*args, **dargs)
71
72
    def _initialize(self, *args, **dargs):
        """Subclass construction hook; a NOP here, so subclasses should
        chain calls to super() per the class-level convention."""
        pass
75
76
77    @property
78    def job_repo_url_attribute(self):
79        """Get the host attribute name for job_repo_url.
80        """
81        return 'job_repo_url'
82
83
    def close(self):
        """Release any resources held by this host; a NOP here, subclasses
        should chain calls to super()."""
        pass
86
87
    def setup(self):
        """Prepare the host for use; a NOP here, subclasses should chain
        calls to super()."""
        pass
90
91
    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        Abstract: leaf subclasses must provide the single implementation
        and must not chain calls to super (see the class docstring).

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee/stderr_tee: where to tee the stdout/stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        """
        raise NotImplementedError('Run not implemented!')
116
117
118    def run_output(self, command, *args, **dargs):
119        return self.run(command, *args, **dargs).stdout.rstrip()
120
121
    def reboot(self):
        """Reboot the host. Abstract; implemented by leaf subclasses."""
        raise NotImplementedError('Reboot not implemented!')
124
125
    def suspend(self):
        """Suspend the host. Abstract; implemented by leaf subclasses."""
        raise NotImplementedError('Suspend not implemented!')
128
129
    def sysrq_reboot(self):
        """Force a reboot via magic SysRq. Abstract; implemented by leaf
        subclasses."""
        raise NotImplementedError('Sysrq reboot not implemented!')
132
133
    def reboot_setup(self, *args, **dargs):
        """Hook run before a reboot; a NOP here, subclasses should chain
        calls to super()."""
        pass
136
137
    def reboot_followup(self, *args, **dargs):
        """Hook run after a successful reboot (see wait_for_restart); a NOP
        here, subclasses should chain calls to super()."""
        pass
140
141
    def get_file(self, source, dest, delete_dest=False):
        """Copy a file from the host to dest. Abstract; implemented by leaf
        subclasses."""
        raise NotImplementedError('Get file not implemented!')
144
145
    def send_file(self, source, dest, delete_dest=False):
        """Copy a file from source to the host. Abstract; implemented by
        leaf subclasses."""
        raise NotImplementedError('Send file not implemented!')
148
149
    def get_tmp_dir(self):
        """Return a path to a temporary directory on the host. Abstract;
        implemented by leaf subclasses."""
        raise NotImplementedError('Get temp dir not implemented!')
152
153
    def is_up(self):
        """Indicate whether the host is currently reachable. Abstract;
        implemented by leaf subclasses."""
        raise NotImplementedError('Is up not implemented!')
156
157
    def is_shutting_down(self):
        """ Indicates if a machine is currently shutting down.

        Always False here; subclasses with real shutdown detection override.
        """
        return False
161
162
163    def get_wait_up_processes(self):
164        """ Gets the list of local processes to wait for in wait_up. """
165        get_config = global_config.global_config.get_config_value
166        proc_list = get_config("HOSTS", "wait_up_processes",
167                               default="").strip()
168        processes = set(p.strip() for p in proc_list.split(","))
169        processes.discard("")
170        return processes
171
172
173    def get_boot_id(self, timeout=60):
174        """ Get a unique ID associated with the current boot.
175
176        Should return a string with the semantics such that two separate
177        calls to Host.get_boot_id() return the same string if the host did
178        not reboot between the two calls, and two different strings if it
179        has rebooted at least once between the two calls.
180
181        @param timeout The number of seconds to wait before timing out.
182
183        @return A string unique to this boot or None if not available."""
184        BOOT_ID_FILE = '/proc/sys/kernel/random/boot_id'
185        NO_ID_MSG = 'no boot_id available'
186        cmd = 'if [ -f %r ]; then cat %r; else echo %r; fi' % (
187                BOOT_ID_FILE, BOOT_ID_FILE, NO_ID_MSG)
188        boot_id = self.run(cmd, timeout=timeout).stdout.strip()
189        if boot_id == NO_ID_MSG:
190            return None
191        return boot_id
192
193
    def wait_up(self, timeout=None):
        """Wait for the host to come up. Abstract; implemented by leaf
        subclasses."""
        raise NotImplementedError('Wait up not implemented!')
196
197
    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """Wait for the host to go down. Abstract; implemented by leaf
        subclasses."""
        raise NotImplementedError('Wait down not implemented!')
200
201
202    def _construct_host_metadata(self, type_str):
203        """Returns dict of metadata with type_str, hostname, time_recorded.
204
205        @param type_str: String representing _type field in es db.
206            For example: type_str='reboot_total'.
207        """
208        metadata = {
209            'hostname': self.hostname,
210            'time_recorded': time.time(),
211            '_type': type_str,
212        }
213        return metadata
214
215
    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
                         down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
                         down_warning=WAIT_DOWN_REBOOT_WARNING,
                         log_failure=True, old_boot_id=None, **dargs):
        """ Wait for the host to come back from a reboot. This is a generic
        implementation based entirely on wait_up and wait_down.

        @param timeout: seconds to wait for the host to come back up.
        @param down_timeout: seconds to wait for the host to go down first.
        @param down_warning: passed to wait_down as its warning_timer.
        @param log_failure: if True, record an ABORT status entry when the
                host fails to shut down.
        @param old_boot_id: pre-reboot boot id, forwarded to wait_down.
        @param dargs: extra keyword args; 'board' (if present) is used in
                the stats key, and everything is forwarded to
                reboot_followup on success.

        @raises AutoservShutdownError if the host never went down.
        @raises AutoservRebootError if the host never came back up.
        """
        # Stats key includes the board, e.g. 'Reboot.lumpy'; dargs.get()
        # yields 'Reboot.None' when no board was supplied.
        key_string = 'Reboot.%s' % dargs.get('board')

        total_reboot_timer = autotest_stats.Timer('%s.total' % key_string,
                metadata=self._construct_host_metadata('reboot_total'))
        wait_down_timer = autotest_stats.Timer('%s.wait_down' % key_string,
                metadata=self._construct_host_metadata('reboot_down'))

        total_reboot_timer.start()
        wait_down_timer.start()
        if not self.wait_down(timeout=down_timeout,
                              warning_timer=down_warning,
                              old_boot_id=old_boot_id):
            if log_failure:
                self.record("ABORT", None, "reboot.verify", "shut down failed")
            raise error.AutoservShutdownError("Host did not shut down")
        wait_down_timer.stop()
        wait_up_timer = autotest_stats.Timer('%s.wait_up' % key_string,
                metadata=self._construct_host_metadata('reboot_up'))
        wait_up_timer.start()
        # NOTE(review): the timers are only stopped on the success path;
        # on failure they are simply abandoned — presumably intentional so
        # that failed reboots do not pollute the timing stats; confirm with
        # the stats owners.
        if self.wait_up(timeout):
            self.record("GOOD", None, "reboot.verify")
            self.reboot_followup(**dargs)
            wait_up_timer.stop()
            total_reboot_timer.stop()
        else:
            self.record("ABORT", None, "reboot.verify",
                        "Host did not return from reboot")
            raise error.AutoservRebootError("Host did not return from reboot")
250
251
    def verify(self):
        """Verify the host: hardware, then connectivity, then software."""
        self.verify_hardware()
        self.verify_connectivity()
        self.verify_software()
256
257
    def verify_hardware(self):
        """Verify the host's hardware; a NOP here, subclasses should chain
        calls to super()."""
        pass
260
261
    def verify_connectivity(self):
        """Verify connectivity to the host; a NOP here, subclasses should
        chain calls to super()."""
        pass
264
265
    def verify_software(self):
        """Verify the host's software; a NOP here, subclasses should chain
        calls to super()."""
        pass
268
269
270    def check_diskspace(self, path, gb):
271        """Raises an error if path does not have at least gb GB free.
272
273        @param path The path to check for free disk space.
274        @param gb A floating point number to compare with a granularity
275            of 1 MB.
276
277        1000 based SI units are used.
278
279        @raises AutoservDiskFullHostError if path has less than gb GB free.
280        """
281        one_mb = 10 ** 6  # Bytes (SI unit).
282        mb_per_gb = 1000.0
283        logging.info('Checking for >= %s GB of space under %s on machine %s',
284                     gb, path, self.hostname)
285        df = self.run('df -PB %d %s | tail -1' % (one_mb, path)).stdout.split()
286        free_space_gb = int(df[3]) / mb_per_gb
287        if free_space_gb < gb:
288            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
289        else:
290            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
291                free_space_gb, gb, path, self.hostname)
292
293
294    def check_inodes(self, path, min_kilo_inodes):
295        """Raises an error if a file system is short on i-nodes.
296
297        @param path The path to check for free i-nodes.
298        @param min_kilo_inodes Minimum number of i-nodes required,
299                               in units of 1000 i-nodes.
300
301        @raises AutoservNoFreeInodesError If the minimum required
302                                  i-node count isn't available.
303        """
304        min_inodes = 1000 * min_kilo_inodes
305        logging.info('Checking for >= %d i-nodes under %s '
306                     'on machine %s', min_inodes, path, self.hostname)
307        df = self.run('df -Pi %s | tail -1' % path).stdout.split()
308        free_inodes = int(df[3])
309        if free_inodes < min_inodes:
310            raise error.AutoservNoFreeInodesError(path, min_inodes,
311                                                  free_inodes)
312        else:
313            logging.info('Found %d >= %d i-nodes under %s on '
314                         'machine %s', free_inodes, min_inodes,
315                         path, self.hostname)
316
317
318    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
319        """Empty a given directory path contents."""
320        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
321        self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout)
322
323
    def repair(self):
        """Try and get the host to pass `self.verify()`.

        Base implementation simply re-runs verify(); subclasses add actual
        repair actions.
        """
        self.verify()
327
328
329    def disable_ipfilters(self):
330        """Allow all network packets in and out of the host."""
331        self.run('iptables-save > /tmp/iptable-rules')
332        self.run('iptables -P INPUT ACCEPT')
333        self.run('iptables -P FORWARD ACCEPT')
334        self.run('iptables -P OUTPUT ACCEPT')
335
336
337    def enable_ipfilters(self):
338        """Re-enable the IP filters disabled from disable_ipfilters()"""
339        if self.path_exists('/tmp/iptable-rules'):
340            self.run('iptables-restore < /tmp/iptable-rules')
341
342
    def cleanup(self):
        """Restore the host to a pristine-ish state; a NOP here, subclasses
        should chain calls to super()."""
        pass
345
346
    def machine_install(self):
        """Install an OS image on the host. Abstract; implemented by leaf
        subclasses."""
        raise NotImplementedError('Machine install not implemented!')
349
350
    def install(self, installableObject):
        """Install the given object on this host by delegating to its own
        install() method.

        @param installableObject: object exposing install(host).
        """
        installableObject.install(self)
353
354
    def get_autodir(self):
        """Return the autotest directory on the host. Abstract; implemented
        by leaf subclasses."""
        raise NotImplementedError('Get autodir not implemented!')
357
358
    def set_autodir(self):
        """Set the autotest directory on the host. Abstract; implemented by
        leaf subclasses."""
        raise NotImplementedError('Set autodir not implemented!')
361
362
    def start_loggers(self):
        """ Called to start continuous host logging. A NOP here; subclasses
        should chain calls to super(). """
        pass
366
367
    def stop_loggers(self):
        """ Called to stop continuous host logging. A NOP here; subclasses
        should chain calls to super(). """
        pass
371
372
373    # some extra methods simplify the retrieval of information about the
374    # Host machine, with generic implementations based on run(). subclasses
375    # should feel free to override these if they can provide better
376    # implementations for their specific Host types
377
378    def get_num_cpu(self):
379        """ Get the number of CPUs in the host according to /proc/cpuinfo. """
380        proc_cpuinfo = self.run('cat /proc/cpuinfo',
381                                stdout_tee=open(os.devnull, 'w')).stdout
382        cpus = 0
383        for line in proc_cpuinfo.splitlines():
384            if line.startswith('processor'):
385                cpus += 1
386        return cpus
387
388
389    def get_arch(self):
390        """ Get the hardware architecture of the remote machine. """
391        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
392        arch = self.run('%s -m' % cmd_uname).stdout.rstrip()
393        if re.match(r'i\d86$', arch):
394            arch = 'i386'
395        return arch
396
397
398    def get_kernel_ver(self):
399        """ Get the kernel version of the remote machine. """
400        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
401        return self.run('%s -r' % cmd_uname).stdout.rstrip()
402
403
404    def get_cmdline(self):
405        """ Get the kernel command line of the remote machine. """
406        return self.run('cat /proc/cmdline').stdout.rstrip()
407
408
409    def get_meminfo(self):
410        """ Get the kernel memory info (/proc/meminfo) of the remote machine
411        and return a dictionary mapping the various statistics. """
412        meminfo_dict = {}
413        meminfo = self.run('cat /proc/meminfo').stdout.splitlines()
414        for key, val in (line.split(':', 1) for line in meminfo):
415            meminfo_dict[key.strip()] = val.strip()
416        return meminfo_dict
417
418
419    def path_exists(self, path):
420        """ Determine if path exists on the remote machine. """
421        result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path),
422                          ignore_status=True)
423        return result.exit_status == 0
424
425
426    # some extra helpers for doing job-related operations
427
428    def record(self, *args, **dargs):
429        """ Helper method for recording status logs against Host.job that
430        silently becomes a NOP if Host.job is not available. The args and
431        dargs are passed on to Host.job.record unchanged. """
432        if self.job:
433            self.job.record(*args, **dargs)
434
435
436    def log_kernel(self):
437        """ Helper method for logging kernel information into the status logs.
438        Intended for cases where the "current" kernel is not really defined
439        and we want to explicitly log it. Does nothing if this host isn't
440        actually associated with a job. """
441        if self.job:
442            kernel = self.get_kernel_ver()
443            self.job.record("INFO", None, None,
444                            optional_fields={"kernel": kernel})
445
446
    def log_op(self, op, op_func):
        """ Decorator for wrapping a management operation in a group for status
        logging purposes.

        @param op: name of the operation.
        @param op_func: a function that carries out the operation
                        (reboot, suspend)
        """
        # RUNNING_LOG_OP acts as a reentrancy guard: if a logged operation
        # triggers another one (directly or indirectly), the inner call
        # skips the job-group bookkeeping and just runs op_func.
        if self.job and not hasattr(self, "RUNNING_LOG_OP"):
            self.RUNNING_LOG_OP = True
            try:
                self.job.run_op(op, op_func, self.get_kernel_ver)
            finally:
                # Always clear the guard, even if op_func raised.
                del self.RUNNING_LOG_OP
        else:
            op_func()
463
464
    def list_files_glob(self, glob):
        """
        Get a list of files on a remote host given a glob pattern path.

        @param glob: glob pattern string, expanded on the remote machine by
                python's glob.glob.

        @return list of matching remote path strings.
        """
        # The remote python pickles its result to stdout (protocol 0,
        # ASCII-safe) so it can be reconstructed here; requires a python 2
        # interpreter on the remote machine (cPickle).
        SCRIPT = ("python -c 'import cPickle, glob, sys;"
                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
                          timeout=60).stdout
        # NOTE(review): unpickling remote output is only acceptable because
        # the remote host is trusted test infrastructure; never do this with
        # untrusted data.
        return cPickle.loads(output)
474
475
    def symlink_closure(self, paths):
        """
        Given a sequence of path strings, return the set of all paths that
        can be reached from the initial set by following symlinks.

        @param paths: sequence of path strings.
        @return: a sequence of path strings that are all the unique paths that
                can be reached from the given ones after following symlinks.
        """
        # The remote python 2 script does a worklist traversal: it treats
        # the 'paths' dict as a work queue and 'closure' as the visited set
        # (dicts with None values are used as sets), following one level of
        # symlink per iteration. Input and output are exchanged as protocol-0
        # pickles over stdin/stdout.
        SCRIPT = ("python -c 'import cPickle, os, sys\n"
                  "paths = cPickle.load(sys.stdin)\n"
                  "closure = {}\n"
                  "while paths:\n"
                  "    path = paths.keys()[0]\n"
                  "    del paths[path]\n"
                  "    if not os.path.exists(path):\n"
                  "        continue\n"
                  "    closure[path] = None\n"
                  "    if os.path.islink(path):\n"
                  "        link_to = os.path.join(os.path.dirname(path),\n"
                  "                               os.readlink(path))\n"
                  "        if link_to not in closure.keys():\n"
                  "            paths[link_to] = None\n"
                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
                          timeout=60).stdout
        return cPickle.loads(output)
504
505
    def cleanup_kernels(self, boot_dir='/boot'):
        """
        Remove any kernel image and associated files (vmlinux, system.map,
        modules) for any image found in the boot directory that is not
        referenced by entries in the bootloader configuration.

        @param boot_dir: boot directory path string, default '/boot'
        """
        # find all the vmlinuz images referenced by the bootloader
        # NOTE(review): assumes self.bootloader is provided by a subclass or
        # mixin — it is not defined in this base class; confirm before use.
        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
        boot_info = self.bootloader.get_entries()
        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
                        for boot in boot_info.itervalues()]

        # find all the unused vmlinuz images in /boot
        # symlink_closure keeps symlinked duplicates of in-use images from
        # being treated as unused.
        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)

        # find all the unused vmlinux images in /boot
        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)

        # find all the unused System.map files in /boot
        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
        all_system_map = self.list_files_glob(systemmap_prefix + '*')
        used_system_map = self.symlink_closure(
            systemmap_prefix + kernver for kernver in used_kernver)
        unused_system_map = set(all_system_map) - set(used_system_map)

        # find all the module directories associated with unused kernels
        modules_prefix = '/lib/modules/'
        # only directories that look like kernel versions (x.y.z...) are
        # candidates for removal.
        all_moddirs = [dir for dir in self.list_files_glob(modules_prefix + '*')
                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', dir)]
        used_moddirs = self.symlink_closure(modules_prefix + kernver
                                            for kernver in used_kernver)
        unused_moddirs = set(all_moddirs) - set(used_moddirs)

        # remove all the vmlinuz files we don't use
        # TODO: if needed this should become package manager agnostic
        for vmlinuz in unused_vmlinuz:
            # try and get an rpm package name
            rpm = self.run('rpm -qf', args=(vmlinuz,),
                           ignore_status=True, timeout=120)
            if rpm.exit_status == 0:
                packages = set(line.strip() for line in
                               rpm.stdout.splitlines())
                # if we found some package names, try to remove them
                for package in packages:
                    self.run('rpm -e', args=(package,),
                             ignore_status=True, timeout=120)
            # remove the image files anyway, even if rpm didn't
            self.run('rm -f', args=(vmlinuz,),
                     ignore_status=True, timeout=120)

        # remove all the vmlinux and System.map files left over
        for f in (unused_vmlinux | unused_system_map):
            self.run('rm -f', args=(f,),
                     ignore_status=True, timeout=120)

        # remove all unused module directories
        # the regex match should keep us safe from removing the wrong files
        for moddir in unused_moddirs:
            self.run('rm -fr', args=(moddir,), ignore_status=True)
574
575
    def get_attributes_to_clear_before_provision(self):
        """Get a list of attributes to be cleared before machine_install starts.

        If provision runs in a lab environment, it is necessary to clear certain
        host attributes for the host in afe_host_attributes table. For example,
        `job_repo_url` is a devserver url pointed to autotest packages for
        CrosHost, it needs to be removed before provision starts for tests to
        run reliably.
        For ADBHost, the job repo url has a different format, i.e., appended by
        adb_serial, so this method should be overridden in ADBHost.

        @return list of host attribute names to clear.
        """
        return ['job_repo_url']
588