1# Copyright 2009 Google Inc. Released under the GPL v2
2
3"""
4This module defines the base classes for the Host hierarchy.
5
6Implementation details:
7You should import the "hosts" package instead of importing each type of host.
8
9        Host: a machine on which you can run programs
10"""
11
# Historical module authors; kept for attribution only.
__author__ = """
mbligh@google.com (Martin J. Bligh),
poirier@google.com (Benjamin Poirier),
stutsman@google.com (Ryan Stutsman)
"""
17
18import cPickle, cStringIO, logging, os, re, time
19
20from autotest_lib.client.common_lib import global_config, error, utils
21from autotest_lib.client.common_lib.cros import path_utils
22from autotest_lib.client.common_lib.cros.graphite import autotest_stats
23from autotest_lib.client.bin import partition
24
25
class Host(object):
    """
    This class represents a machine on which you can run programs.

    It may be a local machine, the one autoserv is running on, a remote
    machine or a virtual machine.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here. You must not instantiate this class but should
    instantiate one of those leaf subclasses.

    When overriding methods that raise NotImplementedError, the leaf class
    is fully responsible for the implementation and should not chain calls
    to super. When overriding methods that are a NOP in Host, the subclass
    should chain calls to super(). The criteria for fitting a new method into
    one category or the other should be:
        1. If two separate generic implementations could reasonably be
           concatenated, then the abstract implementation should pass and
           subclasses should chain calls to super.
        2. If only one class could reasonably perform the stated function
           (e.g. two separate run() implementations cannot both be executed)
           then the method should raise NotImplementedError in Host, and
           the implementor should NOT chain calls to super, to ensure that
           only one implementation ever gets executed.
    """

    # Reference to the currently running job; assigned externally (by the
    # framework or subclasses) so that record()/log_kernel()/log_op() can
    # emit status-log entries. Remains None when no job is attached.
    job = None
    # Seconds to wait for the host to come back up after a reboot.
    DEFAULT_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "default_reboot_timeout", type=int, default=1800)
    # Seconds to wait for the host to go down during a reboot.
    WAIT_DOWN_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_timeout", type=int, default=840)
    # Seconds after which a warning is issued while waiting for shutdown.
    WAIT_DOWN_REBOOT_WARNING = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_warning", type=int, default=540)
    # Hours to wait for a machine to recover before giving up on it.
    HOURS_TO_WAIT_FOR_RECOVERY = global_config.global_config.get_config_value(
        "HOSTS", "hours_to_wait_for_recovery", type=float, default=2.5)
    # the number of hardware repair requests that need to happen before we
    # actually send machines to hardware repair
    HARDWARE_REPAIR_REQUEST_THRESHOLD = 4
    # Names of the power operations supported by log_op()/run_op() callers.
    OP_REBOOT = 'reboot'
    OP_SUSPEND = 'suspend'
    PWR_OPERATION = [OP_REBOOT, OP_SUSPEND]


    def __init__(self, *args, **dargs):
        """Construct the host by delegating to _initialize().

        Subclasses override _initialize() rather than __init__() so that
        cooperative multiple inheritance in the hosts hierarchy works.
        """
        self._initialize(*args, **dargs)


    def _initialize(self, *args, **dargs):
        """Subclass hook for construction; NOP here, chain to super()."""
        pass


    def close(self):
        """Release any resources held by this host; NOP in the base class."""
        pass


    def setup(self):
        """Prepare the host for use; NOP here, subclasses may chain super()."""
        pass


    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee/stderr_tee: where to tee the stdout/stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        """
        raise NotImplementedError('Run not implemented!')


    def run_output(self, command, *args, **dargs):
        """Run a command and return its stdout with trailing whitespace
        stripped. All arguments are passed through to run()."""
        return self.run(command, *args, **dargs).stdout.rstrip()


    def reboot(self):
        """Reboot the host; must be implemented by leaf subclasses."""
        raise NotImplementedError('Reboot not implemented!')


    def suspend(self):
        """Suspend the host; must be implemented by leaf subclasses."""
        raise NotImplementedError('Suspend not implemented!')


    def sysrq_reboot(self):
        """Force a reboot via the kernel SysRq mechanism; must be
        implemented by leaf subclasses."""
        raise NotImplementedError('Sysrq reboot not implemented!')


    def reboot_setup(self, *args, **dargs):
        """Hook invoked before a reboot; NOP here, chain to super()."""
        pass


    def reboot_followup(self, *args, **dargs):
        """Hook invoked after a successful reboot; NOP here, chain to
        super()."""
        pass


    def get_file(self, source, dest, delete_dest=False):
        """Copy a file from the host to the local machine; must be
        implemented by leaf subclasses."""
        raise NotImplementedError('Get file not implemented!')


    def send_file(self, source, dest, delete_dest=False):
        """Copy a file from the local machine to the host; must be
        implemented by leaf subclasses."""
        raise NotImplementedError('Send file not implemented!')


    def get_tmp_dir(self):
        """Return a fresh temporary directory on the host; must be
        implemented by leaf subclasses."""
        raise NotImplementedError('Get temp dir not implemented!')


    def is_up(self):
        """Report whether the host is up and usable; must be implemented
        by leaf subclasses."""
        raise NotImplementedError('Is up not implemented!')


    def is_shutting_down(self):
        """ Indicates whether a machine is currently shutting down. """
        return False


    def get_wait_up_processes(self):
        """ Gets the list of local processes to wait for in wait_up. """
        get_config = global_config.global_config.get_config_value
        proc_list = get_config("HOSTS", "wait_up_processes",
                               default="").strip()
        processes = set(p.strip() for p in proc_list.split(","))
        # Drop the empty string produced when the config value is blank.
        processes.discard("")
        return processes


    def get_boot_id(self, timeout=60):
        """ Get a unique ID associated with the current boot.

        Should return a string with the semantics such that two separate
        calls to Host.get_boot_id() return the same string if the host did
        not reboot between the two calls, and two different strings if it
        has rebooted at least once between the two calls.

        @param timeout The number of seconds to wait before timing out.

        @return A string unique to this boot or None if not available."""
        BOOT_ID_FILE = '/proc/sys/kernel/random/boot_id'
        NO_ID_MSG = 'no boot_id available'
        # Echo a sentinel instead of failing when the file is missing
        # (e.g. on kernels without /proc/sys/kernel/random/boot_id).
        cmd = 'if [ -f %r ]; then cat %r; else echo %r; fi' % (
                BOOT_ID_FILE, BOOT_ID_FILE, NO_ID_MSG)
        boot_id = self.run(cmd, timeout=timeout).stdout.strip()
        if boot_id == NO_ID_MSG:
            return None
        return boot_id


    def wait_up(self, timeout=None):
        """Wait for the host to come up; must be implemented by leaf
        subclasses."""
        raise NotImplementedError('Wait up not implemented!')


    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """Wait for the host to go down; must be implemented by leaf
        subclasses."""
        raise NotImplementedError('Wait down not implemented!')


    def _construct_host_metadata(self, type_str):
        """Returns dict of metadata with type_str, hostname, time_recorded.

        @param type_str: String representing _type field in es db.
            For example: type_str='reboot_total'.
        """
        # NOTE(review): self.hostname is not defined in this base class;
        # it is assumed to be provided by subclasses — confirm in callers.
        metadata = {
            'hostname': self.hostname,
            'time_recorded': time.time(),
            '_type': type_str,
        }
        return metadata


    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
                         down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
                         down_warning=WAIT_DOWN_REBOOT_WARNING,
                         log_failure=True, old_boot_id=None, **dargs):
        """ Wait for the host to come back from a reboot. This is a generic
        implementation based entirely on wait_up and wait_down.

        @param timeout: seconds to wait for the host to come back up.
        @param down_timeout: seconds to wait for the host to go down.
        @param down_warning: seconds after which to warn while waiting
                for shutdown.
        @param log_failure: if True, record an ABORT status log entry
                when shutdown fails.
        @param old_boot_id: boot id from before the reboot, passed to
                wait_down() so it can detect the boot transition.
        @param dargs: extra keyword args forwarded to reboot_followup();
                'board' (if present) is also used to key the stats timers.

        @raises AutoservShutdownError: the host never went down.
        @raises AutoservRebootError: the host went down but did not
                come back up within the timeout.
        """
        key_string = 'Reboot.%s' % dargs.get('board')

        # Timers report reboot phases to the graphite/es stats backend.
        total_reboot_timer = autotest_stats.Timer('%s.total' % key_string,
                metadata=self._construct_host_metadata('reboot_total'))
        wait_down_timer = autotest_stats.Timer('%s.wait_down' % key_string,
                metadata=self._construct_host_metadata('reboot_down'))

        total_reboot_timer.start()
        wait_down_timer.start()
        if not self.wait_down(timeout=down_timeout,
                              warning_timer=down_warning,
                              old_boot_id=old_boot_id):
            if log_failure:
                self.record("ABORT", None, "reboot.verify", "shut down failed")
            raise error.AutoservShutdownError("Host did not shut down")
        wait_down_timer.stop()
        wait_up_timer = autotest_stats.Timer('%s.wait_up' % key_string,
                metadata=self._construct_host_metadata('reboot_up'))
        wait_up_timer.start()
        if self.wait_up(timeout):
            self.record("GOOD", None, "reboot.verify")
            self.reboot_followup(**dargs)
            wait_up_timer.stop()
            total_reboot_timer.stop()
        else:
            self.record("ABORT", None, "reboot.verify",
                        "Host did not return from reboot")
            raise error.AutoservRebootError("Host did not return from reboot")


    def verify(self):
        """Check that the host is healthy: hardware, connectivity, then
        software. Each phase raises on failure."""
        self.verify_hardware()
        self.verify_connectivity()
        self.verify_software()


    def verify_hardware(self):
        """Hardware verification hook; NOP here, chain to super()."""
        pass


    def verify_connectivity(self):
        """Connectivity verification hook; NOP here, chain to super()."""
        pass


    def verify_software(self):
        """Software verification hook; NOP here, chain to super()."""
        pass


    def check_diskspace(self, path, gb):
        """Raises an error if path does not have at least gb GB free.

        @param path The path to check for free disk space.
        @param gb A floating point number to compare with a granularity
            of 1 MB.

        1000 based SI units are used.

        @raises AutoservDiskFullHostError if path has less than gb GB free.
        """
        one_mb = 10 ** 6  # Bytes (SI unit).
        mb_per_gb = 1000.0
        logging.info('Checking for >= %s GB of space under %s on machine %s',
                     gb, path, self.hostname)
        # df -P gives POSIX single-line output; -B sets the block size so
        # column 4 (df[3]) is the available space in MB.
        df = self.run('df -PB %d %s | tail -1' % (one_mb, path)).stdout.split()
        free_space_gb = int(df[3]) / mb_per_gb
        if free_space_gb < gb:
            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
        else:
            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
                free_space_gb, gb, path, self.hostname)


    def check_inodes(self, path, min_kilo_inodes):
        """Raises an error if a file system is short on i-nodes.

        @param path The path to check for free i-nodes.
        @param min_kilo_inodes Minimum number of i-nodes required,
                               in units of 1000 i-nodes.

        @raises AutoservNoFreeInodesError If the minimum required
                                  i-node count isn't available.
        """
        min_inodes = 1000 * min_kilo_inodes
        logging.info('Checking for >= %d i-nodes under %s '
                     'on machine %s', min_inodes, path, self.hostname)
        # df -i reports i-node usage; column 4 (df[3]) is free i-nodes.
        df = self.run('df -Pi %s | tail -1' % path).stdout.split()
        free_inodes = int(df[3])
        if free_inodes < min_inodes:
            raise error.AutoservNoFreeInodesError(path, min_inodes,
                                                  free_inodes)
        else:
            logging.info('Found %d >= %d i-nodes under %s on '
                         'machine %s', free_inodes, min_inodes,
                         path, self.hostname)


    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
        """Empty a given directory path contents."""
        # -mindepth/-maxdepth 1 removes only direct children, keeping the
        # directory itself; -print0/xargs -0 handles names with whitespace.
        # NOTE(review): path is interpolated into the shell command without
        # sh_escape — callers must pass a quoting-safe, trusted path.
        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
        self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout)


    def repair(self):
        """Try and get the host to pass `self.verify()`."""
        self.verify()


    def disable_ipfilters(self):
        """Allow all network packets in and out of the host.

        The current rules are saved to /tmp/iptable-rules so that
        enable_ipfilters() can restore them later.
        """
        self.run('iptables-save > /tmp/iptable-rules')
        self.run('iptables -P INPUT ACCEPT')
        self.run('iptables -P FORWARD ACCEPT')
        self.run('iptables -P OUTPUT ACCEPT')


    def enable_ipfilters(self):
        """Re-enable the IP filters disabled from disable_ipfilters()"""
        # Only restore if disable_ipfilters() previously saved the rules.
        if self.path_exists('/tmp/iptable-rules'):
            self.run('iptables-restore < /tmp/iptable-rules')


    def cleanup(self):
        """Post-job cleanup hook; NOP here, chain to super()."""
        pass


    def machine_install(self):
        """Install an OS on the host; must be implemented by leaf
        subclasses."""
        raise NotImplementedError('Machine install not implemented!')


    def install(self, installableObject):
        """Install the given object onto this host by delegating to its
        install() method.

        @param installableObject: object exposing install(host).
        """
        installableObject.install(self)


    def get_autodir(self):
        """Return the autotest install directory on the host; must be
        implemented by leaf subclasses."""
        raise NotImplementedError('Get autodir not implemented!')


    def set_autodir(self):
        """Set the autotest install directory on the host; must be
        implemented by leaf subclasses."""
        raise NotImplementedError('Set autodir not implemented!')


    def start_loggers(self):
        """ Called to start continuous host logging. """
        pass


    def stop_loggers(self):
        """ Called to stop continuous host logging. """
        pass


    # some extra methods simplify the retrieval of information about the
    # Host machine, with generic implementations based on run(). subclasses
    # should feel free to override these if they can provide better
    # implementations for their specific Host types

    def get_num_cpu(self):
        """ Get the number of CPUs in the host according to /proc/cpuinfo. """
        # Suppress tee-to-logs output; /proc/cpuinfo can be long.
        proc_cpuinfo = self.run('cat /proc/cpuinfo',
                                stdout_tee=open(os.devnull, 'w')).stdout
        cpus = 0
        for line in proc_cpuinfo.splitlines():
            # Each logical CPU contributes one 'processor' line.
            if line.startswith('processor'):
                cpus += 1
        return cpus


    def get_arch(self):
        """ Get the hardware architecture of the remote machine. """
        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
        arch = self.run('%s -m' % cmd_uname).stdout.rstrip()
        # Normalize i386/i486/i586/i686 to plain 'i386'.
        if re.match(r'i\d86$', arch):
            arch = 'i386'
        return arch


    def get_kernel_ver(self):
        """ Get the kernel version of the remote machine. """
        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
        return self.run('%s -r' % cmd_uname).stdout.rstrip()


    def get_cmdline(self):
        """ Get the kernel command line of the remote machine. """
        return self.run('cat /proc/cmdline').stdout.rstrip()


    def get_meminfo(self):
        """ Get the kernel memory info (/proc/meminfo) of the remote machine
        and return a dictionary mapping the various statistics. """
        meminfo_dict = {}
        meminfo = self.run('cat /proc/meminfo').stdout.splitlines()
        # Each line is of the form 'Key:   value'; split on the first colon.
        for key, val in (line.split(':', 1) for line in meminfo):
            meminfo_dict[key.strip()] = val.strip()
        return meminfo_dict


    def path_exists(self, path):
        """ Determine if path exists on the remote machine. """
        result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path),
                          ignore_status=True)
        # ls exits 0 iff the path exists.
        return result.exit_status == 0


    # some extra helpers for doing job-related operations

    def record(self, *args, **dargs):
        """ Helper method for recording status logs against Host.job that
        silently becomes a NOP if Host.job is not available. The args and
        dargs are passed on to Host.job.record unchanged. """
        if self.job:
            self.job.record(*args, **dargs)


    def log_kernel(self):
        """ Helper method for logging kernel information into the status logs.
        Intended for cases where the "current" kernel is not really defined
        and we want to explicitly log it. Does nothing if this host isn't
        actually associated with a job. """
        if self.job:
            kernel = self.get_kernel_ver()
            self.job.record("INFO", None, None,
                            optional_fields={"kernel": kernel})


    def log_op(self, op, op_func):
        """ Decorator for wrapping a management operation in a group for status
        logging purposes.

        @param op: name of the operation.
        @param op_func: a function that carries out the operation
                        (reboot, suspend)
        """
        # RUNNING_LOG_OP guards against re-entrant logging if op_func itself
        # triggers another log_op on the same host.
        if self.job and not hasattr(self, "RUNNING_LOG_OP"):
            self.RUNNING_LOG_OP = True
            try:
                self.job.run_op(op, op_func, self.get_kernel_ver)
            finally:
                del self.RUNNING_LOG_OP
        else:
            op_func()


    def list_files_glob(self, glob):
        """
        Get a list of files on a remote host given a glob pattern path.

        @param glob: glob pattern string, expanded on the remote host.
        @return list of matching path strings.
        """
        # Run a small python snippet remotely and pickle the result back.
        SCRIPT = ("python -c 'import cPickle, glob, sys;"
                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
                          timeout=60).stdout
        return cPickle.loads(output)


    def symlink_closure(self, paths):
        """
        Given a sequence of path strings, return the set of all paths that
        can be reached from the initial set by following symlinks.

        @param paths: sequence of path strings.
        @return: a sequence of path strings that are all the unique paths that
                can be reached from the given ones after following symlinks.
        """
        # Remote worklist algorithm: pop a path, record it if it exists,
        # and queue the target of any symlink not already seen. Input and
        # output are exchanged as pickles over stdin/stdout.
        SCRIPT = ("python -c 'import cPickle, os, sys\n"
                  "paths = cPickle.load(sys.stdin)\n"
                  "closure = {}\n"
                  "while paths:\n"
                  "    path = paths.keys()[0]\n"
                  "    del paths[path]\n"
                  "    if not os.path.exists(path):\n"
                  "        continue\n"
                  "    closure[path] = None\n"
                  "    if os.path.islink(path):\n"
                  "        link_to = os.path.join(os.path.dirname(path),\n"
                  "                               os.readlink(path))\n"
                  "        if link_to not in closure.keys():\n"
                  "            paths[link_to] = None\n"
                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
                          timeout=60).stdout
        return cPickle.loads(output)


    def cleanup_kernels(self, boot_dir='/boot'):
        """
        Remove any kernel image and associated files (vmlinux, system.map,
        modules) for any image found in the boot directory that is not
        referenced by entries in the bootloader configuration.

        @param boot_dir: boot directory path string, default '/boot'
        """
        # find all the vmlinuz images referenced by the bootloader
        # NOTE(review): self.bootloader is not defined in this base class;
        # it is assumed to be attached by the framework — confirm in callers.
        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
        boot_info = self.bootloader.get_entries()
        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
                        for boot in boot_info.itervalues()]

        # find all the unused vmlinuz images in /boot
        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)

        # find all the unused vmlinux images in /boot
        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)

        # find all the unused System.map files in /boot
        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
        all_system_map = self.list_files_glob(systemmap_prefix + '*')
        used_system_map = self.symlink_closure(
            systemmap_prefix + kernver for kernver in used_kernver)
        unused_system_map = set(all_system_map) - set(used_system_map)

        # find all the module directories associated with unused kernels
        modules_prefix = '/lib/modules/'
        all_moddirs = [dir for dir in self.list_files_glob(modules_prefix + '*')
                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', dir)]
        used_moddirs = self.symlink_closure(modules_prefix + kernver
                                            for kernver in used_kernver)
        unused_moddirs = set(all_moddirs) - set(used_moddirs)

        # remove all the vmlinuz files we don't use
        # TODO: if needed this should become package manager agnostic
        for vmlinuz in unused_vmlinuz:
            # try and get an rpm package name
            rpm = self.run('rpm -qf', args=(vmlinuz,),
                           ignore_status=True, timeout=120)
            if rpm.exit_status == 0:
                packages = set(line.strip() for line in
                               rpm.stdout.splitlines())
                # if we found some package names, try to remove them
                for package in packages:
                    self.run('rpm -e', args=(package,),
                             ignore_status=True, timeout=120)
            # remove the image files anyway, even if rpm didn't
            self.run('rm -f', args=(vmlinuz,),
                     ignore_status=True, timeout=120)

        # remove all the vmlinux and System.map files left over
        for f in (unused_vmlinux | unused_system_map):
            self.run('rm -f', args=(f,),
                     ignore_status=True, timeout=120)

        # remove all unused module directories
        # the regex match should keep us safe from removing the wrong files
        for moddir in unused_moddirs:
            self.run('rm -fr', args=(moddir,), ignore_status=True)
568