base_classes.py revision 3649141cea5fda511978483ddf8709221f4c57fd
1# Copyright 2009 Google Inc. Released under the GPL v2
2
3"""
4This module defines the base classes for the Host hierarchy.
5
6Implementation details:
7You should import the "hosts" package instead of importing each type of host.
8
9        Host: a machine on which you can run programs
10"""
11
12__author__ = """
13mbligh@google.com (Martin J. Bligh),
14poirier@google.com (Benjamin Poirier),
15stutsman@google.com (Ryan Stutsman)
16"""
17
18import cPickle, cStringIO, logging, os, re, time
19
20from autotest_lib.client.common_lib import global_config, error, utils
21from autotest_lib.client.common_lib import host_protections
22from autotest_lib.client.bin import partition
23
24
class Host(object):
    """
    This class represents a machine on which you can run programs.

    It may be a local machine, the one autoserv is running on, a remote
    machine or a virtual machine.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here. You must not instantiate this class but should
    instantiate one of those leaf subclasses.

    When overriding methods that raise NotImplementedError, the leaf class
    is fully responsible for the implementation and should not chain calls
    to super. When overriding methods that are a NOP in Host, the subclass
    should chain calls to super(). The criteria for fitting a new method into
    one category or the other should be:
        1. If two separate generic implementations could reasonably be
           concatenated, then the abstract implementation should pass and
           subclasses should chain calls to super.
        2. If only one class could reasonably perform the stated function
           (e.g. two separate run() implementations cannot both be executed)
           then the method should raise NotImplementedError in Host, and
           the implementor should NOT chain calls to super, to ensure that
           only one implementation ever gets executed.
    """

    # Job this host is associated with, if any. record(), log_kernel() and
    # log_reboot() skip their status logging while this is unset.
    job = None
    # Default value of wait_for_restart()'s timeout, which is handed to
    # wait_up() (presumably seconds -- confirm against wait_up()).
    DEFAULT_REBOOT_TIMEOUT = 1800
    # Passed by wait_for_restart() to wait_down() as its timeout and
    # warning_timer arguments respectively.
    WAIT_DOWN_REBOOT_TIMEOUT = 840
    WAIT_DOWN_REBOOT_WARNING = 540
    # Multiplied by 3600 in _repair_wait_for_reboot() to obtain the timeout
    # used while waiting for a host to recover.
    HOURS_TO_WAIT_FOR_RECOVERY = 2.5
57
58
    def __init__(self, *args, **dargs):
        """
        Construct the host by forwarding every argument to _initialize().
        """
        self._initialize(*args, **dargs)
61
62
63    def _initialize(self, *args, **dargs):
64        self._already_repaired = []
65        self._removed_files = False
66
67
    def close(self):
        """
        No-op hook in the base Host.

        As described in the class docstring, subclasses overriding a NOP
        method should chain the call to super().
        """
        pass
70
71
    def setup(self):
        """
        No-op hook in the base Host.

        As described in the class docstring, subclasses overriding a NOP
        method should chain the call to super().
        """
        pass
74
75
    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        Abstract in Host: this base implementation always raises
        NotImplementedError. Per the class docstring, the leaf class is fully
        responsible for the implementation and must not chain to super.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee/stderr_tee: where to tee the stdout/stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Run not implemented!')
100
101
102    def run_output(self, command, *args, **dargs):
103        return self.run(command, *args, **dargs).stdout.rstrip()
104
105
    def reboot(self):
        """
        Reboot the host. Abstract in Host.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Reboot not implemented!')
108
109
    def sysrq_reboot(self):
        """
        Reboot the host via sysrq. Abstract in Host.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Sysrq reboot not implemented!')
112
113
    def reboot_setup(self, *args, **dargs):
        """
        No-op hook in the base Host; all arguments are ignored here.

        NOTE(review): presumably invoked by subclasses before a reboot is
        issued -- confirm against the reboot() implementations.
        """
        pass
116
117
    def reboot_followup(self, *args, **dargs):
        """
        No-op hook in the base Host; all arguments are ignored here.

        wait_for_restart() calls this with its extra keyword arguments once
        the host has come back up.
        """
        pass
120
121
    def get_file(self, source, dest, delete_dest=False):
        """
        Abstract file transfer hook.

        NOTE(review): presumably copies source on the host to dest on the
        machine running this code, with delete_dest controlling removal of
        pre-existing destination content -- confirm with the subclasses.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Get file not implemented!')
124
125
    def send_file(self, source, dest, delete_dest=False):
        """
        Abstract file transfer hook.

        NOTE(review): presumably copies a local source to dest on the host,
        with delete_dest controlling removal of pre-existing destination
        content -- confirm with the subclasses.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Send file not implemented!')
128
129
    def get_tmp_dir(self):
        """
        Abstract hook for obtaining a temporary directory.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Get temp dir not implemented!')
132
133
    def is_up(self):
        """
        Abstract check of whether the host is up.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Is up not implemented!')
136
137
138    def is_shutting_down(self):
139        """ Indicates is a machine is currently shutting down. """
140        # runlevel() may not be available, so wrap it in try block.
141        try:
142            runlevel = int(self.run("runlevel").stdout.strip().split()[1])
143            return runlevel in (0, 6)
144        except:
145            return False
146
147
148    def get_wait_up_processes(self):
149        """ Gets the list of local processes to wait for in wait_up. """
150        get_config = global_config.global_config.get_config_value
151        proc_list = get_config("HOSTS", "wait_up_processes",
152                               default="").strip()
153        processes = set(p.strip() for p in proc_list.split(","))
154        processes.discard("")
155        return processes
156
157
158    def get_boot_id(self, timeout=60):
159        """ Get a unique ID associated with the current boot.
160
161        Should return a string with the semantics such that two separate
162        calls to Host.get_boot_id() return the same string if the host did
163        not reboot between the two calls, and two different strings if it
164        has rebooted at least once between the two calls.
165
166        @param timeout The number of seconds to wait before timing out.
167
168        @return A string unique to this boot."""
169        return self.run('cat /proc/sys/kernel/random/boot_id',
170                        timeout=timeout).stdout.strip()
171
172
    def wait_up(self, timeout=None):
        """
        Abstract hook for waiting until the host is up.

        wait_for_restart() treats a true return value as "the host came
        back"; implementations are expected to return accordingly.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Wait up not implemented!')
175
176
    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """
        Abstract hook for waiting until the host goes down.

        wait_for_restart() treats a false return value as "the host did not
        shut down"; implementations are expected to return accordingly.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Wait down not implemented!')
179
180
181    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
182                         log_failure=True, old_boot_id=None, **dargs):
183        """ Wait for the host to come back from a reboot. This is a generic
184        implementation based entirely on wait_up and wait_down. """
185        if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT,
186                              warning_timer=self.WAIT_DOWN_REBOOT_WARNING,
187                              old_boot_id=old_boot_id):
188            if log_failure:
189                self.record("ABORT", None, "reboot.verify", "shut down failed")
190            raise error.AutoservShutdownError("Host did not shut down")
191
192        self.wait_up(timeout)
193        time.sleep(2)    # this is needed for complete reliability
194        if self.wait_up(timeout):
195            self.record("GOOD", None, "reboot.verify")
196            self.reboot_followup(**dargs)
197        else:
198            self.record("ABORT", None, "reboot.verify",
199                        "Host did not return from reboot")
200            raise error.AutoservRebootError("Host did not return from reboot")
201
202
203    def verify(self):
204        self.verify_hardware()
205        self.verify_connectivity()
206        self.verify_software()
207
208
    def verify_hardware(self):
        """
        No-op hardware check in the base Host; called first by verify().

        Per the class docstring, subclasses overriding a NOP method should
        chain the call to super().
        """
        pass
211
212
    def verify_connectivity(self):
        """
        No-op connectivity check in the base Host; called second by verify().

        Per the class docstring, subclasses overriding a NOP method should
        chain the call to super().
        """
        pass
215
216
    def verify_software(self):
        """
        No-op software check in the base Host; called last by verify().

        Per the class docstring, subclasses overriding a NOP method should
        chain the call to super().
        """
        pass
219
220
221    def check_diskspace(self, path, gb):
222        logging.info('Checking for >= %s GB of space under %s on machine %s',
223                     gb, path, self.hostname)
224        df = self.run('df -mP %s | tail -1' % path).stdout.split()
225        free_space_gb = int(df[3])/1000.0
226        if free_space_gb < gb:
227            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
228        else:
229            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
230                free_space_gb, gb, path, self.hostname)
231
232
233    def get_open_func(self, use_cache=True):
234        """
235        Defines and returns a function that may be used instead of built-in
236        open() to open and read files. The returned function is implemented
237        by using self.run('cat <file>') and may cache the results for the same
238        filename.
239
240        @param use_cache Cache results of self.run('cat <filename>') for the
241            same filename
242
243        @return a function that can be used instead of built-in open()
244        """
245        cached_files = {}
246
247        def open_func(filename):
248            if not use_cache or filename not in cached_files:
249                output = self.run('cat \'%s\'' % filename,
250                                  stdout_tee=open('/dev/null', 'w')).stdout
251                fd = cStringIO.StringIO(output)
252
253                if not use_cache:
254                    return fd
255
256                cached_files[filename] = fd
257            else:
258                cached_files[filename].seek(0)
259
260            return cached_files[filename]
261
262        return open_func
263
264
265    def check_partitions(self, root_part, filter_func=None):
266        """ Compare the contents of /proc/partitions with those of
267        /proc/mounts and raise exception in case unmounted partitions are found
268
269        root_part: in Linux /proc/mounts will never directly mention the root
270        partition as being mounted on / instead it will say that /dev/root is
271        mounted on /. Thus require this argument to filter out the root_part
272        from the ones checked to be mounted
273
274        filter_func: unnary predicate for additional filtering out of
275        partitions required to be mounted
276
277        Raise: error.AutoservHostError if unfiltered unmounted partition found
278        """
279
280        print 'Checking if non-swap partitions are mounted...'
281
282        unmounted = partition.get_unmounted_partition_list(root_part,
283            filter_func=filter_func, open_func=self.get_open_func())
284        if unmounted:
285            raise error.AutoservNotMountedHostError(
286                'Found unmounted partitions: %s' %
287                [part.device for part in unmounted])
288
289
290    def _repair_wait_for_reboot(self):
291        TIMEOUT = int(self.HOURS_TO_WAIT_FOR_RECOVERY * 3600)
292        if self.is_shutting_down():
293            logging.info('Host is shutting down, waiting for a restart')
294            self.wait_for_restart(TIMEOUT)
295        else:
296            self.wait_up(TIMEOUT)
297
298
299    def _get_mountpoint(self, path):
300        """Given a "path" get the mount point of the filesystem containing
301        that path."""
302        code = ('import os\n'
303                # sanitize the path and resolve symlinks
304                'path = os.path.realpath(%r)\n'
305                "while path != '/' and not os.path.ismount(path):\n"
306                '    path, _ = os.path.split(path)\n'
307                'print path\n') % path
308        return self.run('python -c "%s"' % code,
309                        stdout_tee=open(os.devnull, 'w')).stdout.rstrip()
310
311
312    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
313        """Empty a given directory path contents."""
314        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
315        self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout)
316        self._removed_files = True
317
318
319    def repair_full_disk(self, mountpoint):
320        # it's safe to remove /tmp and /var/tmp, site specific overrides may
321        # want to remove some other places too
322        if mountpoint == self._get_mountpoint('/tmp'):
323            self.erase_dir_contents('/tmp')
324
325        if mountpoint == self._get_mountpoint('/var/tmp'):
326            self.erase_dir_contents('/var/tmp')
327
328
329    def _call_repair_func(self, err, func, *args, **dargs):
330        for old_call in self._already_repaired:
331            if old_call == (func, args, dargs):
332                # re-raising the original exception because surrounding
333                # error handling may want to try other ways to fix it
334                logging.warn('Already done this (%s) repair procedure, '
335                             're-raising the original exception.', func)
336                raise err
337
338        try:
339            func(*args, **dargs)
340        except error.AutoservHardwareRepairRequestedError:
341            # let this special exception propagate
342            raise
343        except error.AutoservError:
344            logging.exception('Repair failed but continuing in case it managed'
345                              ' to repair enough')
346
347        self._already_repaired.append((func, args, dargs))
348
349
    def repair_filesystem_only(self):
        """
        Perform file system repairs only.

        Repeatedly runs verify() and reacts to the two failures that can be
        repaired at this level: a host that is shutting down (wait for it)
        and a full disk (free up space). Any other exception raised by
        verify() propagates to the caller. The loop ends when verify()
        succeeds, or when _call_repair_func() re-raises the error because
        the same repair was already attempted.
        """
        while True:
            # try to repair specific problems
            try:
                logging.info('Running verify to find failures to repair...')
                self.verify()
                # a reboot is only issued if an earlier repair pass actually
                # deleted files (see erase_dir_contents)
                if self._removed_files:
                    logging.info('Removed files, rebooting to release the'
                                 ' inodes')
                    self.reboot()
                return # verify succeeded, then repair succeeded
            except error.AutoservHostIsShuttingDownError, err:
                logging.exception('verify failed')
                self._call_repair_func(err, self._repair_wait_for_reboot)
            except error.AutoservDiskFullHostError, err:
                logging.exception('verify failed')
                # repair the filesystem that actually holds the full path
                self._call_repair_func(err, self.repair_full_disk,
                                       self._get_mountpoint(err.path))
369
370
371    def repair_software_only(self):
372        """perform software repairs only"""
373        while True:
374            try:
375                self.repair_filesystem_only()
376                break
377            except (error.AutoservSshPingHostError, error.AutoservSSHTimeout,
378                    error.AutoservSshPermissionDeniedError,
379                    error.AutoservDiskFullHostError), err:
380                logging.exception('verify failed')
381                logging.info('Trying to reinstall the machine')
382                self._call_repair_func(err, self.machine_install)
383
384
385    def repair_full(self):
386        while True:
387            try:
388                self.repair_software_only()
389                break
390            except error.AutoservHardwareHostError, err:
391                logging.exception('verify failed')
392                # software repair failed, try hardware repair
393                logging.info('Hardware problem found, '
394                             'requesting hardware repairs')
395                self._call_repair_func(err, self.request_hardware_repair)
396
397
398    def repair_with_protection(self, protection_level):
399        """Perform the maximal amount of repair within the specified
400        protection level.
401
402        @param protection_level: the protection level to use for limiting
403                                 repairs, a host_protections.Protection
404        """
405        protection = host_protections.Protection
406        if protection_level == protection.DO_NOT_REPAIR:
407            logging.info('Protection is "Do not repair" so just verifying')
408            self.verify()
409        elif protection_level == protection.REPAIR_FILESYSTEM_ONLY:
410            logging.info('Attempting filesystem-only repair')
411            self.repair_filesystem_only()
412        elif protection_level == protection.REPAIR_SOFTWARE_ONLY:
413            logging.info('Attempting software repair only')
414            self.repair_software_only()
415        elif protection_level == protection.NO_PROTECTION:
416            logging.info('Attempting full repair')
417            self.repair_full()
418        else:
419            raise NotImplementedError('Unknown host protection level %s'
420                                      % protection_level)
421
422
423    def disable_ipfilters(self):
424        """Allow all network packets in and out of the host."""
425        self.run('iptables-save > /tmp/iptable-rules')
426        self.run('iptables -P INPUT ACCEPT')
427        self.run('iptables -P FORWARD ACCEPT')
428        self.run('iptables -P OUTPUT ACCEPT')
429
430
431    def enable_ipfilters(self):
432        """Re-enable the IP filters disabled from disable_ipfilters()"""
433        if os.path.isfile('/tmp/iptable-rules'):
434            self.run('iptables-restore < /tmp/iptable-rules')
435
436
    def cleanup(self):
        """
        No-op hook in the base Host.

        Per the class docstring, subclasses overriding a NOP method should
        chain the call to super().
        """
        pass
439
440
    def machine_install(self):
        """
        Abstract hook for (re)installing the machine.

        repair_software_only() uses it as its reinstall step.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Machine install not implemented!')
443
444
    def install(self, installableObject):
        """
        Install something on this host.

        @param installableObject: object whose install() method is called
                with this host as its only argument
        """
        installableObject.install(self)
447
448
    def get_autodir(self):
        """
        Abstract getter for the host's autodir.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Get autodir not implemented!')
451
452
    def set_autodir(self):
        """
        Abstract setter for the host's autodir.

        NOTE(review): the base signature takes no value to set; confirm
        what signature the implementing subclasses use.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Set autodir not implemented!')
455
456
    def start_loggers(self):
        """
        Called to start continuous host logging.

        No-op in the base Host; per the class docstring, subclasses
        overriding a NOP method should chain the call to super().
        """
        pass
460
461
    def stop_loggers(self):
        """
        Called to stop continuous host logging.

        No-op in the base Host; per the class docstring, subclasses
        overriding a NOP method should chain the call to super().
        """
        pass
465
466
467    # some extra methods simplify the retrieval of information about the
468    # Host machine, with generic implementations based on run(). subclasses
469    # should feel free to override these if they can provide better
470    # implementations for their specific Host types
471
472    def get_num_cpu(self):
473        """ Get the number of CPUs in the host according to /proc/cpuinfo. """
474        proc_cpuinfo = self.run('cat /proc/cpuinfo',
475                                stdout_tee=open(os.devnull, 'w')).stdout
476        cpus = 0
477        for line in proc_cpuinfo.splitlines():
478            if line.startswith('processor'):
479                cpus += 1
480        return cpus
481
482
483    def get_arch(self):
484        """ Get the hardware architecture of the remote machine. """
485        arch = self.run('/bin/uname -m').stdout.rstrip()
486        if re.match(r'i\d86$', arch):
487            arch = 'i386'
488        return arch
489
490
491    def get_kernel_ver(self):
492        """ Get the kernel version of the remote machine. """
493        return self.run('/bin/uname -r').stdout.rstrip()
494
495
496    def get_cmdline(self):
497        """ Get the kernel command line of the remote machine. """
498        return self.run('cat /proc/cmdline').stdout.rstrip()
499
500
501    def path_exists(self, path):
502        """ Determine if path exists on the remote machine. """
503        result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path),
504                          ignore_status=True)
505        return result.exit_status == 0
506
507
508    # some extra helpers for doing job-related operations
509
510    def record(self, *args, **dargs):
511        """ Helper method for recording status logs against Host.job that
512        silently becomes a NOP if Host.job is not available. The args and
513        dargs are passed on to Host.job.record unchanged. """
514        if self.job:
515            self.job.record(*args, **dargs)
516
517
518    def log_kernel(self):
519        """ Helper method for logging kernel information into the status logs.
520        Intended for cases where the "current" kernel is not really defined
521        and we want to explicitly log it. Does nothing if this host isn't
522        actually associated with a job. """
523        if self.job:
524            kernel = self.get_kernel_ver()
525            self.job.record("INFO", None, None,
526                            optional_fields={"kernel": kernel})
527
528
529    def log_reboot(self, reboot_func):
530        """ Decorator for wrapping a reboot in a group for status
531        logging purposes. The reboot_func parameter should be an actual
532        function that carries out the reboot.
533        """
534        if self.job and not hasattr(self, "RUNNING_LOG_REBOOT"):
535            self.RUNNING_LOG_REBOOT = True
536            try:
537                self.job.run_reboot(reboot_func, self.get_kernel_ver)
538            finally:
539                del self.RUNNING_LOG_REBOOT
540        else:
541            reboot_func()
542
543
    def request_hardware_repair(self):
        """
        Should somehow request (send a mail?) for hardware repairs on
        this machine. The implementation can either return by raising the
        special error.AutoservHardwareRepairRequestedError exception or can
        try to wait until the machine is repaired and then return normally.

        Abstract in Host; repair_full() uses it as its hardware repair step.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError("request_hardware_repair not implemented")
551
552
553    def list_files_glob(self, glob):
554        """
555        Get a list of files on a remote host given a glob pattern path.
556        """
557        SCRIPT = ("python -c 'import cPickle, glob, sys;"
558                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
559        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
560                          timeout=60).stdout
561        return cPickle.loads(output)
562
563
564    def symlink_closure(self, paths):
565        """
566        Given a sequence of path strings, return the set of all paths that
567        can be reached from the initial set by following symlinks.
568
569        @param paths: sequence of path strings.
570        @return: a sequence of path strings that are all the unique paths that
571                can be reached from the given ones after following symlinks.
572        """
573        SCRIPT = ("python -c 'import cPickle, os, sys\n"
574                  "paths = cPickle.load(sys.stdin)\n"
575                  "closure = {}\n"
576                  "while paths:\n"
577                  "    path = paths.keys()[0]\n"
578                  "    del paths[path]\n"
579                  "    if not os.path.exists(path):\n"
580                  "        continue\n"
581                  "    closure[path] = None\n"
582                  "    if os.path.islink(path):\n"
583                  "        link_to = os.path.join(os.path.dirname(path),\n"
584                  "                               os.readlink(path))\n"
585                  "        if link_to not in closure.keys():\n"
586                  "            paths[link_to] = None\n"
587                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
588        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
589        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
590                          timeout=60).stdout
591        return cPickle.loads(output)
592
593
594    def cleanup_kernels(self, boot_dir='/boot'):
595        """
596        Remove any kernel image and associated files (vmlinux, system.map,
597        modules) for any image found in the boot directory that is not
598        referenced by entries in the bootloader configuration.
599
600        @param boot_dir: boot directory path string, default '/boot'
601        """
602        # find all the vmlinuz images referenced by the bootloader
603        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
604        boot_info = self.bootloader.get_entries()
605        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
606                        for boot in boot_info.itervalues()]
607
608        # find all the unused vmlinuz images in /boot
609        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
610        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
611                                            for kernver in used_kernver)
612        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)
613
614        # find all the unused vmlinux images in /boot
615        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
616        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
617        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
618                                            for kernver in used_kernver)
619        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)
620
621        # find all the unused System.map files in /boot
622        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
623        all_system_map = self.list_files_glob(systemmap_prefix + '*')
624        used_system_map = self.symlink_closure(
625            systemmap_prefix + kernver for kernver in used_kernver)
626        unused_system_map = set(all_system_map) - set(used_system_map)
627
628        # find all the module directories associated with unused kernels
629        modules_prefix = '/lib/modules/'
630        all_moddirs = [dir for dir in self.list_files_glob(modules_prefix + '*')
631                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', dir)]
632        used_moddirs = self.symlink_closure(modules_prefix + kernver
633                                            for kernver in used_kernver)
634        unused_moddirs = set(all_moddirs) - set(used_moddirs)
635
636        # remove all the vmlinuz files we don't use
637        # TODO: if needed this should become package manager agnostic
638        for vmlinuz in unused_vmlinuz:
639            # try and get an rpm package name
640            rpm = self.run('rpm -qf', args=(vmlinuz,),
641                           ignore_status=True, timeout=120)
642            if rpm.exit_status == 0:
643                packages = set(line.strip() for line in
644                               rpm.stdout.splitlines())
645                # if we found some package names, try to remove them
646                for package in packages:
647                    self.run('rpm -e', args=(package,),
648                             ignore_status=True, timeout=120)
649            # remove the image files anyway, even if rpm didn't
650            self.run('rm -f', args=(vmlinuz,),
651                     ignore_status=True, timeout=120)
652
653        # remove all the vmlinux and System.map files left over
654        for f in (unused_vmlinux | unused_system_map):
655            self.run('rm -f', args=(f,),
656                     ignore_status=True, timeout=120)
657
658        # remove all unused module directories
659        # the regex match should keep us safe from removing the wrong files
660        for moddir in unused_moddirs:
661            self.run('rm -fr', args=(moddir,), ignore_status=True)
662