base_classes.py revision c035491ba24efea9e4343982f0a7c4b92e0a8c72
1# Copyright 2009 Google Inc. Released under the GPL v2
2
3"""
4This module defines the base classes for the Host hierarchy.
5
6Implementation details:
7You should import the "hosts" package instead of importing each type of host.
8
9        Host: a machine on which you can run programs
10"""
11
12__author__ = """
13mbligh@google.com (Martin J. Bligh),
14poirier@google.com (Benjamin Poirier),
15stutsman@google.com (Ryan Stutsman)
16"""
17
18import cPickle, cStringIO, logging, os, re, time
19
20from autotest_lib.client.common_lib import global_config, error, utils
21from autotest_lib.client.common_lib import host_protections
22from autotest_lib.client.bin import partition
23
24
class Host(object):
    """
    This class represents a machine on which you can run programs.

    It may be a local machine, the one autoserv is running on, a remote
    machine or a virtual machine.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here. You must not instantiate this class but should
    instantiate one of those leaf subclasses.

    When overriding methods that raise NotImplementedError, the leaf class
    is fully responsible for the implementation and should not chain calls
    to super. When overriding methods that are a NOP in Host, the subclass
    should chain calls to super(). The criteria for fitting a new method into
    one category or the other should be:
        1. If two separate generic implementations could reasonably be
           concatenated, then the abstract implementation should pass and
           subclasses should chain calls to super.
        2. If only one class could reasonably perform the stated function
           (e.g. two separate run() implementations cannot both be executed)
           then the method should raise NotImplementedError in Host, and
           the implementor should NOT chain calls to super, to ensure that
           only one implementation ever gets executed.
    """

    # Job that status is reported to; record(), log_kernel() and log_reboot()
    # skip their job interaction while this is None (or otherwise false).
    job = None
    # Default timeout that wait_for_restart() hands to wait_up(); seconds,
    # like the hours * 3600 value _repair_wait_for_reboot() passes instead.
    DEFAULT_REBOOT_TIMEOUT = 1800
    # Passed by wait_for_restart() to wait_down() as its timeout.
    WAIT_DOWN_REBOOT_TIMEOUT = 840
    # Passed by wait_for_restart() to wait_down() as its warning_timer.
    WAIT_DOWN_REBOOT_WARNING = 540
    # How long _repair_wait_for_reboot() waits for the host, in hours; it is
    # multiplied by 3600 before being used as a timeout.
    HOURS_TO_WAIT_FOR_RECOVERY = 2.5
57
58
    def __init__(self, *args, **dargs):
        """
        Create the host by forwarding every argument to _initialize().

        @param args: positional arguments handed to _initialize() unchanged
        @param dargs: keyword arguments handed to _initialize() unchanged
        """
        self._initialize(*args, **dargs)
61
62
63    def _initialize(self, *args, **dargs):
64        self._already_repaired = []
65        self._removed_files = False
66
67
68    def close(self):
69        pass
70
71
72    def setup(self):
73        pass
74
75
    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        Abstract in Host: this base implementation only raises
        NotImplementedError. The helpers of this class that call self.run()
        therefore depend on a subclass providing it.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee/stderr_tee: where to tee the stdout/stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Run not implemented!')
100
101
102    def run_output(self, command, *args, **dargs):
103        return self.run(command, *args, **dargs).stdout.rstrip()
104
105
    def reboot(self):
        """
        Abstract: subclasses must provide the reboot implementation.

        repair_filesystem_only() calls this after files have been removed.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Reboot not implemented!')
108
109
    def sysrq_reboot(self):
        """
        Abstract: subclasses must provide the implementation.

        NOTE(review): presumably reboots the host through the kernel's
        SysRq mechanism -- confirm against an implementing subclass.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Sysrq reboot not implemented!')
112
113
114    def reboot_setup(self, *args, **dargs):
115        pass
116
117
118    def reboot_followup(self, *args, **dargs):
119        pass
120
121
    def get_file(self, source, dest, delete_dest=False):
        """
        Abstract: subclasses must provide the implementation.

        NOTE(review): presumably copies source from this host to dest on the
        machine running this code, with delete_dest controlling what happens
        to content already at the destination -- confirm against an
        implementing subclass.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Get file not implemented!')
124
125
    def send_file(self, source, dest, delete_dest=False):
        """
        Abstract: subclasses must provide the implementation.

        NOTE(review): presumably copies source from the machine running this
        code to dest on this host, with delete_dest controlling what happens
        to content already at the destination -- confirm against an
        implementing subclass.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Send file not implemented!')
128
129
    def get_tmp_dir(self):
        """
        Abstract: subclasses must provide the implementation.

        NOTE(review): presumably returns the path of a temporary directory
        on this host -- confirm against an implementing subclass.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Get temp dir not implemented!')
132
133
    def is_up(self):
        """
        Abstract: subclasses must provide the implementation.

        NOTE(review): presumably reports whether the host is currently
        reachable -- confirm against an implementing subclass.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Is up not implemented!')
136
137
138    def is_shutting_down(self):
139        """ Indicates is a machine is currently shutting down. """
140        runlevel = int(self.run("runlevel").stdout.strip().split()[1])
141        return runlevel in (0, 6)
142
143
144    def get_wait_up_processes(self):
145        """ Gets the list of local processes to wait for in wait_up. """
146        get_config = global_config.global_config.get_config_value
147        proc_list = get_config("HOSTS", "wait_up_processes",
148                               default="").strip()
149        processes = set(p.strip() for p in proc_list.split(","))
150        processes.discard("")
151        return processes
152
153
154    def get_boot_id(self, timeout=60):
155        """ Get a unique ID associated with the current boot.
156
157        Should return a string with the semantics such that two separate
158        calls to Host.get_boot_id() return the same string if the host did
159        not reboot between the two calls, and two different strings if it
160        has rebooted at least once between the two calls.
161
162        @param timeout The number of seconds to wait before timing out.
163
164        @return A string unique to this boot."""
165        return self.run('cat /proc/sys/kernel/random/boot_id',
166                        timeout=timeout).stdout.strip()
167
168
    def wait_up(self, timeout=None):
        """
        Abstract: subclasses must provide the implementation.

        wait_for_restart() treats a true return value as "the host is up"
        and a false one as "the host did not return from reboot".

        @param timeout: how long to wait; the callers in this class pass
                seconds

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Wait up not implemented!')
171
172
    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """
        Abstract: subclasses must provide the implementation.

        wait_for_restart() treats a false return value as "the host did not
        shut down".

        NOTE(review): the exact meaning of warning_timer and old_boot_id is
        defined by the implementing subclass; old_boot_id is presumably a
        value previously obtained from get_boot_id() -- confirm.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Wait down not implemented!')
175
176
177    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
178                         log_failure=True, old_boot_id=None, **dargs):
179        """ Wait for the host to come back from a reboot. This is a generic
180        implementation based entirely on wait_up and wait_down. """
181        if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT,
182                              warning_timer=self.WAIT_DOWN_REBOOT_WARNING,
183                              old_boot_id=old_boot_id):
184            if log_failure:
185                self.record("ABORT", None, "reboot.verify", "shut down failed")
186            raise error.AutoservShutdownError("Host did not shut down")
187
188        self.wait_up(timeout)
189        time.sleep(2)    # this is needed for complete reliability
190        if self.wait_up(timeout):
191            self.record("GOOD", None, "reboot.verify")
192            self.reboot_followup(**dargs)
193        else:
194            self.record("ABORT", None, "reboot.verify",
195                        "Host did not return from reboot")
196            raise error.AutoservRebootError("Host did not return from reboot")
197
198
199    def verify(self):
200        self.verify_hardware()
201        self.verify_connectivity()
202        self.verify_software()
203
204
205    def verify_hardware(self):
206        pass
207
208
209    def verify_connectivity(self):
210        pass
211
212
213    def verify_software(self):
214        pass
215
216
217    def check_diskspace(self, path, gb):
218        logging.info('Checking for >= %s GB of space under %s on machine %s',
219                     gb, path, self.hostname)
220        df = self.run('df -mP %s | tail -1' % path).stdout.split()
221        free_space_gb = int(df[3])/1000.0
222        if free_space_gb < gb:
223            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
224        else:
225            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
226                free_space_gb, gb, path, self.hostname)
227
228
229    def get_open_func(self, use_cache=True):
230        """
231        Defines and returns a function that may be used instead of built-in
232        open() to open and read files. The returned function is implemented
233        by using self.run('cat <file>') and may cache the results for the same
234        filename.
235
236        @param use_cache Cache results of self.run('cat <filename>') for the
237            same filename
238
239        @return a function that can be used instead of built-in open()
240        """
241        cached_files = {}
242
243        def open_func(filename):
244            if not use_cache or filename not in cached_files:
245                output = self.run('cat \'%s\'' % filename,
246                                  stdout_tee=open('/dev/null', 'w')).stdout
247                fd = cStringIO.StringIO(output)
248
249                if not use_cache:
250                    return fd
251
252                cached_files[filename] = fd
253            else:
254                cached_files[filename].seek(0)
255
256            return cached_files[filename]
257
258        return open_func
259
260
261    def check_partitions(self, root_part, filter_func=None):
262        """ Compare the contents of /proc/partitions with those of
263        /proc/mounts and raise exception in case unmounted partitions are found
264
265        root_part: in Linux /proc/mounts will never directly mention the root
266        partition as being mounted on / instead it will say that /dev/root is
267        mounted on /. Thus require this argument to filter out the root_part
268        from the ones checked to be mounted
269
270        filter_func: unnary predicate for additional filtering out of
271        partitions required to be mounted
272
273        Raise: error.AutoservHostError if unfiltered unmounted partition found
274        """
275
276        print 'Checking if non-swap partitions are mounted...'
277
278        unmounted = partition.get_unmounted_partition_list(root_part,
279            filter_func=filter_func, open_func=self.get_open_func())
280        if unmounted:
281            raise error.AutoservNotMountedHostError(
282                'Found unmounted partitions: %s' %
283                [part.device for part in unmounted])
284
285
286    def _repair_wait_for_reboot(self):
287        TIMEOUT = int(self.HOURS_TO_WAIT_FOR_RECOVERY * 3600)
288        if self.is_shutting_down():
289            logging.info('Host is shutting down, waiting for a restart')
290            self.wait_for_restart(TIMEOUT)
291        else:
292            self.wait_up(TIMEOUT)
293
294
295    def _get_mountpoint(self, path):
296        """Given a "path" get the mount point of the filesystem containing
297        that path."""
298        code = ('import os\n'
299                # sanitize the path and resolve symlinks
300                'path = os.path.realpath(%r)\n'
301                "while path != '/' and not os.path.ismount(path):\n"
302                '    path, _ = os.path.split(path)\n'
303                'print path\n') % path
304        return self.run('python -c "%s"' % code,
305                        stdout_tee=open(os.devnull, 'w')).stdout.rstrip()
306
307
308    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
309        """Empty a given directory path contents."""
310        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
311        self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout)
312        self._removed_files = True
313
314
315    def repair_full_disk(self, mountpoint):
316        # it's safe to remove /tmp and /var/tmp, site specific overrides may
317        # want to remove some other places too
318        if mountpoint == self._get_mountpoint('/tmp'):
319            self.erase_dir_contents('/tmp')
320
321        if mountpoint == self._get_mountpoint('/var/tmp'):
322            self.erase_dir_contents('/var/tmp')
323
324
325    def _call_repair_func(self, err, func, *args, **dargs):
326        for old_call in self._already_repaired:
327            if old_call == (func, args, dargs):
328                # re-raising the original exception because surrounding
329                # error handling may want to try other ways to fix it
330                logging.warn('Already done this (%s) repair procedure, '
331                             're-raising the original exception.', func)
332                raise err
333
334        try:
335            func(*args, **dargs)
336        except error.AutoservHardwareRepairRequestedError:
337            # let this special exception propagate
338            raise
339        except error.AutoservError:
340            logging.exception('Repair failed but continuing in case it managed'
341                              ' to repair enough')
342
343        self._already_repaired.append((func, args, dargs))
344
345
    def repair_filesystem_only(self):
        """
        perform file system repairs only

        Runs verify() in a loop. A host that is shutting down is waited
        for, a full disk is cleaned up with repair_full_disk(), and verify()
        is then tried again. Any other exception propagates to the caller.

        The loop ends when verify() succeeds, or when _call_repair_func()
        re-raises the error because the same repair was already attempted.
        """
        while True:
            # try to repair specific problems
            try:
                logging.info('Running verify to find failures to repair...')
                self.verify()
                # erase_dir_contents() sets this flag
                if self._removed_files:
                    logging.info('Removed files, rebooting to release the'
                                 ' inodes')
                    # inside the try: errors raised by the reboot are
                    # handled by the except clauses below as well
                    self.reboot()
                return # verify succeeded, then repair succeeded
            except error.AutoservHostIsShuttingDownError, err:
                logging.exception('verify failed')
                self._call_repair_func(err, self._repair_wait_for_reboot)
            except error.AutoservDiskFullHostError, err:
                logging.exception('verify failed')
                # clean up the filesystem that holds the reported path
                self._call_repair_func(err, self.repair_full_disk,
                                       self._get_mountpoint(err.path))
365
366
    def repair_software_only(self):
        """
        perform software repairs only

        Runs repair_filesystem_only() in a loop; when it fails with one of
        the ssh or disk-full errors listed below, the machine is reinstalled
        with machine_install() and the repair is tried again. Any other
        exception propagates to the caller.

        The loop ends when the filesystem repair succeeds, or when
        _call_repair_func() re-raises the error because the reinstall was
        already attempted.
        """
        while True:
            try:
                self.repair_filesystem_only()
                break
            except (error.AutoservSshPingHostError, error.AutoservSSHTimeout,
                    error.AutoservSshPermissionDeniedError,
                    error.AutoservDiskFullHostError), err:
                logging.exception('verify failed')
                logging.info('Trying to reinstall the machine')
                self._call_repair_func(err, self.machine_install)
379
380
    def repair_full(self):
        """
        Perform software repairs and, if they hit a hardware problem,
        request hardware repairs.

        Runs repair_software_only() in a loop; when it fails with
        error.AutoservHardwareHostError, request_hardware_repair() is called
        and the repair is tried again. Any other exception propagates.

        The loop ends when the software repair succeeds, or when
        _call_repair_func() raises (the hardware repair request was already
        made, or it raised the special "repair requested" exception).
        """
        while True:
            try:
                self.repair_software_only()
                break
            except error.AutoservHardwareHostError, err:
                logging.exception('verify failed')
                # software repair failed, try hardware repair
                logging.info('Hardware problem found, '
                             'requesting hardware repairs')
                self._call_repair_func(err, self.request_hardware_repair)
392
393
394    def repair_with_protection(self, protection_level):
395        """Perform the maximal amount of repair within the specified
396        protection level.
397
398        @param protection_level: the protection level to use for limiting
399                                 repairs, a host_protections.Protection
400        """
401        protection = host_protections.Protection
402        if protection_level == protection.DO_NOT_REPAIR:
403            logging.info('Protection is "Do not repair" so just verifying')
404            self.verify()
405        elif protection_level == protection.REPAIR_FILESYSTEM_ONLY:
406            logging.info('Attempting filesystem-only repair')
407            self.repair_filesystem_only()
408        elif protection_level == protection.REPAIR_SOFTWARE_ONLY:
409            logging.info('Attempting software repair only')
410            self.repair_software_only()
411        elif protection_level == protection.NO_PROTECTION:
412            logging.info('Attempting full repair')
413            self.repair_full()
414        else:
415            raise NotImplementedError('Unknown host protection level %s'
416                                      % protection_level)
417
418
419    def cleanup(self):
420        pass
421
422
    def machine_install(self):
        """
        Abstract: subclasses must provide the implementation.

        repair_software_only() calls this to reinstall the machine when the
        filesystem-level repair fails.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Machine install not implemented!')
425
426
    def install(self, installableObject):
        """
        Install something on this host by delegating to the object itself.

        @param installableObject: any object with an install(host) method;
                it is called with this host as its only argument
        """
        installableObject.install(self)
429
430
    def get_autodir(self):
        """
        Abstract: subclasses must provide the implementation.

        NOTE(review): presumably returns the autotest directory of this
        host -- confirm against an implementing subclass.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Get autodir not implemented!')
433
434
    def set_autodir(self):
        """
        Abstract: subclasses must provide the implementation.

        NOTE(review): presumably the setter matching get_autodir(); this
        base signature takes no value to set -- confirm against an
        implementing subclass.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Set autodir not implemented!')
437
438
439    def start_loggers(self):
440        """ Called to start continuous host logging. """
441        pass
442
443
444    def stop_loggers(self):
445        """ Called to stop continuous host logging. """
446        pass
447
448
449    # some extra methods simplify the retrieval of information about the
450    # Host machine, with generic implementations based on run(). subclasses
451    # should feel free to override these if they can provide better
452    # implementations for their specific Host types
453
454    def get_num_cpu(self):
455        """ Get the number of CPUs in the host according to /proc/cpuinfo. """
456        proc_cpuinfo = self.run('cat /proc/cpuinfo',
457                                stdout_tee=open(os.devnull, 'w')).stdout
458        cpus = 0
459        for line in proc_cpuinfo.splitlines():
460            if line.startswith('processor'):
461                cpus += 1
462        return cpus
463
464
465    def get_arch(self):
466        """ Get the hardware architecture of the remote machine. """
467        arch = self.run('/bin/uname -m').stdout.rstrip()
468        if re.match(r'i\d86$', arch):
469            arch = 'i386'
470        return arch
471
472
473    def get_kernel_ver(self):
474        """ Get the kernel version of the remote machine. """
475        return self.run('/bin/uname -r').stdout.rstrip()
476
477
478    def get_cmdline(self):
479        """ Get the kernel command line of the remote machine. """
480        return self.run('cat /proc/cmdline').stdout.rstrip()
481
482
483    def path_exists(self, path):
484        """ Determine if path exists on the remote machine. """
485        result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path),
486                          ignore_status=True)
487        return result.exit_status == 0
488
489
490    # some extra helpers for doing job-related operations
491
492    def record(self, *args, **dargs):
493        """ Helper method for recording status logs against Host.job that
494        silently becomes a NOP if Host.job is not available. The args and
495        dargs are passed on to Host.job.record unchanged. """
496        if self.job:
497            self.job.record(*args, **dargs)
498
499
500    def log_kernel(self):
501        """ Helper method for logging kernel information into the status logs.
502        Intended for cases where the "current" kernel is not really defined
503        and we want to explicitly log it. Does nothing if this host isn't
504        actually associated with a job. """
505        if self.job:
506            kernel = self.get_kernel_ver()
507            self.job.record("INFO", None, None,
508                            optional_fields={"kernel": kernel})
509
510
511    def log_reboot(self, reboot_func):
512        """ Decorator for wrapping a reboot in a group for status
513        logging purposes. The reboot_func parameter should be an actual
514        function that carries out the reboot.
515        """
516        if self.job and not hasattr(self, "RUNNING_LOG_REBOOT"):
517            self.RUNNING_LOG_REBOOT = True
518            try:
519                self.job.run_reboot(reboot_func, self.get_kernel_ver)
520            finally:
521                del self.RUNNING_LOG_REBOOT
522        else:
523            reboot_func()
524
525
    def request_hardware_repair(self):
        """ Should somehow request (send a mail?) for hardware repairs on
        this machine. The implementation can either return by raising the
        special error.AutoservHardwareRepairRequestedError exception or can
        try to wait until the machine is repaired and then return normally.

        Abstract in Host. repair_full() calls this through
        _call_repair_func(), which lets that special exception propagate.

        @raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError("request_hardware_repair not implemented")
533
534
535    def list_files_glob(self, glob):
536        """
537        Get a list of files on a remote host given a glob pattern path.
538        """
539        SCRIPT = ("python -c 'import cPickle, glob, sys;"
540                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
541        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
542                          timeout=60).stdout
543        return cPickle.loads(output)
544
545
546    def symlink_closure(self, paths):
547        """
548        Given a sequence of path strings, return the set of all paths that
549        can be reached from the initial set by following symlinks.
550
551        @param paths: sequence of path strings.
552        @return: a sequence of path strings that are all the unique paths that
553                can be reached from the given ones after following symlinks.
554        """
555        SCRIPT = ("python -c 'import cPickle, os, sys\n"
556                  "paths = cPickle.load(sys.stdin)\n"
557                  "closure = {}\n"
558                  "while paths:\n"
559                  "    path = paths.keys()[0]\n"
560                  "    del paths[path]\n"
561                  "    if not os.path.exists(path):\n"
562                  "        continue\n"
563                  "    closure[path] = None\n"
564                  "    if os.path.islink(path):\n"
565                  "        link_to = os.path.join(os.path.dirname(path),\n"
566                  "                               os.readlink(path))\n"
567                  "        if link_to not in closure.keys():\n"
568                  "            paths[link_to] = None\n"
569                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
570        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
571        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
572                          timeout=60).stdout
573        return cPickle.loads(output)
574
575
576    def cleanup_kernels(self, boot_dir='/boot'):
577        """
578        Remove any kernel image and associated files (vmlinux, system.map,
579        modules) for any image found in the boot directory that is not
580        referenced by entries in the bootloader configuration.
581
582        @param boot_dir: boot directory path string, default '/boot'
583        """
584        # find all the vmlinuz images referenced by the bootloader
585        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
586        boot_info = self.bootloader.get_entries()
587        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
588                        for boot in boot_info.itervalues()]
589
590        # find all the unused vmlinuz images in /boot
591        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
592        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
593                                            for kernver in used_kernver)
594        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)
595
596        # find all the unused vmlinux images in /boot
597        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
598        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
599        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
600                                            for kernver in used_kernver)
601        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)
602
603        # find all the unused System.map files in /boot
604        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
605        all_system_map = self.list_files_glob(systemmap_prefix + '*')
606        used_system_map = self.symlink_closure(
607            systemmap_prefix + kernver for kernver in used_kernver)
608        unused_system_map = set(all_system_map) - set(used_system_map)
609
610        # find all the module directories associated with unused kernels
611        modules_prefix = '/lib/modules/'
612        all_moddirs = [dir for dir in self.list_files_glob(modules_prefix + '*')
613                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', dir)]
614        used_moddirs = self.symlink_closure(modules_prefix + kernver
615                                            for kernver in used_kernver)
616        unused_moddirs = set(all_moddirs) - set(used_moddirs)
617
618        # remove all the vmlinuz files we don't use
619        # TODO: if needed this should become package manager agnostic
620        for vmlinuz in unused_vmlinuz:
621            # try and get an rpm package name
622            rpm = self.run('rpm -qf', args=(vmlinuz,),
623                           ignore_status=True, timeout=120)
624            if rpm.exit_status == 0:
625                packages = set(line.strip() for line in
626                               rpm.stdout.splitlines())
627                # if we found some package names, try to remove them
628                for package in packages:
629                    self.run('rpm -e', args=(package,),
630                             ignore_status=True, timeout=120)
631            # remove the image files anyway, even if rpm didn't
632            self.run('rm -f', args=(vmlinuz,),
633                     ignore_status=True, timeout=120)
634
635        # remove all the vmlinux and System.map files left over
636        for f in (unused_vmlinux | unused_system_map):
637            self.run('rm -f', args=(f,),
638                     ignore_status=True, timeout=120)
639
640        # remove all unused module directories
641        # the regex match should keep us safe from removing the wrong files
642        for moddir in unused_moddirs:
643            self.run('rm -fr', args=(moddir,), ignore_status=True)
644