base_classes.py revision f83a5a2a3e536e18ae4a43ed515fbde94aeba375
1# Copyright 2009 Google Inc. Released under the GPL v2
2
3"""
4This module defines the base classes for the Host hierarchy.
5
6Implementation details:
7You should import the "hosts" package instead of importing each type of host.
8
9        Host: a machine on which you can run programs
10"""
11
12__author__ = """
13mbligh@google.com (Martin J. Bligh),
14poirier@google.com (Benjamin Poirier),
15stutsman@google.com (Ryan Stutsman)
16"""
17
18import os, re, time, cStringIO, logging
19
20from autotest_lib.client.common_lib import global_config, error, utils
21from autotest_lib.client.bin import partition
22
23
class Host(object):
    """
    This class represents a machine on which you can run programs.

    It may be a local machine, the one autoserv is running on, a remote
    machine or a virtual machine.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here. You must not instantiate this class but should
    instantiate one of those leaf subclasses.

    When overriding methods that raise NotImplementedError, the leaf class
    is fully responsible for the implementation and should not chain calls
    to super. When overriding methods that are a NOP in Host, the subclass
    should chain calls to super(). The criteria for fitting a new method into
    one category or the other should be:
        1. If two separate generic implementations could reasonably be
           concatenated, then the abstract implementation should pass and
           subclasses should chain calls to super.
        2. If only one class could reasonably perform the stated function
           (e.g. two separate run() implementations cannot both be executed)
           then the method should raise NotImplementedError in Host, and
           the implementor should NOT chain calls to super, to ensure that
           only one implementation ever gets executed.
    """

    # job object used by record(), log_kernel() and log_reboot(); those
    # helpers skip status logging while this is unset
    job = None
    # default value of the timeout handed to wait_up() by wait_for_restart()
    DEFAULT_REBOOT_TIMEOUT = 1800
    # timeout and warning_timer handed to wait_down() by wait_for_restart()
    WAIT_DOWN_REBOOT_TIMEOUT = 840
    WAIT_DOWN_REBOOT_WARNING = 540
    # converted to seconds (* 3600) by _repair_wait_for_reboot()
    HOURS_TO_WAIT_FOR_RECOVERY = 2.5


    def __init__(self, *args, **dargs):
        """ Forward all constructor arguments to _initialize(). """
        self._initialize(*args, **dargs)


    def _initialize(self, *args, **dargs):
        """
        Set up the bookkeeping used by the repair methods.

        The arguments are accepted but not used by this implementation.
        """
        # (func, args, dargs) tuples already attempted by _call_repair_func()
        self._already_repaired = []
        # set by erase_dir_contents(); makes repair_filesystem_only() reboot
        self._removed_files = False


    def close(self):
        """ NOP in Host; subclasses should chain calls to super(). """
        pass


    def setup(self):
        """ NOP in Host; subclasses should chain calls to super(). """
        pass


    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None):
        """
        Run a command on this host.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee/stderr_tee: where to tee the stdout/stderr
        @param stdin: stdin to pass to the executed process

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        """
        raise NotImplementedError('Run not implemented!')


    def run_output(self, command, *args, **dargs):
        """
        Run a command and return only its standard output.

        @param command: the command line string
        @param args, dargs: passed on to run() unchanged

        @return the stdout of the command with trailing whitespace removed
        """
        return self.run(command, *args, **dargs).stdout.rstrip()


    def _run_discarding_output(self, command):
        """
        Run a command with its stdout teed to os.devnull instead of the logs.

        The devnull file object is closed once run() returns or raises, so
        no file handle is leaked per call.

        @param command: the command line string

        @return whatever run() returns; callers read its stdout attribute
        """
        devnull = open(os.devnull, 'w')
        try:
            return self.run(command, stdout_tee=devnull)
        finally:
            devnull.close()


    def reboot(self):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Reboot not implemented!')


    def sysrq_reboot(self):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Sysrq reboot not implemented!')


    def reboot_setup(self, *args, **dargs):
        """ NOP in Host; subclasses should chain calls to super(). """
        pass


    def reboot_followup(self, *args, **dargs):
        """
        NOP in Host; subclasses should chain calls to super().

        Called by wait_for_restart() once the host is back up, with the
        extra keyword arguments wait_for_restart() received.
        """
        pass


    def get_file(self, source, dest, delete_dest=False):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Get file not implemented!')


    def send_file(self, source, dest, delete_dest=False):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Send file not implemented!')


    def get_tmp_dir(self):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Get temp dir not implemented!')


    def is_up(self):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Is up not implemented!')


    def is_shutting_down(self):
        """
        Indicates if a machine is currently shutting down.

        @return True if the second field printed by the "runlevel" command
                is 0 or 6, False otherwise (including when that field is
                missing or is not a number)
        """
        fields = self.run("runlevel").stdout.split()
        # "runlevel" may print a single word (e.g. "unknown") or a
        # non-numeric level such as S; neither is runlevel 0 or 6, so
        # report "not shutting down" instead of failing on the conversion
        try:
            runlevel = int(fields[1])
        except (IndexError, ValueError):
            return False
        return runlevel in (0, 6)


    def get_wait_up_processes(self):
        """
        Gets the list of local processes to wait for in wait_up.

        @return a set of the non-empty, whitespace-stripped entries of the
                comma separated "wait_up_processes" value in the HOSTS
                section of the global config (empty set if it is not set)
        """
        get_config = global_config.global_config.get_config_value
        proc_list = get_config("HOSTS", "wait_up_processes",
                               default="").strip()
        processes = set(p.strip() for p in proc_list.split(","))
        # an empty config value (or a stray comma) yields "" entries
        processes.discard("")
        return processes


    def wait_up(self, timeout=None):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Wait up not implemented!')


    def wait_down(self, timeout=None, warning_timer=None):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Wait down not implemented!')


    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT, **dargs):
        """
        Wait for the host to come back from a reboot. This is a generic
        implementation based entirely on wait_up and wait_down.

        @param timeout: passed to each wait_up() call
        @param dargs: passed on to reboot_followup() after a successful
                restart

        @raises AutoservShutdownError: wait_down() returned a false value
        @raises AutoservRebootError: the final wait_up() returned a false
                value
        """
        if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT,
                              warning_timer=self.WAIT_DOWN_REBOOT_WARNING):
            self.record("ABORT", None, "reboot.verify", "shut down failed")
            raise error.AutoservShutdownError("Host did not shut down")

        # the result of this first wait is deliberately ignored; only the
        # second check, made after a short pause, decides the outcome
        self.wait_up(timeout)
        time.sleep(2)    # this is needed for complete reliability
        if self.wait_up(timeout):
            self.record("GOOD", None, "reboot.verify")
            self.reboot_followup(**dargs)
        else:
            self.record("ABORT", None, "reboot.verify",
                        "Host did not return from reboot")
            raise error.AutoservRebootError("Host did not return from reboot")


    def verify(self):
        """ NOP in Host; subclasses should chain calls to super(). """
        pass


    def verify_hardware(self):
        """ NOP in Host; subclasses should chain calls to super(). """
        pass


    def verify_software(self):
        """ NOP in Host; subclasses should chain calls to super(). """
        pass


    def check_diskspace(self, path, gb):
        """
        Check that enough free space is available under a path.

        Relies on self.hostname, which is not set by Host itself
        (presumably provided by subclasses).

        @param path: path handed to df on the host
        @param gb: minimum amount of free space required, in GB

        @raises AutoservDiskFullHostError: less than gb GB are available
        """
        logging.info('Checking for >= %s GB of space under %s on machine %s',
                     gb, path, self.hostname)
        # quote and escape the path like path_exists() does, so that paths
        # containing spaces or shell metacharacters reach df intact
        df_cmd = 'df -mP "%s" | tail -1' % utils.sh_escape(path)
        df = self.run(df_cmd).stdout.split()
        # with -P the fourth column is the available space (in MB with -m)
        free_space_gb = int(df[3]) / 1000.0
        if free_space_gb < gb:
            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
        else:
            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
                free_space_gb, gb, path, self.hostname)


    def get_open_func(self, use_cache=True):
        """
        Defines and returns a function that may be used instead of built-in
        open() to open and read files. The returned function is implemented
        by using self.run('cat <file>') and may cache the results for the same
        filename.

        @param use_cache Cache results of self.run('cat <filename>') for the
            same filename

        @return a function that can be used instead of built-in open()
        """
        cached_files = {}

        def open_func(filename):
            if use_cache and filename in cached_files:
                # hand out the same buffer again, rewound to the start
                cached = cached_files[filename]
                cached.seek(0)
                return cached

            output = self._run_discarding_output(
                'cat \'%s\'' % filename).stdout
            fd = cStringIO.StringIO(output)
            if use_cache:
                cached_files[filename] = fd
            return fd

        return open_func


    def check_partitions(self, root_part, filter_func=None):
        """ Compare the contents of /proc/partitions with those of
        /proc/mounts and raise exception in case unmounted partitions are found

        root_part: in Linux /proc/mounts will never directly mention the root
        partition as being mounted on / instead it will say that /dev/root is
        mounted on /. Thus require this argument to filter out the root_part
        from the ones checked to be mounted

        filter_func: unary predicate for additional filtering out of
        partitions required to be mounted

        Raise: error.AutoservHostError if unfiltered unmounted partition found
        """

        # use logging like the rest of this class instead of writing
        # straight to stdout
        logging.info('Checking if non-swap partitions are mounted...')

        unmounted = partition.get_unmounted_partition_list(root_part,
            filter_func=filter_func, open_func=self.get_open_func())
        if unmounted:
            raise error.AutoservNotMountedHostError(
                'Found unmounted partitions: %s' %
                [part.device for part in unmounted])


    def _repair_wait_for_reboot(self):
        """
        Repair procedure for a host that was found shutting down.

        Waits up to HOURS_TO_WAIT_FOR_RECOVERY for a full restart if the
        host is still shutting down, otherwise just for it to be up.
        """
        timeout = int(self.HOURS_TO_WAIT_FOR_RECOVERY * 3600)
        if self.is_shutting_down():
            logging.info('Host is shutting down, waiting for a restart')
            self.wait_for_restart(timeout)
        else:
            self.wait_up(timeout)


    def _get_mountpoint(self, path):
        """Given a "path" get the mount point of the filesystem containing
        that path.

        The lookup is done by a small python2.4 script run on the host.

        @param path: path on the host

        @return the stdout of that script with trailing whitespace removed
        """
        code = ('import os\n'
                # sanitize the path and resolve symlinks
                'path = os.path.realpath(%r)\n'
                "while path != '/' and not os.path.ismount(path):\n"
                '    path, _ = os.path.split(path)\n'
                'print path\n') % (path,)
        # the script is embedded in a double quoted shell string, so any
        # backslash, dollar, backquote or double quote in it (repr() of an
        # unusual path can contain them) must be escaped
        return self._run_discarding_output(
            'python2.4 -c "%s"' % utils.sh_escape(code)).stdout.rstrip()


    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
        """
        Empty a given directory path contents.

        @param path: directory on the host whose entries are removed
        @param ignore_status: passed on to run()
        @param timeout: passed on to run()
        """
        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
        # escape the path since it is placed inside double quotes
        self.run(rm_cmd % utils.sh_escape(path), ignore_status=ignore_status,
                 timeout=timeout)
        # remembered so that repair_filesystem_only() reboots the host
        self._removed_files = True


    def repair_full_disk(self, mountpoint):
        """
        Free space on a full filesystem by emptying temporary directories.

        @param mountpoint: mount point of the filesystem that is full; only
                temporary directories living on it are emptied
        """
        # it's safe to remove /tmp and /var/tmp, site specific overrides may
        # want to remove some other places too
        if mountpoint == self._get_mountpoint('/tmp'):
            self.erase_dir_contents('/tmp')

        if mountpoint == self._get_mountpoint('/var/tmp'):
            self.erase_dir_contents('/var/tmp')


    def _call_repair_func(self, err, func, *args, **dargs):
        """
        Run a repair procedure, at most once per distinct call.

        @param err: the exception that triggered the repair; re-raised if
                this exact (func, args, dargs) call was attempted before
        @param func: the repair procedure to call
        @param args, dargs: arguments passed to func

        @raises AutoservHardwareRepairRequestedError: propagated from func
        """
        this_call = (func, args, dargs)
        if this_call in self._already_repaired:
            # re-raising the original exception because surrounding
            # error handling may want to try other ways to fix it
            logging.warning('Already done this (%s) repair procedure, '
                            're-raising the original exception.', func)
            raise err

        try:
            func(*args, **dargs)
        except error.AutoservHardwareRepairRequestedError:
            # let this special exception propagate
            raise
        except error.AutoservError:
            logging.exception('Repair failed but continuing in case it managed'
                              ' to repair enough')

        self._already_repaired.append(this_call)


    def repair_filesystem_only(self):
        """perform file system repairs only

        Runs verify() in a loop, applying a repair procedure for each
        shutting-down or disk-full failure, until verify() succeeds or
        _call_repair_func() re-raises a failure whose repair was already
        tried. Any other exception from verify() propagates.
        """
        while True:
            # try to repair specific problems
            try:
                logging.info('Running verify to find failures to repair...')
                self.verify()
                if self._removed_files:
                    logging.info('Removed files, rebooting to release the'
                                 ' inodes')
                    self.reboot()
                return # verify succeeded, then repair succeeded
            except error.AutoservHostIsShuttingDownError as err:
                logging.exception('verify failed')
                self._call_repair_func(err, self._repair_wait_for_reboot)
            except error.AutoservDiskFullHostError as err:
                logging.exception('verify failed')
                self._call_repair_func(err, self.repair_full_disk,
                                       self._get_mountpoint(err.path))


    def repair_software_only(self):
        """perform software repairs only

        Retries repair_filesystem_only(), reinstalling the machine through
        machine_install() when it fails with an SSH or disk-full error.
        The loop ends when the repair succeeds or _call_repair_func()
        re-raises because the reinstall was already attempted.
        """
        while True:
            try:
                self.repair_filesystem_only()
                break
            except (error.AutoservSshPingHostError, error.AutoservSSHTimeout,
                    error.AutoservSshPermissionDeniedError,
                    error.AutoservDiskFullHostError) as err:
                logging.exception('verify failed')
                logging.info('Trying to reinstall the machine')
                self._call_repair_func(err, self.machine_install)


    def repair_full(self):
        """
        Perform software repairs, falling back to requesting hardware repair.

        Retries repair_software_only(), calling request_hardware_repair()
        when it fails with AutoservHardwareHostError. The loop ends when the
        repair succeeds or _call_repair_func() raises.
        """
        while True:
            try:
                self.repair_software_only()
                break
            except error.AutoservHardwareHostError as err:
                logging.exception('verify failed')
                # software repair failed, try hardware repair
                logging.info('Hardware problem found, '
                             'requesting hardware repairs')
                self._call_repair_func(err, self.request_hardware_repair)


    def cleanup(self):
        """ NOP in Host; subclasses should chain calls to super(). """
        pass


    def machine_install(self):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Machine install not implemented!')


    def install(self, installableObject):
        """
        Install something on this host.

        @param installableObject: object whose install() method is called
                with this host as its only argument
        """
        installableObject.install(self)


    def get_autodir(self):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Get autodir not implemented!')


    def set_autodir(self):
        """ Abstract; must be implemented by a leaf subclass. """
        raise NotImplementedError('Set autodir not implemented!')


    def start_loggers(self):
        """ Called to start continuous host logging. """
        pass


    def stop_loggers(self):
        """ Called to stop continuous host logging. """
        pass


    # some extra methods simplify the retrieval of information about the
    # Host machine, with generic implementations based on run(). subclasses
    # should feel free to override these if they can provide better
    # implementations for their specific Host types

    def get_num_cpu(self):
        """
        Get the number of CPUs in the host according to /proc/cpuinfo.

        @return the number of lines of /proc/cpuinfo that start with
                "processor"
        """
        proc_cpuinfo = self._run_discarding_output('cat /proc/cpuinfo').stdout
        return sum(1 for line in proc_cpuinfo.splitlines()
                   if line.startswith('processor'))


    def get_arch(self):
        """
        Get the hardware architecture of the remote machine.

        @return the output of "uname -m", with every i<digit>86 variant
                reported as i386
        """
        arch = self.run('/bin/uname -m').stdout.rstrip()
        if re.match(r'i\d86$', arch):
            arch = 'i386'
        return arch


    def get_kernel_ver(self):
        """ Get the kernel version of the remote machine. """
        return self.run('/bin/uname -r').stdout.rstrip()


    def get_cmdline(self):
        """ Get the kernel command line of the remote machine. """
        return self.run('cat /proc/cmdline').stdout.rstrip()


    def path_exists(self, path):
        """
        Determine if path exists on the remote machine.

        @param path: path to look up with ls on the host

        @return True if ls exited with status 0, False otherwise
        """
        result = self.run('ls "%s" > /dev/null' % utils.sh_escape(path),
                          ignore_status=True)
        return result.exit_status == 0


    # some extra helpers for doing job-related operations

    def record(self, *args, **dargs):
        """ Helper method for recording status logs against Host.job that
        silently becomes a NOP if Host.job is not available. The args and
        dargs are passed on to Host.job.record unchanged. """
        if self.job:
            self.job.record(*args, **dargs)


    def log_kernel(self):
        """ Helper method for logging kernel information into the status logs.
        Intended for cases where the "current" kernel is not really defined
        and we want to explicitly log it. Does nothing if this host isn't
        actually associated with a job. """
        if self.job:
            kernel = self.get_kernel_ver()
            self.job.record("INFO", None, None,
                            optional_fields={"kernel": kernel})


    def log_reboot(self, reboot_func):
        """ Decorator for wrapping a reboot in a group for status
        logging purposes. The reboot_func parameter should be an actual
        function that carries out the reboot.
        """
        # RUNNING_LOG_REBOOT only exists while a wrapped reboot is running,
        # so a nested call falls through to the plain reboot_func() call
        if self.job and not hasattr(self, "RUNNING_LOG_REBOOT"):
            self.RUNNING_LOG_REBOOT = True
            try:
                self.job.run_reboot(reboot_func, self.get_kernel_ver)
            finally:
                del self.RUNNING_LOG_REBOOT
        else:
            reboot_func()


    def request_hardware_repair(self):
        """ Should somehow request (send a mail?) for hardware repairs on
        this machine. The implementation can either return by raising the
        special error.AutoservHardwareRepairRequestedError exception or can
        try to wait until the machine is repaired and then return normally.
        """
        raise NotImplementedError("request_hardware_repair not implemented")
481