autoupdater.py revision 4c50c54c3cd1b4c9360b27b31aa276b215dcaab9
1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import glob
6import httplib
7import logging
8import multiprocessing
9import os
10import re
11import urlparse
12import urllib2
13
14from autotest_lib.client.bin import utils
15from autotest_lib.client.common_lib import error, global_config
16from autotest_lib.client.common_lib.cros import dev_server
17
# Local stateful update path is relative to the CrOS source directory.
LOCAL_STATEFUL_UPDATE_PATH = 'src/platform/dev/stateful_update'
# Location of the stateful_update script inside the Chromium OS chroot; tried
# when the source-tree copy cannot be found.
LOCAL_CHROOT_STATEFUL_UPDATE_PATH = '/usr/bin/stateful_update'
# Copy of the script that lives on the client; used as the last resort.
# NOTE(review): the name is misspelled ("STATEUL"). It is left untouched here
# because renaming it would break every reference to it.
REMOTE_STATEUL_UPDATE_PATH = '/usr/local/bin/stateful_update'
# Path on the host that a locally found stateful_update script is copied to.
STATEFUL_UPDATE = '/tmp/stateful_update'
# update_engine client binary used to build the status/update/rollback
# commands that are run on the host.
UPDATER_BIN = '/usr/bin/update_engine_client'
# Status required right after update-engine has been restarted.
UPDATER_IDLE = 'UPDATE_STATUS_IDLE'
# Status required once an update or a rollback has been applied.
UPDATER_NEED_REBOOT = 'UPDATE_STATUS_UPDATED_NEED_REBOOT'
# Marker file removed from the host before update-engine is restarted.
UPDATED_MARKER = '/var/run/update_engine_autoupdate_completed'
# Host log files that are collected when an update fails.
UPDATER_LOGS = ['/var/log/messages', '/var/log/update_engine']
# A list of update engine client states that occur after an update is triggered.
# NOTE(review): the first entry reads 'CHECKING_FORUPDATE'; update_engine
# presumably reports 'UPDATE_STATUS_CHECKING_FOR_UPDATE' -- confirm before
# relying on this list.
UPDATER_PROCESSING_UPDATE = ['UPDATE_STATUS_CHECKING_FORUPDATE',
                             'UPDATE_STATUS_UPDATE_AVAILABLE',
                             'UPDATE_STATUS_DOWNLOADING',
                             'UPDATE_STATUS_FINALIZING']
33
class ChromiumOSError(error.InstallError):
    """Base class for the ChromiumOS-specific errors raised in this module."""
37
38
class RootFSUpdateError(ChromiumOSError):
    """Signals that updating or rolling back the root filesystem failed."""
42
43
class StatefulUpdateError(ChromiumOSError):
    """Signals that updating the stateful partition failed."""
47
48
def url_to_version(update_url):
    """Extract the Chrome OS version from an update url.

    The version is normally the final component of the url path. Delta
    update urls are the exception because they are rooted under the version,
    e.g. http://.../update/.../0.14.755.0/au/0.14.754.0; for those,
    everything from '/au/' onwards is discarded before the final component
    is read.

    @param update_url: url to the image to update to.
    @returns the final path component (after dropping any '/au/...' suffix)
        with surrounding whitespace removed.

    """
    url_path = urlparse.urlparse(update_url).path
    path_without_delta = re.sub('/au/.*', '', url_path)
    last_component = path_without_delta.split('/')[-1]
    return last_component.strip()
61
62
def url_to_image_name(update_url):
    """Build the image name from the last two path components of a url.

    For example, given
        http://172.22.50.205:8082/update/lumpy-release/R27-3837.0.0
    the result is lumpy-release/R27-3837.0.0

    @param update_url: url to the image to update to.
    @returns a string representing the image name in the update_url.

    """
    path_components = urlparse.urlparse(update_url).path.split('/')
    board_and_version = path_components[-2:]
    return '/'.join(board_and_version)
75
76
def _get_devserver_build_from_update_url(update_url):
    """Get the devserver and build from the update url.

    The CROS/image_url_pattern global config value is turned into a regular
    expression by replacing each '%s' placeholder with a capture group; that
    expression is then searched for in update_url.

    @param update_url: The url for update.
        Eg: http://devserver:port/update/build.

    @return: A tuple of the captured groups, i.e. (devserver url, build).

    @raises ValueError: If the update_url doesn't match the expected pattern.
    @raises ValueError: If the global config doesn't contain an
        image_url_pattern.
    """
    pattern = global_config.global_config.get_config_value(
            'CROS', 'image_url_pattern', type=str, default='')
    if not pattern:
        raise ValueError('Cannot parse update_url, the global config needs '
                'an image_url_pattern.')
    # Raw string: '\S' is a regex class, not a string escape sequence.
    re_pattern = pattern.replace('%s', r'(\S+)')
    parts = re.search(re_pattern, update_url)
    if not parts or len(parts.groups()) < 2:
        raise ValueError('%s is not an update url' % update_url)
    return parts.groups()
100
101
def list_image_dir_contents(update_url):
    """Ask the devserver to list the image directory behind an update url.

    This is a best-effort diagnostic: every failure is logged as a warning
    and nothing is raised to the caller.

    @param update_url: An update url. Eg: http://devserver:port/update/build.
    """
    if not update_url:
        logging.warning('Need update_url to list contents of the devserver.')
        return

    failure_prefix = ('Cannot check contents of devserver, update url %s' %
                      update_url)
    try:
        devserver_url, build = _get_devserver_build_from_update_url(update_url)
    except ValueError as parse_error:
        logging.warning('%s: %s', failure_prefix, parse_error)
        return

    image_server = dev_server.ImageServer(devserver_url)
    # The devserver will retry on URLError to avoid flaky connections, but will
    # eventually raise the URLError if it persists. All HTTPErrors get
    # converted to DevServerExceptions.
    try:
        image_server.list_image_dir(build)
    except (dev_server.DevServerException, urllib2.URLError) as list_error:
        logging.warning('%s: %s', failure_prefix, list_error)
124
125
class ChromiumOSUpdater():
    """Helper class used to update DUT with image of desired version."""
    # Descriptions of the two kernel/rootfs slots. 'kernel' is the partition
    # index handed to "cgpt show -i"; 'root' is the partition number that is
    # compared against the trailing digits of the "rootdev -s" output.
    KERNEL_A = {'name': 'KERN-A', 'kernel': 2, 'root': 3}
    KERNEL_B = {'name': 'KERN-B', 'kernel': 4, 'root': 5}
    # Time (in seconds) to wait for new kernel to be marked successful after
    # auto update.
    KERNEL_UPDATE_TIMEOUT = 120
133
134
135    def __init__(self, update_url, host=None, local_devserver=False):
136        self.host = host
137        self.update_url = update_url
138        self._update_error_queue = multiprocessing.Queue(2)
139        self.local_devserver = local_devserver
140        if not local_devserver:
141          self.update_version = url_to_version(update_url)
142        else:
143          self.update_version = None
144
145    def check_update_status(self):
146        """Return current status from update-engine."""
147        update_status = self._run(
148            '%s -status 2>&1 | grep CURRENT_OP' % UPDATER_BIN)
149        return update_status.stdout.strip().split('=')[-1]
150
151
152    def reset_update_engine(self):
153        """Restarts the update-engine service."""
154        self._run('rm -f %s' % UPDATED_MARKER)
155        try:
156            self._run('initctl stop update-engine')
157        except error.AutoservRunError:
158            logging.warn('Stopping update-engine service failed. Already dead?')
159        self._run('initctl start update-engine')
160
161        if self.check_update_status() != UPDATER_IDLE:
162            raise ChromiumOSError('%s is not in an installable state' %
163                                  self.host.hostname)
164
165
166    def _run(self, cmd, *args, **kwargs):
167        """Abbreviated form of self.host.run(...)"""
168        return self.host.run(cmd, *args, **kwargs)
169
170
171    def rootdev(self, options=''):
172        """Returns the stripped output of rootdev <options>.
173
174        @param options: options to run rootdev.
175
176        """
177        return self._run('rootdev %s' % options).stdout.strip()
178
179
180    def get_kernel_state(self):
181        """Returns the (<active>, <inactive>) kernel state as a pair."""
182        active_root = int(re.findall('\d+\Z', self.rootdev('-s'))[0])
183        if active_root == self.KERNEL_A['root']:
184            return self.KERNEL_A, self.KERNEL_B
185        elif active_root == self.KERNEL_B['root']:
186            return self.KERNEL_B, self.KERNEL_A
187        else:
188            raise ChromiumOSError('Encountered unknown root partition: %s' %
189                                  active_root)
190
191
192    def _cgpt(self, flag, kernel, dev='$(rootdev -s -d)'):
193        """Return numeric cgpt value for the specified flag, kernel, device. """
194        return int(self._run('cgpt show -n -i %d %s %s' % (
195            kernel['kernel'], flag, dev)).stdout.strip())
196
197
198    def get_kernel_priority(self, kernel):
199        """Return numeric priority for the specified kernel.
200
201        @param kernel: information of the given kernel, KERNEL_A or KERNEL_B.
202
203        """
204        return self._cgpt('-P', kernel)
205
206
207    def get_kernel_success(self, kernel):
208        """Return boolean success flag for the specified kernel.
209
210        @param kernel: information of the given kernel, KERNEL_A or KERNEL_B.
211
212        """
213        return self._cgpt('-S', kernel) != 0
214
215
216    def get_kernel_tries(self, kernel):
217        """Return tries count for the specified kernel.
218
219        @param kernel: information of the given kernel, KERNEL_A or KERNEL_B.
220
221        """
222        return self._cgpt('-T', kernel)
223
224
225    def get_stateful_update_script(self):
226        """Returns the path to the stateful update script on the target."""
227        # We attempt to load the local stateful update path in 3 different
228        # ways. First we use the location specified in the autotest global
229        # config. If this doesn't exist, we attempt to use the Chromium OS
230        # Chroot path to the installed script. If all else fails, we use the
231        # stateful update script on the host.
232        stateful_update_path = os.path.join(
233                global_config.global_config.get_config_value(
234                        'CROS', 'source_tree', default=''),
235                LOCAL_STATEFUL_UPDATE_PATH)
236
237        if not os.path.exists(stateful_update_path):
238            logging.warn('Could not find Chrome OS source location for '
239                         'stateful_update script at %s, falling back to chroot '
240                         'copy.', stateful_update_path)
241            stateful_update_path = LOCAL_CHROOT_STATEFUL_UPDATE_PATH
242
243        if not os.path.exists(stateful_update_path):
244            logging.warn('Could not chroot stateful_update script, falling '
245                         'back on client copy.')
246            statefuldev_script = REMOTE_STATEUL_UPDATE_PATH
247        else:
248            self.host.send_file(
249                    stateful_update_path, STATEFUL_UPDATE, delete_dest=True)
250            statefuldev_script = STATEFUL_UPDATE
251
252        return statefuldev_script
253
254
255    def reset_stateful_partition(self):
256        """Clear any pending stateful update request."""
257        statefuldev_cmd = [self.get_stateful_update_script()]
258        statefuldev_cmd += ['--stateful_change=reset', '2>&1']
259        self._run(' '.join(statefuldev_cmd))
260
261
262    def revert_boot_partition(self):
263        """Revert the boot partition."""
264        part = self.rootdev('-s')
265        logging.warn('Reverting update; Boot partition will be %s', part)
266        return self._run('/postinst %s 2>&1' % part)
267
268
269    def trigger_update(self):
270        """Triggers a background update on a test image.
271
272        @raise RootFSUpdateError if anything went wrong.
273
274        """
275        autoupdate_cmd = '%s --check_for_update --omaha_url=%s' % (
276            UPDATER_BIN, self.update_url)
277        logging.info('Triggering update via: %s', autoupdate_cmd)
278        try:
279            self._run(autoupdate_cmd)
280        except (error.AutoservSshPermissionDeniedError,
281                error.AutoservSSHTimeout) as e:
282            raise RootFSUpdateError('SSH on %s is seeing %s' %
283                                    (self.host.hostname, type(e).__name__))
284        except error.AutoservRunError as e:
285
286            # Check if the exit code is 255, if so it's probably a generic
287            # SSH error.
288            result = e.args[1]
289            if result.exit_status == 255:
290              raise RootFSUpdateError('SSH on %s is seeing a generic error.' %
291                                      self.host.hostname)
292
293            # We have ruled out all SSH cases, the error code is from
294            # update_engine_client, though we still don't know why.
295            list_image_dir_contents(self.update_url)
296            raise RootFSUpdateError(
297                    'devserver unreachable, payload unavailable, '
298                    'or AU bug (unlikely) on %s: %s' %
299                    (self.host.hostname, type(e).__name__))
300
301
302    def _verify_update_completed(self):
303        """Verifies that an update has completed.
304
305        @raise RootFSUpdateError: if verification fails.
306        """
307        status = self.check_update_status()
308        if status != UPDATER_NEED_REBOOT:
309            raise RootFSUpdateError('Update did not complete with correct '
310                                    'status. Expecting %s, actual %s' %
311                                            (UPDATER_NEED_REBOOT, status))
312
313
314    def rollback_rootfs(self, powerwash):
315        """Triggers rollback and waits for it to complete.
316
317        @param powerwash: If true, powerwash as part of rollback.
318
319        @raise RootFSUpdateError if anything went wrong.
320
321        """
322        can_rollback_cmd = '%s --can_rollback' % (UPDATER_BIN)
323        logging.info('Checking for rollback.')
324        try:
325            self._run(can_rollback_cmd)
326        except error.AutoservRunError as e:
327            raise RootFSUpdateError("Rollback isn't possible on %s: %s" %
328                                    (self.host.hostname, str(e)))
329
330        rollback_cmd = '%s --rollback --follow' % (UPDATER_BIN)
331        if not powerwash:
332          rollback_cmd += ' --nopowerwash'
333
334        logging.info('Performing rollback.')
335        try:
336            self._run(rollback_cmd)
337        except error.AutoservRunError as e:
338            raise RootFSUpdateError('Rollback failed on %s: %s' %
339                                    (self.host.hostname, str(e)))
340
341        self._verify_update_completed()
342
343
344    def update_rootfs(self):
345        """Updates the rootfs partition only."""
346        logging.info('Updating root partition...')
347
348        # Run update_engine using the specified URL.
349        try:
350            autoupdate_cmd = '%s --update --omaha_url=%s 2>&1' % (
351                UPDATER_BIN, self.update_url)
352            self._run(autoupdate_cmd, timeout=900)
353        except error.AutoservRunError:
354            list_image_dir_contents(self.update_url)
355            update_error = RootFSUpdateError('update-engine failed on %s' %
356                                             self.host.hostname)
357            self._update_error_queue.put(update_error)
358            raise update_error
359
360        try:
361            self._verify_update_completed()
362        except RootFSUpdateError as e:
363            self._update_error_queue.put(e)
364            raise
365
366
367    def update_stateful(self, clobber=True):
368        """Updates the stateful partition.
369
370        @param clobber: If True, a clean stateful installation.
371        """
372        logging.info('Updating stateful partition...')
373        statefuldev_url = self.update_url.replace('update',
374                                                  'static')
375
376        # Attempt stateful partition update; this must succeed so that the newly
377        # installed host is testable after update.
378        statefuldev_cmd = [self.get_stateful_update_script(), statefuldev_url]
379        if clobber:
380            statefuldev_cmd.append('--stateful_change=clean')
381
382        statefuldev_cmd.append('2>&1')
383        try:
384            self._run(' '.join(statefuldev_cmd), timeout=600)
385        except error.AutoservRunError:
386            update_error = StatefulUpdateError('stateful_update failed on %s' %
387                                               self.host.hostname)
388            self._update_error_queue.put(update_error)
389            raise update_error
390
391
392    def run_update(self, force_update, update_root=True):
393        """Update the DUT with image of specific version.
394
395        @param force_update: True to update DUT even if it's running the same
396            version already.
397        @param update_root: True to force a kernel update. If it's False and
398            force_update is True, stateful update will be used to clean up
399            the DUT.
400
401        """
402        booted_version = self.get_build_id()
403        if (self.check_version() and not force_update):
404            logging.info('System is already up to date. Skipping update.')
405            return False
406
407        if self.update_version:
408            logging.info('Updating from version %s to %s.',
409                         booted_version, self.update_version)
410
411        # Check that Dev Server is accepting connections (from autoserv's host).
412        # If we can't talk to it, the machine host probably can't either.
413        auserver_host = urlparse.urlparse(self.update_url)[1]
414        try:
415            httplib.HTTPConnection(auserver_host).connect()
416        except IOError:
417            raise ChromiumOSError(
418                'Update server at %s not available' % auserver_host)
419
420        logging.info('Installing from %s to %s', self.update_url,
421                     self.host.hostname)
422
423        # Reset update state.
424        self.reset_update_engine()
425        self.reset_stateful_partition()
426
427        try:
428            updaters = [
429                multiprocessing.process.Process(target=self.update_rootfs),
430                multiprocessing.process.Process(target=self.update_stateful)
431                ]
432            if not update_root:
433                logging.info('Root update is skipped.')
434                updaters = updaters[1:]
435
436            # Run the updaters in parallel.
437            for updater in updaters: updater.start()
438            for updater in updaters: updater.join()
439
440            # Re-raise the first error that occurred.
441            if not self._update_error_queue.empty():
442                update_error = self._update_error_queue.get()
443                self.revert_boot_partition()
444                self.reset_stateful_partition()
445                raise update_error
446
447            logging.info('Update complete.')
448            return True
449        except:
450            # Collect update engine logs in the event of failure.
451            if self.host.job:
452                logging.info('Collecting update engine logs...')
453                self.host.get_file(
454                    UPDATER_LOGS, self.host.job.sysinfo.sysinfodir,
455                    preserve_perm=False)
456            list_image_dir_contents(self.update_url)
457            raise
458        finally:
459            self.host.show_update_engine_log()
460
461
462    def check_version(self):
463        """Check the image running in DUT has the desired version.
464
465        @returns: True if the DUT's image version matches the version that
466            the autoupdater tries to update to.
467
468        """
469        booted_version = self.get_build_id()
470        return (self.update_version and
471                self.update_version.endswith(booted_version))
472
473
474    def check_version_to_confirm_install(self):
475        """Check image running in DUT has the desired version to be installed.
476
477        The method should not be used to check if DUT needs to have a full
478        reimage. Only use it to confirm a image is installed.
479
480        The method is designed to verify version for following 4 scenarios with
481        samples of version to update to and expected booted version:
482        1. trybot paladin build.
483        update version: trybot-lumpy-paladin/R27-3837.0.0-b123
484        booted version: 3837.0.2013_03_21_1340
485
486        2. trybot release build.
487        update version: trybot-lumpy-release/R27-3837.0.0-b456
488        booted version: 3837.0.0
489
490        3. buildbot official release build.
491        update version: lumpy-release/R27-3837.0.0
492        booted version: 3837.0.0
493
494        4. non-official paladin rc build.
495        update version: lumpy-paladin/R27-3878.0.0-rc7
496        booted version: 3837.0.0-rc7
497
498        5. chrome-perf build.
499        update version: lumpy-chrome-perf/R28-3837.0.0-b2996
500        booted version: 3837.0.0
501
502        6. pgo-generate build.
503        update version: lumpy-release-pgo-generate/R28-3837.0.0-b2996
504        booted version: 3837.0.0-pgo-generate
505
506        When we are checking if a DUT needs to do a full install, we should NOT
507        use this method to check if the DUT is running the same version, since
508        it may return false positive for a DUT running trybot paladin build to
509        be updated to another trybot paladin build.
510
511        TODO: This logic has a bug if a trybot paladin build failed to be
512        installed in a DUT running an older trybot paladin build with same
513        platform number, but different build number (-b###). So to conclusively
514        determine if a tryjob paladin build is imaged successfully, we may need
515        to find out the date string from update url.
516
517        @returns: True if the DUT's image version (without the date string if
518            the image is a trybot build), matches the version that the
519            autoupdater is trying to update to.
520
521        """
522        # In the local_devserver case, we can't know the expected
523        # build, so just pass.
524        if not self.update_version:
525            return True
526
527        # Always try the default check_version method first, this prevents
528        # any backward compatibility issue.
529        if self.check_version():
530            return True
531
532        # Remove R#- and -b# at the end of build version
533        stripped_version = re.sub(r'(R\d+-|-b\d+)', '', self.update_version)
534
535        booted_version = self.get_build_id()
536
537        is_trybot_paladin_build = re.match(r'.+trybot-.+-paladin',
538                                           self.update_url)
539
540        # Replace date string with 0 in booted_version
541        booted_version_no_date = re.sub(r'\d{4}_\d{2}_\d{2}_\d+', '0',
542                                        booted_version)
543        has_date_string = booted_version != booted_version_no_date
544
545        is_pgo_generate_build = re.match(r'.+-pgo-generate',
546                                           self.update_url)
547
548        # Remove |-pgo-generate| in booted_version
549        booted_version_no_pgo = booted_version.replace('-pgo-generate', '')
550        has_pgo_generate = booted_version != booted_version_no_pgo
551
552        if is_trybot_paladin_build:
553            if not has_date_string:
554                logging.error('A trybot paladin build is expected. Version ' +
555                              '"%s" is not a paladin build.', booted_version)
556                return False
557            return stripped_version == booted_version_no_date
558        elif is_pgo_generate_build:
559            if not has_pgo_generate:
560                logging.error('A pgo-generate build is expected. Version ' +
561                              '"%s" is not a pgo-generate build.',
562                              booted_version)
563                return False
564            return stripped_version == booted_version_no_pgo
565        else:
566            if has_date_string:
567                logging.error('Unexpected date found in a non trybot paladin' +
568                              ' build.')
569                return False
570            # Versioned build, i.e., rc or release build.
571            return stripped_version == booted_version
572
573
574    def get_build_id(self):
575        """Pulls the CHROMEOS_RELEASE_VERSION string from /etc/lsb-release."""
576        return self._run('grep CHROMEOS_RELEASE_VERSION'
577                         ' /etc/lsb-release').stdout.split('=')[1].strip()
578
579
580    def verify_boot_expectations(self, expected_kernel_state, rollback_message):
581        """Verifies that we fully booted given expected kernel state.
582
583        This method both verifies that we booted using the correct kernel
584        state and that the OS has marked the kernel as good.
585
586        @param expected_kernel_state: kernel state that we are verifying with
587            i.e. I expect to be booted onto partition 4 etc. See output of
588            get_kernel_state.
589        @param rollback_message: string to raise as a ChromiumOSError
590            if we booted with the wrong partition.
591
592        @raises ChromiumOSError: If we didn't.
593        """
594        # Figure out the newly active kernel.
595        active_kernel_state = self.get_kernel_state()[0]
596
597        # Check for rollback due to a bad build.
598        if (expected_kernel_state and
599                active_kernel_state != expected_kernel_state):
600
601            # Kernel crash reports should be wiped between test runs, but
602            # may persist from earlier parts of the test, or from problems
603            # with provisioning.
604            #
605            # Kernel crash reports will NOT be present if the crash happened
606            # before encrypted stateful is mounted.
607            #
608            # TODO(dgarrett): Integrate with server/crashcollect.py at some
609            # point.
610            kernel_crashes = glob.glob('/var/spool/crash/kernel.*.kcrash')
611            if kernel_crashes:
612                rollback_message += ': kernel_crash'
613                logging.debug('Found %d kernel crash reports:',
614                              len(kernel_crashes))
615                # The crash names contain timestamps that may be useful:
616                #   kernel.20131207.005945.0.kcrash
617                for crash in kernel_crashes:
618                  logging.debug('  %s', os.path.basename(crash))
619
620            # Print out some information to make it easier to debug
621            # the rollback.
622            logging.debug('Dumping partition table.')
623            self._run('cgpt show $(rootdev -s -d)')
624            logging.debug('Dumping crossystem for firmware debugging.')
625            self._run('crossystem --all')
626            raise ChromiumOSError(rollback_message)
627
628        # Make sure chromeos-setgoodkernel runs.
629        try:
630            utils.poll_for_condition(
631                lambda: (self.get_kernel_tries(active_kernel_state) == 0
632                         and self.get_kernel_success(active_kernel_state)),
633                exception=ChromiumOSError(),
634                timeout=self.KERNEL_UPDATE_TIMEOUT, sleep_interval=5)
635        except ChromiumOSError:
636            services_status = self._run('status system-services').stdout
637            if services_status != 'system-services start/running\n':
638                event = ('Chrome failed to reach login screen')
639            else:
640                event = ('update-engine failed to call '
641                         'chromeos-setgoodkernel')
642            raise ChromiumOSError(
643                    'After update and reboot, %s '
644                    'within %d seconds' % (event,
645                                           self.KERNEL_UPDATE_TIMEOUT))
646