dev_server.py revision 2c32d6b593c4987a525ef162d6704fa6d6d7c0b0
1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5from distutils import version
6import cStringIO
7import HTMLParser
8import httplib
9import json
10import logging
11import multiprocessing
12import os
13import re
14import socket
15import time
16import urllib2
17import urlparse
18
19from autotest_lib.client.bin import utils as bin_utils
20from autotest_lib.client.common_lib import android_utils
21from autotest_lib.client.common_lib import error
22from autotest_lib.client.common_lib import global_config
23from autotest_lib.client.common_lib import utils
24from autotest_lib.client.common_lib.cros import retry
25from autotest_lib.server import utils as server_utils
26# TODO(cmasone): redo this class using requests module; http://crosbug.com/30107
27
28try:
29    from chromite.lib import metrics
30except ImportError:
31    metrics = utils.metrics_mock
32
33
# Shared handle on the global autotest configuration; every config lookup in
# this module goes through it.
CONFIG = global_config.global_config
# This file is generated at build time and specifies, per suite and per test,
# the DEPENDENCIES list specified in each control file.  It's a dict of dicts:
# {'bvt':   {'/path/to/autotest/control/site_tests/test1/control': ['dep1']}
#  'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']}
#  'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'],
#            '/path/to/autotest/control/site_tests/test3/control': ['dep3']}
# }
DEPENDENCIES_FILE = 'test_suites/dependency_info'
# Number of seconds a caller waits between polls of the devserver's is_staged
# call while checking whether artifacts are staged.
_ARTIFACT_STAGE_POLLING_INTERVAL = 5
# Artifacts (comma-separated) that should be staged when a client calls the
# devserver RPC to stage an image.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = 'full_payload,test_suites,stateful'
# Artifacts (comma-separated) that should be staged when a client calls the
# devserver RPC to stage an image together with the autotest artifacts.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,'
                                                   'control_files,stateful,'
                                                   'autotest_packages')
# Artifacts (comma-separated) that should be staged when a client calls the
# devserver RPC to stage an Android build.
_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions')
# When True, the free-disk and Apache-client-count health checks in this
# module report success without inspecting the devserver load.
SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value(
        'CROS', 'skip_devserver_health_check', type=bool)
# Number of seconds for the call to get devserver load to time out.
TIMEOUT_GET_DEVSERVER_LOAD = 2.0

# Android artifact path in devserver. Backslashes in the configured value are
# removed before use.
ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value(
        'CROS', 'android_build_name_pattern', type=str).replace('\\', '')

# Return value from a devserver RPC indicating the call succeeded.
SUCCESS = 'Success'

# Timeout, in minutes, for a single devserver ssh call. Also the default
# timeout of the devserver load and health checks in this module.
DEVSERVER_SSH_TIMEOUT_MINS = 1

# Error message for invalid devserver response.
ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error'

# Substring looked for in an exception's text to decide that a devserver call
# timed out.
ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout'

# Timeout, in minutes, for waiting on a devserver staging call. This is the
# default retry timeout of remote_devserver_call.
DEVSERVER_IS_STAGING_RETRY_MIN = 100

# Timeout, in minutes, for waiting for a DUT auto-update to finish.
DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100

# Total number of times a devserver may trigger CrOS auto-update.
AU_RETRY_LIMIT = 3

# Number of seconds for caller to poll devserver's get_au_status call to
# check if cros auto-update is finished.
CROS_AU_POLLING_INTERVAL = 10

# Number of seconds to wait between retries of auto-update calls.
CROS_AU_RETRY_INTERVAL = 20

# File name template for auto-update logs; takes two string substitutions.
CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log'

# Provision error patterns, as (regular expression, classification) pairs.
# People who see this should know that they shouldn't change these
# classification strings. These strings are used for monitoring provision
# failures. Any changes may mess up the stats.
_EXCEPTION_PATTERNS = [
        # Raised when devserver portfile does not exist on host.
        (r".*Devserver portfile does not exist!.*$",
         '(1) Devserver portfile does not exist on host'),
        # Raised when devserver cannot copy packages to host.
        (r".*Could not copy .* to device.*$",
         '(2) Cannot copy packages to host'),
        # Raised when devserver fails to run specific commands on host.
        (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$",
         '(3) Fail to run specific command on host'),
        # Raised when new build fails to boot on the host.
        (r'.*RootfsUpdateError: Build .* failed to boot on.*$',
         '(4) Build failed to boot on host'),
        # Raised when the auto-update process is timed out.
        (r'.*The CrOS auto-update process is timed out, '
         'thus will be terminated.*$',
         '(5) Auto-update is timed out'),
        # Raised when the host is not pingable.
        (r".*DeviceNotPingableError.*$",
         '(6) Host is not pingable during auto-update'),
        # Raised when hosts have unexpected status after rootfs update.
        (r'.*Update failed with unexpected update status: '
         'UPDATE_STATUS_IDLE.*$',
         '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs '
         'update'),
        # Raised when devserver returns non-json response to shard/drone.
        (r'.*No JSON object could be decoded.*$',
         '(8) Devserver returned non-json object'),
        # Raised when devserver loses host's ssh connection.
        (r'.*SSHConnectionError\: .* port 22\: Connection timed out.*$',
         "(9) Devserver lost host's ssh connection"),
        # Raised when an error happens while writing files to host.
        (r'.*Write failed\: Broken pipe.*$',
         "(10) Broken pipe while writing or connecting to host")]

# Default for the prefer_local_devserver argument of
# DevServer.get_available_devservers: prefer a devserver in the DUT's subnet.
PREFER_LOCAL_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'prefer_local_devserver', type=bool, default=False)

# NOTE(review): presumably toggles making devserver calls over ssh; it is not
# referenced in the part of the file reviewed here -- confirm against users.
ENABLE_SSH_CONNECTION_FOR_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'enable_ssh_connection_for_devserver', type=bool,
        default=False)

# Directory to save auto-update logs
AUTO_UPDATE_LOG_DIR = 'autoupdate_logs'

# Default number of subnet mask bits used when matching a DUT to devservers
# in the same subnet.
DEFAULT_SUBNET_MASKBIT = 19
147
148
class DevServerException(Exception):
    """Error raised when the dev server answers with a non-200 HTTP response.

    Within this module it is also the default exception raised by
    remote_devserver_call and the error raised when no devserver can be
    resolved.
    """
152
153
class DevServerOverloadException(Exception):
    """Error raised when the dev server answers with a 502 HTTP response."""
157
158
class MarkupStripper(HTMLParser.HTMLParser):
    """HTML parser that strips HTML tags and keeps only the text content.

    Works by, basically, not doing anything for any tags, and only recording
    the content of text nodes in an internal data structure. Only
    handle_data() is overridden, so anything the base parser does not report
    as text data is not recorded.
    """
    def __init__(self):
        # Let the base class set up all of its parser state instead of
        # relying on reset() alone. HTMLParser is an old-style class in
        # Python 2, so the initializer is called explicitly rather than
        # through super().
        HTMLParser.HTMLParser.__init__(self)
        # Text fragments collected so far, in document order.
        self.fed = []


    def handle_data(self, d):
        """Consume content of text nodes, store it away.

        @param d: The text of one text node.
        """
        self.fed.append(d)


    def get_data(self):
        """Concatenate and return all stored data.

        @return: A string made of every recorded text fragment, in the order
                 they were fed; an empty string if nothing was recorded.
        """
        return ''.join(self.fed)
178
179
def _strip_http_message(message):
    """Strip the HTML markup from an HTTP message.

    The message is first decoded as UTF-32; if that decoding fails, the raw
    message is parsed instead.

    @param message: A string returned by an HTTP call.

    @return: A string with the markup stripped.
    """
    parser = MarkupStripper()
    try:
        decoded = message.decode('utf_32')
        parser.feed(decoded)
    except UnicodeDecodeError:
        # Not valid UTF-32: fall back to the message exactly as received.
        parser.feed(message)
    return parser.get_data()
193
194
def _get_image_storage_server():
    """Read the image storage server from the global config.

    @return: The string value of 'image_storage_server' in section CROS.
    """
    storage_server = CONFIG.get_config_value(
            'CROS', 'image_storage_server', type=str)
    return storage_server
197
198
def _get_canary_channel_server():
    """Read the url of the canary-channel server from the global config.

    An example value is:
    gsutil://chromeos-releases/canary-channel/<board>/<release>

    @return: The url to the canary channel server.
    """
    canary_server = CONFIG.get_config_value(
            'CROS', 'canary_channel_server', type=str)
    return canary_server
207
208
def _get_storage_server_for_artifacts(artifacts=None):
    """Pick the storage server that holds the given artifacts.

    The canary channel server is chosen only when the configured factory
    artifact is non-empty and is contained in |artifacts|.

    @param artifacts: A list of artifacts we need to stage.
    @return: The address of the storage server that has these artifacts.
             The default image storage server if no artifacts are specified.
    """
    factory_artifact = global_config.global_config.get_config_value(
            'CROS', 'factory_artifact', type=str, default='')
    wants_factory_artifact = (artifacts and factory_artifact and
                              factory_artifact in artifacts)
    if not wants_factory_artifact:
        return _get_image_storage_server()
    return _get_canary_channel_server()
221
222
def _reverse_lookup_from_config(address):
    """Find the hostname configured for the given IP address.

    The lookup goes through the hostname-address map from the config file.

    If several hostnames map to the same IP address, the first match met
    while iterating the map is returned. NOTE(review): the original comment
    says this is the first one defined in the configuration file; that relies
    on the iteration order of the map -- confirm.

    @param address: IP address string
    @returns: hostname string, or original input if not found
    """
    matching_hostnames = (
            name for name, mapped_address
            in _get_hostname_addr_map().iteritems()
            if mapped_address == address)
    # Fall back to the address itself when no hostname maps to it.
    return next(matching_hostnames, address)
238
239
def _get_hostname_addr_map():
    """Load the hostname-to-address mapping from the config.

    The mapping is the HOSTNAME_ADDR_MAP config section read as a dict.

    @return: dict mapping server hostnames to addresses
    """
    hostname_addr_map = CONFIG.get_section_as_dict('HOSTNAME_ADDR_MAP')
    return hostname_addr_map
246
247
def _get_dev_server_list():
    """Read the list of dev servers from the global config.

    @return: The list value of 'dev_server' in section CROS, or an empty
             list if it is not configured.
    """
    dev_servers = CONFIG.get_config_value(
            'CROS', 'dev_server', type=list, default=[])
    return dev_servers
250
251
def _get_crash_server_list():
    """Read the list of crash servers from the global config.

    @return: The list value of 'crash_server' in section CROS, or an empty
             list if it is not configured.
    """
    crash_servers = CONFIG.get_config_value(
            'CROS', 'crash_server', type=list, default=[])
    return crash_servers
255
256
def remote_devserver_call(timeout_min=DEVSERVER_IS_STAGING_RETRY_MIN,
                          exception_to_raise=DevServerException):
    """Build a decorator to use with remote devserver calls.

    The decorated method is retried through retry.retry when it raises
    urllib2.URLError, error.CmdError or DevServerOverloadException, to avoid
    devserver flakiness. A urllib2.HTTPError is converted into a
    DevServerException whose message is the error body with any markup
    stripped. When the call finally fails with an exception whose text
    contains ERR_MSG_FOR_TIMED_OUT_CALL, a call_timeout metric is emitted
    before the exception is re-raised.

    @param timeout_min: Handed to retry.retry as its timeout_min.
    @param exception_to_raise: Handed to retry.retry as its
                               exception_to_raise.

    @return: A decorator that wraps a method with the behavior above.
    """
    #pylint: disable=C0111

    def inner_decorator(method):
        # The label names the call in the retry helper and in the metrics.
        if hasattr(method, '__name__'):
            label = method.__name__
        else:
            label = None

        def metrics_wrapper(*args, **kwargs):
            retried_errors = (urllib2.URLError, error.CmdError,
                              DevServerOverloadException)

            @retry.retry(retried_errors,
                         timeout_min=timeout_min,
                         exception_to_raise=exception_to_raise,
                         label=label)
            def wrapper():
                """This wrapper actually catches the HTTPError."""
                try:
                    return method(*args, **kwargs)
                except urllib2.HTTPError as http_error:
                    plain_text = _strip_http_message(http_error.read())
                    raise DevServerException(plain_text)

            try:
                return wrapper()
            except Exception as e:
                if ERR_MSG_FOR_TIMED_OUT_CALL in str(e):
                    # Work out which devserver the timed out call targeted:
                    # either the DevServer instance the method is bound to,
                    # or an explicit 'devserver' keyword argument.
                    if args and isinstance(args[0], DevServer):
                        dev_server = args[0].hostname
                    elif 'devserver' in kwargs:
                        dev_server = get_hostname(kwargs['devserver'])
                    else:
                        dev_server = None

                    logging.debug('RPC call %s has timed out on devserver %s.',
                                  label, dev_server)
                    timeout_counter = metrics.Counter(
                            'chromeos/autotest/devserver/call_timeout')
                    timeout_counter.increment(
                            fields={'dev_server': dev_server,
                                    'healthy': label})

                raise

        return metrics_wrapper

    return inner_decorator
305
306
def get_hostname(url):
    """Extract the hostname portion of a URL.

    The URL is expected to look like schema://hostname:port/path

    @param url: a Url string
    @return: a hostname string
    """
    parsed_url = urlparse.urlparse(url)
    return parsed_url.hostname
316
317
class DevServer(object):
    """Base class for all DevServer-like server stubs.

    This is the base class for interacting with all Dev Server-like servers.
    A caller should obtain an instance of a sub-class of DevServer with:

    server = SubClassServer.resolve(build)

    resolve() picks a healthy devserver url and returns the sub-class
    instance wrapping it. Sub-classes must override servers().
    """
    # Minimum free disk (compared against the FREE_DISK load value) for a
    # devserver to pass is_free_disk_ok().
    _MIN_FREE_DISK_SPACE_GB = 20
    # Maximum Apache client count for a devserver to pass
    # is_apache_client_count_ok().
    _MAX_APACHE_CLIENT_COUNT = 75
    # Threshold for the CPU load percentage for a devserver to be selected.
    MAX_CPU_LOAD = 80.0
    # Threshold for the network IO, set to 80MB/s
    MAX_NETWORK_IO = 1024 * 1024 * 80
    # Names of entries in the devserver load dictionary. FREE_DISK,
    # AU_PROCESS and APACHE_CLIENT_COUNT are read from the load returned by
    # get_devserver_load(); presumably the others are keys of the same
    # dictionary -- confirm against their users.
    DISK_IO = 'disk_total_bytes_per_second'
    NETWORK_IO = 'network_total_bytes_per_second'
    CPU_LOAD = 'cpu_percent'
    FREE_DISK = 'free_disk'
    AU_PROCESS = 'au_process_count'
    STAGING_THREAD_COUNT = 'staging_thread_count'
    APACHE_CLIENT_COUNT = 'apache_client_count'
340
341
    def __init__(self, devserver):
        """Create a stub for one devserver.

        @param devserver: url of the devserver, e.g. http://10.1.1.10:8082.
        """
        self._devserver = devserver
344
345
    def url(self):
        """Returns the url for this devserver.

        @return: The devserver url this instance was constructed with.
        """
        return self._devserver
349
350
351    @property
352    def hostname(self):
353        """Return devserver hostname parsed from the devserver URL.
354
355        Note that this is likely parsed from the devserver URL from
356        shadow_config.ini, meaning that the "hostname" part of the
357        devserver URL is actually an IP address.
358
359        @return hostname string
360        """
361        return get_hostname(self.url())
362
363
364    @property
365    def resolved_hostname(self):
366        """Return devserver hostname, resolved from its IP address.
367
368        Unlike the hostname property, this property attempts to look up
369        the proper hostname from the devserver IP address.  If lookup
370        fails, then fall back to whatever the hostname property would
371        have returned.
372
373        @return hostname string
374        """
375        return _reverse_lookup_from_config(self.hostname)
376
377
378    @staticmethod
379    def get_server_url(url):
380        """Get the devserver url from a repo url, which includes build info.
381
382        @param url: A job repo url.
383
384        @return A devserver url, e.g., http://127.0.0.10:8080
385        """
386        res = urlparse.urlparse(url)
387        if res.netloc:
388            return res.scheme + '://' + res.netloc
389
390
391    @classmethod
392    def get_devserver_load_wrapper(cls, devserver, timeout_sec, output):
393        """A wrapper function to call get_devserver_load in parallel.
394
395        @param devserver: url of the devserver.
396        @param timeout_sec: Number of seconds before time out the devserver
397                            call.
398        @param output: An output queue to save results to.
399        """
400        load = cls.get_devserver_load(devserver, timeout_min=timeout_sec/60.0)
401        if load:
402            load['devserver'] = devserver
403        output.put(load)
404
405
406    @classmethod
407    def get_devserver_load(cls, devserver,
408                           timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
409        """Returns True if the |devserver| is healthy to stage build.
410
411        @param devserver: url of the devserver.
412        @param timeout_min: How long to wait in minutes before deciding the
413                            the devserver is not up (float).
414
415        @return: A dictionary of the devserver's load.
416
417        """
418        call = cls._build_call(devserver, 'check_health')
419        @remote_devserver_call(timeout_min=timeout_min)
420        def get_load(devserver=devserver):
421            """Inner method that makes the call."""
422            return cls.run_call(call, timeout=timeout_min*60)
423
424        try:
425            return json.load(cStringIO.StringIO(get_load(devserver=devserver)))
426        except Exception as e:
427            logging.error('Devserver call failed: "%s", timeout: %s seconds,'
428                          ' Error: %s', call, timeout_min * 60, e)
429
430
431    @classmethod
432    def is_free_disk_ok(cls, load):
433        """Check if a devserver has enough free disk.
434
435        @param load: A dict of the load of the devserver.
436
437        @return: True if the devserver has enough free disk or disk check is
438                 skipped in global config.
439
440        """
441        if SKIP_DEVSERVER_HEALTH_CHECK:
442            logging.debug('devserver health check is skipped.')
443        elif load[cls.FREE_DISK] < cls._MIN_FREE_DISK_SPACE_GB:
444            return False
445
446        return True
447
448
449    @classmethod
450    def is_apache_client_count_ok(cls, load):
451        """Check if a devserver has enough Apache connections available.
452
453        Apache server by default has maximum of 150 concurrent connections. If
454        a devserver has too many live connections, it likely indicates the
455        server is busy handling many long running download requests, e.g.,
456        downloading stateful partitions. It is better not to add more requests
457        to it.
458
459        @param load: A dict of the load of the devserver.
460
461        @return: True if the devserver has enough Apache connections available,
462                 or disk check is skipped in global config.
463
464        """
465        if SKIP_DEVSERVER_HEALTH_CHECK:
466            logging.debug('devserver health check is skipped.')
467        elif cls.APACHE_CLIENT_COUNT not in load:
468            logging.debug('Apache client count is not collected from devserver.')
469        elif (load[cls.APACHE_CLIENT_COUNT] >
470              cls._MAX_APACHE_CLIENT_COUNT):
471            return False
472
473        return True
474
475
476    @classmethod
477    def devserver_healthy(cls, devserver,
478                          timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
479        """Returns True if the |devserver| is healthy to stage build.
480
481        @param devserver: url of the devserver.
482        @param timeout_min: How long to wait in minutes before deciding the
483                            the devserver is not up (float).
484
485        @return: True if devserver is healthy. Return False otherwise.
486
487        """
488        c = metrics.Counter('chromeos/autotest/devserver/devserver_healthy')
489        reason = ''
490        healthy = False
491        load = cls.get_devserver_load(devserver, timeout_min=timeout_min)
492        try:
493            if not load:
494                # Failed to get the load of devserver.
495                reason = '(1) Failed to get load.'
496                return False
497
498            apache_ok = cls.is_apache_client_count_ok(load)
499            if not apache_ok:
500                reason = '(2) Apache client count too high.'
501                logging.error('Devserver check_health failed. Live Apache client '
502                              'count is too high: %d.',
503                              load[cls.APACHE_CLIENT_COUNT])
504                return False
505
506            disk_ok = cls.is_free_disk_ok(load)
507            if not disk_ok:
508                reason = '(3) Disk space too low.'
509                logging.error('Devserver check_health failed. Free disk space is '
510                              'low. Only %dGB is available.',
511                              load[cls.FREE_DISK])
512            healthy = bool(disk_ok)
513            return disk_ok
514        finally:
515            c.increment(fields={'dev_server': cls(devserver).resolved_hostname,
516                                'healthy': healthy,
517                                'reason': reason})
518            # Monitor how many AU processes the devserver is currently running.
519            if load is not None and load.get(DevServer.AU_PROCESS):
520                c_au = metrics.Gauge(
521                        'chromeos/autotest/devserver/devserver_au_count')
522                c_au.set(
523                    load.get(DevServer.AU_PROCESS),
524                    fields={'dev_server': cls(devserver).resolved_hostname})
525
526
527    @staticmethod
528    def _build_call(host, method, **kwargs):
529        """Build a URL to |host| that calls |method|, passing |kwargs|.
530
531        Builds a URL that calls |method| on the dev server defined by |host|,
532        passing a set of key/value pairs built from the dict |kwargs|.
533
534        @param host: a string that is the host basename e.g. http://server:90.
535        @param method: the dev server method to call.
536        @param kwargs: a dict mapping arg names to arg values.
537        @return the URL string.
538        """
539        argstr = '&'.join(map(lambda x: "%s=%s" % x, kwargs.iteritems()))
540        return "%(host)s/%(method)s?%(argstr)s" % dict(
541                host=host, method=method, argstr=argstr)
542
543
544    def build_call(self, method, **kwargs):
545        """Builds a devserver RPC string that is used by 'run_call()'.
546
547        @param method: remote devserver method to call.
548        """
549        return self._build_call(self._devserver, method, **kwargs)
550
551
552    @classmethod
553    def build_all_calls(cls, method, **kwargs):
554        """Builds a list of URLs that makes RPC calls on all devservers.
555
556        Build a URL that calls |method| on the dev server, passing a set
557        of key/value pairs built from the dict |kwargs|.
558
559        @param method: the dev server method to call.
560        @param kwargs: a dict mapping arg names to arg values
561
562        @return the URL string
563        """
564        calls = []
565        # Note we use cls.servers as servers is class specific.
566        for server in cls.servers():
567            if cls.devserver_healthy(server):
568                calls.append(cls._build_call(server, method, **kwargs))
569
570        return calls
571
572
573    @classmethod
574    def run_call(cls, call, readline=False, timeout=None):
575        """Invoke a given devserver call using urllib.open.
576
577        Open the URL with HTTP, and return the text of the response. Exceptions
578        may be raised as for urllib2.urlopen().
579
580        @param call: a url string that calls a method to a devserver.
581        @param readline: whether read http response line by line.
582        @param timeout: The timeout seconds for this urlopen call.
583
584        @return the results of this call.
585        """
586        if timeout is not None:
587            return utils.urlopen_socket_timeout(
588                    call, timeout=timeout).read()
589        elif readline:
590            response = urllib2.urlopen(call)
591            return [line.rstrip() for line in response]
592        else:
593            return urllib2.urlopen(call).read()
594
595
596    @staticmethod
597    def servers():
598        """Returns a list of servers that can serve as this type of server."""
599        raise NotImplementedError()
600
601
602    @classmethod
603    def get_devservers_in_same_subnet(cls, ip, mask_bits=DEFAULT_SUBNET_MASKBIT,
604                                      unrestricted_only=False):
605        """Get the devservers in the same subnet of the given ip.
606
607        @param ip: The IP address of a dut to look for devserver.
608        @param mask_bits: Number of mask bits. Default is 19.
609        @param unrestricted_only: Set to True to select from devserver in
610                unrestricted subnet only. Default is False.
611
612        @return: A list of devservers in the same subnet of the given ip.
613
614        """
615        # server from cls.servers() is a URL, e.g., http://10.1.1.10:8082, so
616        # we need a dict to return the full devserver path once the IPs are
617        # filtered in get_servers_in_same_subnet.
618        server_names = {}
619        all_devservers = []
620        devservers = (cls.get_unrestricted_devservers() if unrestricted_only
621                      else cls.servers())
622        for server in devservers:
623            server_name = get_hostname(server)
624            server_names[server_name] = server
625            all_devservers.append(server_name)
626        devservers = utils.get_servers_in_same_subnet(ip, mask_bits,
627                                                      all_devservers)
628        return [server_names[s] for s in devservers]
629
630
631    @classmethod
632    def get_unrestricted_devservers(
633                cls, restricted_subnets=utils.RESTRICTED_SUBNETS):
634        """Get the devservers not in any restricted subnet specified in
635        restricted_subnets.
636
637        @param restricted_subnets: A list of restriected subnets.
638
639        @return: A list of devservers not in any restricted subnet.
640
641        """
642        if not restricted_subnets:
643            return cls.servers()
644
645        devservers = []
646        for server in cls.servers():
647            server_name = get_hostname(server)
648            if not utils.get_restricted_subnet(server_name, restricted_subnets):
649                devservers.append(server)
650        return devservers
651
652
653    @classmethod
654    def get_healthy_devserver(cls, build, devservers, ban_list=None):
655        """"Get a healthy devserver instance from the list of devservers.
656
657        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
658        @param devservers: The devserver list to be chosen out a healthy one.
659        @param ban_list: The blacklist of devservers we don't want to choose.
660                Default is None.
661
662        @return: A DevServer object of a healthy devserver. Return None if no
663                healthy devserver is found.
664
665        """
666        while devservers:
667            hash_index = hash(build) % len(devservers)
668            devserver = devservers.pop(hash_index)
669            if ban_list and devserver in ban_list:
670                continue
671
672            if cls.devserver_healthy(devserver):
673                return cls(devserver)
674
675
676    @classmethod
677    def get_available_devservers(cls, hostname=None,
678                                 prefer_local_devserver=PREFER_LOCAL_DEVSERVER,
679                                 restricted_subnets=utils.RESTRICTED_SUBNETS):
680        """Get devservers in the same subnet of the given hostname.
681
682        @param hostname: Hostname of a DUT to choose devserver for.
683
684        @return: A tuple of (devservers, can_retry), devservers is a list of
685                 devservers that's available for the given hostname. can_retry
686                 is a flag that indicate if caller can retry the selection of
687                 devserver if no devserver in the returned devservers can be
688                 used. For example, if hostname is in a restricted subnet,
689                 can_retry will be False.
690        """
691        host_ip = None
692        if hostname:
693            host_ip = bin_utils.get_ip_address(hostname)
694            if not host_ip:
695                logging.error('Failed to get IP address of %s. Will pick a '
696                              'devserver without subnet constraint.', hostname)
697
698        if not host_ip:
699            return cls.get_unrestricted_devservers(restricted_subnets), False
700
701        # Go through all restricted subnet settings and check if the DUT is
702        # inside a restricted subnet. If so, only return the devservers in the
703        # restricted subnet and doesn't allow retry.
704        if host_ip and restricted_subnets:
705            for subnet_ip, mask_bits in restricted_subnets:
706                if utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits):
707                    logging.debug('The host %s (%s) is in a restricted subnet. '
708                                  'Try to locate a devserver inside subnet '
709                                  '%s:%d.', hostname, host_ip, subnet_ip,
710                                  mask_bits)
711                    devservers = cls.get_devservers_in_same_subnet(
712                            subnet_ip, mask_bits)
713                    return devservers, False
714
715        # If prefer_local_devserver is set to True and the host is not in
716        # restricted subnet, pick a devserver in the same subnet if possible.
717        # Set can_retry to True so it can pick a different devserver if all
718        # devservers in the same subnet are down.
719        if prefer_local_devserver:
720            return (cls.get_devservers_in_same_subnet(
721                    host_ip, DEFAULT_SUBNET_MASKBIT, True), True)
722
723        return cls.get_unrestricted_devservers(restricted_subnets), False
724
725
726    @classmethod
727    def resolve(cls, build, hostname=None, ban_list=None):
728        """"Resolves a build to a devserver instance.
729
730        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
731        @param hostname: The hostname of dut that requests a devserver. It's
732                         used to make sure a devserver in the same subnet is
733                         preferred.
734        @param ban_list: The blacklist of devservers shouldn't be chosen.
735
736        @raise DevServerException: If no devserver is available.
737        """
738        tried_devservers = set()
739        devservers, can_retry = cls.get_available_devservers(hostname)
740        if devservers:
741            tried_devservers |= set(devservers)
742
743        devserver = cls.get_healthy_devserver(build, devservers,
744                                              ban_list=ban_list)
745
746        if not devserver and can_retry:
747            # Find available devservers without dut location constrain.
748            devservers, _ = cls.get_available_devservers()
749            devserver = cls.get_healthy_devserver(build, devservers,
750                                                  ban_list=ban_list)
751            if devservers:
752                tried_devservers |= set(devservers)
753        if devserver:
754            return devserver
755        else:
756            error_msg = ('All devservers are currently down: %s. '
757                         'dut hostname: %s' %
758                         (tried_devservers, hostname))
759            logging.error(error_msg)
760            raise DevServerException(error_msg)
761
762
763    @classmethod
764    def random(cls):
765        """Return a random devserver that's available.
766
767        Devserver election in `resolve` method is based on a hash of the
768        build that a caller wants to stage. The purpose is that different
769        callers requesting for the same build can get the same devserver,
770        while the lab is able to distribute different builds across all
771        devservers. That helps to reduce the duplication of builds across
772        all devservers.
773        This function returns a random devserver, by passing a random
774        pseudo build name to `resolve `method.
775        """
776        return cls.resolve(build=str(time.time()))
777
778
class CrashServer(DevServer):
    """Class of DevServer that symbolicates crash dumps."""

    @staticmethod
    def servers():
        """Return the crash servers given by _get_crash_server_list()."""
        return _get_crash_server_list()


    @remote_devserver_call()
    def symbolicate_dump(self, minidump_path, build):
        """Ask the devserver to symbolicate the dump at minidump_path.

        Stage the debug symbols for |build| and, if that works, ask the
        devserver to symbolicate the dump at |minidump_path|.

        @param minidump_path: the on-disk path of the minidump.
        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                      whose debug symbols are needed for symbolication.
        @return The contents of the stack trace, or '' when the `requests`
                module cannot be imported.
        @raise DevServerException upon any return code that's not HTTP OK.
               NOTE(review): this body raises urllib2.HTTPError; presumably
               the remote_devserver_call decorator converts it - confirm.
        """
        try:
            import requests
        except ImportError:
            logging.warning("Can't 'import requests' to connect to dev server.")
            return ''
        f = {'dev_server': self.resolved_hostname}
        c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump')
        c.increment(fields=f)
        # Symbolicate minidump.
        m = 'chromeos/autotest/crashserver/symbolicate_dump_duration'
        with metrics.SecondsTimer(m, fields=f):
            call = self.build_call('symbolicate_dump',
                                   archive_url=_get_image_storage_server() + build)
            # Hold the minidump open only for the duration of the upload so
            # the file handle is released even when the post fails.
            with open(minidump_path, 'rb') as minidump_file:
                request = requests.post(
                        call, files={'minidump': minidump_file})
            if request.status_code == requests.codes.OK:
                return request.text

        # Any status other than OK is reported as an HTTPError that carries
        # the response text both as its message and as its readable body.
        error_fd = cStringIO.StringIO(request.text)
        raise urllib2.HTTPError(
                call, request.status_code, request.text, request.headers,
                error_fd)


    @classmethod
    def get_available_devservers(cls, hostname):
        """Get all available crash servers.

        Crash server election doesn't need to count the location of hostname.

        @param hostname: Hostname of a DUT to choose devserver for. Not used
                         by this implementation.

        @return: A tuple of (all crash servers, False). can_retry is set to
                 False, as all crash servers are returned. There is no point to
                 retry.
        """
        return cls.servers(), False
837
838
839class ImageServerBase(DevServer):
840    """Base class for devservers used to stage builds.
841
842    CrOS and Android builds are staged in different ways as they have different
843    sets of artifacts. This base class abstracts the shared functions between
844    the two types of ImageServer.
845    """
846
847    @classmethod
848    def servers(cls):
849        """Returns a list of servers that can serve as a desired type of
850        devserver.
851        """
852        return _get_dev_server_list()
853
854
855    def _get_image_url(self, image):
856        """Returns the url of the directory for this image on the devserver.
857
858        @param image: the image that was fetched.
859        """
860        image = self.translate(image)
861        url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
862                                              type=str)
863        return (url_pattern % (self.url(), image)).replace('update', 'static')
864
865
866    @staticmethod
867    def create_metadata(server_name, image, artifacts=None, files=None):
868        """Create a metadata dictionary given the staged items.
869
870        The metadata can be send to metadata db along with stats.
871
872        @param server_name: name of the devserver, e.g 172.22.33.44.
873        @param image: The name of the image.
874        @param artifacts: A list of artifacts.
875        @param files: A list of files.
876
877        @return A metadata dictionary.
878
879        """
880        metadata = {'devserver': server_name,
881                    'image': image,
882                    '_type': 'devserver'}
883        if artifacts:
884            metadata['artifacts'] = ' '.join(artifacts)
885        if files:
886            metadata['files'] = ' '.join(files)
887        return metadata
888
889
890    @classmethod
891    def run_ssh_call(cls, call, readline=False, timeout=None):
892        """Construct an ssh-based rpc call, and execute it.
893
894        @param call: a url string that calls a method to a devserver.
895        @param readline: whether read http response line by line.
896        @param timeout: The timeout seconds for ssh call.
897
898        @return the results of this call.
899        """
900        hostname = get_hostname(call)
901        ssh_call = 'ssh %s \'curl "%s"\'' % (hostname, utils.sh_escape(call))
902        timeout_seconds = timeout if timeout else DEVSERVER_SSH_TIMEOUT_MINS*60
903        try:
904            result = utils.run(ssh_call, timeout=timeout_seconds)
905        except error.CmdError as e:
906            logging.debug('Error occurred with exit_code %d when executing the '
907                          'ssh call: %s.', e.result_obj.exit_status,
908                          e.result_obj.stderr)
909            c = metrics.Counter('chromeos/autotest/devserver/ssh_failure')
910            c.increment(fields={'dev_server': hostname})
911            raise
912        response = result.stdout
913
914        # If the curl command's returned HTTP response contains certain
915        # exception string, raise the DevServerException of the response.
916        if 'DownloaderException' in response:
917            raise DevServerException(_strip_http_message(response))
918
919        if readline:
920            # Remove line terminators and trailing whitespace
921            response = response.splitlines()
922            return [line.rstrip() for line in response]
923
924        return response
925
926
927    @classmethod
928    def run_call(cls, call, readline=False, timeout=None):
929        """Invoke a given devserver call using urllib.open or ssh.
930
931        Open the URL with HTTP or SSH-based HTTP, and return the text of the
932        response. Exceptions may be raised as for urllib2.urlopen() or
933        utils.run().
934
935        @param call: a url string that calls a method to a devserver.
936        @param readline: whether read http response line by line.
937        @param timeout: The timeout seconds for urlopen call or ssh call.
938
939        @return the results of this call.
940        """
941        if not ENABLE_SSH_CONNECTION_FOR_DEVSERVER:
942            return super(ImageServerBase, cls).run_call(
943                    call, readline=readline, timeout=timeout)
944        else:
945            return cls.run_ssh_call(
946                    call, readline=readline, timeout=timeout)
947
948
949    @classmethod
950    def download_file(cls, remote_file, local_file, timeout=None):
951        """Download file from devserver.
952
953        The format of remote_file should be:
954            http://devserver_ip:8082/static/board/...
955
956        @param remote_file: The URL of the file on devserver that need to be
957            downloaded.
958        @param local_file: The path of the file saved to local.
959        @param timeout: The timeout seconds for this call.
960        """
961        response = cls.run_call(remote_file, timeout=timeout)
962        with open(local_file, 'w') as out_log:
963            out_log.write(response)
964
965
966    def _poll_is_staged(self, **kwargs):
967        """Polling devserver.is_staged until all artifacts are staged.
968
969        @param kwargs: keyword arguments to make is_staged devserver call.
970
971        @return: True if all artifacts are staged in devserver.
972        """
973        call = self.build_call('is_staged', **kwargs)
974
975        def all_staged():
976            """Call devserver.is_staged rpc to check if all files are staged.
977
978            @return: True if all artifacts are staged in devserver. False
979                     otherwise.
980            @rasies DevServerException, the exception is a wrapper of all
981                    exceptions that were raised when devserver tried to download
982                    the artifacts. devserver raises an HTTPError or a CmdError
983                    when an exception was raised in the code. Such exception
984                    should be re-raised here to stop the caller from waiting.
985                    If the call to devserver failed for connection issue, a
986                    URLError exception is raised, and caller should retry the
987                    call to avoid such network flakiness.
988
989            """
990            try:
991                result = self.run_call(call)
992                logging.debug('whether artifact is staged: %r', result)
993                return result == 'True'
994            except urllib2.HTTPError as e:
995                error_markup = e.read()
996                raise DevServerException(_strip_http_message(error_markup))
997            except urllib2.URLError as e:
998                # Could be connection issue, retry it.
999                # For example: <urlopen error [Errno 111] Connection refused>
1000                logging.error('URLError happens in is_stage: %r', e)
1001                return False
1002            except error.CmdError as e:
1003                # Retry if SSH failed to connect to the devserver.
1004                logging.warning('CmdError happens in is_stage: %r, will retry', e)
1005                return False
1006
1007        bin_utils.poll_for_condition(
1008                all_staged,
1009                exception=bin_utils.TimeoutError(),
1010                timeout=DEVSERVER_IS_STAGING_RETRY_MIN * 60,
1011                sleep_interval=_ARTIFACT_STAGE_POLLING_INTERVAL)
1012
1013        return True
1014
1015
1016    def _call_and_wait(self, call_name, error_message,
1017                       expected_response=SUCCESS, **kwargs):
1018        """Helper method to make a urlopen call, and wait for artifacts staged.
1019
1020        @param call_name: name of devserver rpc call.
1021        @param error_message: Error message to be thrown if response does not
1022                              match expected_response.
1023        @param expected_response: Expected response from rpc, default to
1024                                  |Success|. If it's set to None, do not compare
1025                                  the actual response. Any response is consider
1026                                  to be good.
1027        @param kwargs: keyword arguments to make is_staged devserver call.
1028
1029        @return: The response from rpc.
1030        @raise DevServerException upon any return code that's expected_response.
1031
1032        """
1033        call = self.build_call(call_name, async=True, **kwargs)
1034        try:
1035            response = self.run_call(call)
1036            logging.debug('response for RPC: %r', response)
1037            if ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE in response:
1038                logging.debug('Proxy error happens in RPC call, '
1039                              'will retry in 30 seconds')
1040                time.sleep(30)
1041                raise DevServerOverloadException()
1042        except httplib.BadStatusLine as e:
1043            logging.error(e)
1044            raise DevServerException('Received Bad Status line, Devserver %s '
1045                                     'might have gone down while handling '
1046                                     'the call: %s' % (self.url(), call))
1047
1048        if expected_response and not response == expected_response:
1049                raise DevServerException(error_message)
1050
1051        # `os_type` is needed in build a devserver call, but not needed for
1052        # wait_for_artifacts_staged, since that method is implemented by
1053        # each ImageServerBase child class.
1054        if 'os_type' in kwargs:
1055            del kwargs['os_type']
1056        self.wait_for_artifacts_staged(**kwargs)
1057        return response
1058
1059
1060    def _stage_artifacts(self, build, artifacts, files, archive_url, **kwargs):
1061        """Tell the devserver to download and stage |artifacts| from |image|
1062        specified by kwargs.
1063
1064        This is the main call point for staging any specific artifacts for a
1065        given build. To see the list of artifacts one can stage see:
1066
1067        ~src/platfrom/dev/artifact_info.py.
1068
1069        This is maintained along with the actual devserver code.
1070
1071        @param artifacts: A list of artifacts.
1072        @param files: A list of files to stage.
1073        @param archive_url: Optional parameter that has the archive_url to stage
1074                this artifact from. Default is specified in autotest config +
1075                image.
1076        @param kwargs: keyword arguments that specify the build information, to
1077                make stage devserver call.
1078
1079        @raise DevServerException upon any return code that's not HTTP OK.
1080        """
1081        if not archive_url:
1082            archive_url = _get_storage_server_for_artifacts(artifacts) + build
1083
1084        artifacts_arg = ','.join(artifacts) if artifacts else ''
1085        files_arg = ','.join(files) if files else ''
1086        error_message = ("staging %s for %s failed;"
1087                         "HTTP OK not accompanied by 'Success'." %
1088                         ('artifacts=%s files=%s ' % (artifacts_arg, files_arg),
1089                          build))
1090
1091        staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' %
1092                        (build, artifacts, files, archive_url))
1093        logging.info('Staging artifacts on devserver %s: %s',
1094                     self.url(), staging_info)
1095        success = False
1096        try:
1097            arguments = {'archive_url': archive_url,
1098                         'artifacts': artifacts_arg,
1099                         'files': files_arg}
1100            if kwargs:
1101                arguments.update(kwargs)
1102            # TODO(akeshet): canonicalize artifacts_arg before using it as a
1103            # metric field (as it stands it is a not-very-well-controlled
1104            # string).
1105            f = {'artifacts': artifacts_arg,
1106                 'dev_server': self.resolved_hostname}
1107            with metrics.SecondsTimer(
1108                    'chromeos/autotest/devserver/stage_artifact_duration',
1109                    fields=f):
1110                self.call_and_wait(call_name='stage', error_message=error_message,
1111                                   **arguments)
1112            logging.info('Finished staging artifacts: %s', staging_info)
1113            success = True
1114        except (bin_utils.TimeoutError, error.TimeoutException):
1115            logging.error('stage_artifacts timed out: %s', staging_info)
1116            raise DevServerException(
1117                    'stage_artifacts timed out: %s' % staging_info)
1118        finally:
1119            f = {'success': success,
1120                 'artifacts': artifacts_arg,
1121                 'dev_server': self.resolved_hostname}
1122            metrics.Counter('chromeos/autotest/devserver/stage_artifact'
1123                            ).increment(fields=f)
1124
1125
1126    def call_and_wait(self, *args, **kwargs):
1127        """Helper method to make a urlopen call, and wait for artifacts staged.
1128
1129        This method needs to be overridden in the subclass to implement the
1130        logic to call _call_and_wait.
1131        """
1132        raise NotImplementedError
1133
1134
1135    def _trigger_download(self, build, artifacts, files, synchronous=True,
1136                          **kwargs_build_info):
1137        """Tell the devserver to download and stage image specified in
1138        kwargs_build_info.
1139
1140        Tells the devserver to fetch |image| from the image storage server
1141        named by _get_image_storage_server().
1142
1143        If |synchronous| is True, waits for the entire download to finish
1144        staging before returning. Otherwise only the artifacts necessary
1145        to start installing images onto DUT's will be staged before returning.
1146        A caller can then call finish_download to guarantee the rest of the
1147        artifacts have finished staging.
1148
1149        @param synchronous: if True, waits until all components of the image are
1150               staged before returning.
1151        @param kwargs_build_info: Dictionary of build information.
1152                For CrOS, it is None as build is the CrOS image name.
1153                For Android, it is {'target': target,
1154                                    'build_id': build_id,
1155                                    'branch': branch}
1156
1157        @raise DevServerException upon any return code that's not HTTP OK.
1158
1159        """
1160        if kwargs_build_info:
1161            archive_url = None
1162        else:
1163            archive_url = _get_image_storage_server() + build
1164        error_message = ("trigger_download for %s failed;"
1165                         "HTTP OK not accompanied by 'Success'." % build)
1166        kwargs = {'archive_url': archive_url,
1167                  'artifacts': artifacts,
1168                  'files': files,
1169                  'error_message': error_message}
1170        if kwargs_build_info:
1171            kwargs.update(kwargs_build_info)
1172
1173        logging.info('trigger_download starts for %s', build)
1174        try:
1175            response = self.call_and_wait(call_name='stage', **kwargs)
1176            logging.info('trigger_download finishes for %s', build)
1177        except (bin_utils.TimeoutError, error.TimeoutException):
1178            logging.error('trigger_download timed out for %s.', build)
1179            raise DevServerException(
1180                    'trigger_download timed out for %s.' % build)
1181        was_successful = response == SUCCESS
1182        if was_successful and synchronous:
1183            self._finish_download(build, artifacts, files, **kwargs_build_info)
1184
1185
1186    def _finish_download(self, build, artifacts, files, **kwargs_build_info):
1187        """Tell the devserver to finish staging image specified in
1188        kwargs_build_info.
1189
1190        If trigger_download is called with synchronous=False, it will return
1191        before all artifacts have been staged. This method contacts the
1192        devserver and blocks until all staging is completed and should be
1193        called after a call to trigger_download.
1194
1195        @param kwargs_build_info: Dictionary of build information.
1196                For CrOS, it is None as build is the CrOS image name.
1197                For Android, it is {'target': target,
1198                                    'build_id': build_id,
1199                                    'branch': branch}
1200
1201        @raise DevServerException upon any return code that's not HTTP OK.
1202        """
1203        archive_url = _get_image_storage_server() + build
1204        error_message = ("finish_download for %s failed;"
1205                         "HTTP OK not accompanied by 'Success'." % build)
1206        kwargs = {'archive_url': archive_url,
1207                  'artifacts': artifacts,
1208                  'files': files,
1209                  'error_message': error_message}
1210        if kwargs_build_info:
1211            kwargs.update(kwargs_build_info)
1212        try:
1213            self.call_and_wait(call_name='stage', **kwargs)
1214        except (bin_utils.TimeoutError, error.TimeoutException):
1215            logging.error('finish_download timed out for %s', build)
1216            raise DevServerException(
1217                    'finish_download timed out for %s.' % build)
1218
1219
1220    @remote_devserver_call()
1221    def locate_file(self, file_name, artifacts, build, build_info):
1222        """Locate a file with the given file_name on devserver.
1223
1224        This method calls devserver RPC `locate_file` to look up a file with
1225        the given file name inside specified build artifacts.
1226
1227        @param file_name: Name of the file to look for a file.
1228        @param artifacts: A list of artifact names to search for the file.
1229        @param build: Name of the build. For Android, it's None as build_info
1230                should be used.
1231        @param build_info: Dictionary of build information.
1232                For CrOS, it is None as build is the CrOS image name.
1233                For Android, it is {'target': target,
1234                                    'build_id': build_id,
1235                                    'branch': branch}
1236
1237        @return: A devserver url to the file.
1238        @raise DevServerException upon any return code that's not HTTP OK.
1239        """
1240        if not build and not build_info:
1241            raise DevServerException('You must specify build information to '
1242                                     'look for file %s in artifacts %s.' %
1243                                     (file_name, artifacts))
1244        kwargs = {'file_name': file_name,
1245                  'artifacts': artifacts}
1246        if build_info:
1247            build_path = '%(branch)s/%(target)s/%(build_id)s' % build_info
1248            kwargs.update(build_info)
1249            # Devserver treats Android and Brillo build in the same way as they
1250            # are both retrieved from Launch Control and have similar build
1251            # artifacts. Therefore, os_type for devserver calls is `android` for
1252            # both Android and Brillo builds.
1253            kwargs['os_type'] = 'android'
1254        else:
1255            build_path = build
1256            kwargs['build'] = build
1257        call = self.build_call('locate_file', async=False, **kwargs)
1258        try:
1259            file_path = self.run_call(call)
1260            return os.path.join(self.url(), 'static', build_path, file_path)
1261        except httplib.BadStatusLine as e:
1262            logging.error(e)
1263            raise DevServerException('Received Bad Status line, Devserver %s '
1264                                     'might have gone down while handling '
1265                                     'the call: %s' % (self.url(), call))
1266
1267
1268    @remote_devserver_call()
1269    def list_control_files(self, build, suite_name=''):
1270        """Ask the devserver to list all control files for |build|.
1271
1272        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
1273                      whose control files the caller wants listed.
1274        @param suite_name: The name of the suite for which we require control
1275                           files.
1276        @return None on failure, or a list of control file paths
1277                (e.g. server/site_tests/autoupdate/control)
1278        @raise DevServerException upon any return code that's not HTTP OK.
1279        """
1280        build = self.translate(build)
1281        call = self.build_call('controlfiles', build=build,
1282                               suite_name=suite_name)
1283        return self.run_call(call, readline=True)
1284
1285
1286    @remote_devserver_call()
1287    def get_control_file(self, build, control_path):
1288        """Ask the devserver for the contents of a control file.
1289
1290        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
1291                      whose control file the caller wants to fetch.
1292        @param control_path: The file to fetch
1293                             (e.g. server/site_tests/autoupdate/control)
1294        @return The contents of the desired file.
1295        @raise DevServerException upon any return code that's not HTTP OK.
1296        """
1297        build = self.translate(build)
1298        call = self.build_call('controlfiles', build=build,
1299                               control_path=control_path)
1300        return self.run_call(call)
1301
1302
1303    @remote_devserver_call()
1304    def list_suite_controls(self, build, suite_name=''):
1305        """Ask the devserver to list contents of all control files for |build|.
1306
1307        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
1308                      whose control files' contents the caller wants returned.
1309        @param suite_name: The name of the suite for which we require control
1310                           files.
1311        @return None on failure, or a dict of contents of all control files
1312            (e.g. {'path1': "#Copyright controls ***", ...,
1313                pathX': "#Copyright controls ***"}
1314        @raise DevServerException upon any return code that's not HTTP OK.
1315        """
1316        build = self.translate(build)
1317        call = self.build_call('list_suite_controls', build=build,
1318                               suite_name=suite_name)
1319        return json.load(cStringIO.StringIO(self.run_call(call)))
1320
1321
1322class ImageServer(ImageServerBase):
1323    """Class for DevServer that handles RPCs related to CrOS images.
1324
1325    The calls to devserver to stage artifacts, including stage and download, are
1326    made in async mode. That is, when caller makes an RPC |stage| to request
1327    devserver to stage certain artifacts, devserver handles the call and starts
1328    staging artifacts in a new thread, and return |Success| without waiting for
1329    staging being completed. When caller receives message |Success|, it polls
1330    devserver's is_staged call until all artifacts are staged.
    This mechanism is designed to keep devserver from running out of cherrypy
    threads: staging artifacts might take a long time, and cherrypy starts
    with a fixed number of threads that handle devserver RPCs.
1334    """
1335
1336    class ArtifactUrls(object):
1337        """A container for URLs of staged artifacts.
1338
1339        Attributes:
1340            full_payload: URL for downloading a staged full release update
1341            mton_payload: URL for downloading a staged M-to-N release update
1342            nton_payload: URL for downloading a staged N-to-N release update
1343
1344        """
1345        def __init__(self, full_payload=None, mton_payload=None,
1346                     nton_payload=None):
1347            self.full_payload = full_payload
1348            self.mton_payload = mton_payload
1349            self.nton_payload = nton_payload
1350
1351
1352    def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''):
1353        """Polling devserver.is_staged until all artifacts are staged.
1354
1355        @param archive_url: Google Storage URL for the build.
1356        @param artifacts: Comma separated list of artifacts to download.
1357        @param files: Comma separated list of files to download.
1358        @return: True if all artifacts are staged in devserver.
1359        """
1360        kwargs = {'archive_url': archive_url,
1361                  'artifacts': artifacts,
1362                  'files': files}
1363        return self._poll_is_staged(**kwargs)
1364
1365
1366    @remote_devserver_call()
1367    def call_and_wait(self, call_name, archive_url, artifacts, files,
1368                      error_message, expected_response=SUCCESS):
1369        """Helper method to make a urlopen call, and wait for artifacts staged.
1370
1371        @param call_name: name of devserver rpc call.
1372        @param archive_url: Google Storage URL for the build..
1373        @param artifacts: Comma separated list of artifacts to download.
1374        @param files: Comma separated list of files to download.
1375        @param expected_response: Expected response from rpc, default to
1376                                  |Success|. If it's set to None, do not compare
1377                                  the actual response. Any response is consider
1378                                  to be good.
1379        @param error_message: Error message to be thrown if response does not
1380                              match expected_response.
1381
1382        @return: The response from rpc.
1383        @raise DevServerException upon any return code that's expected_response.
1384
1385        """
1386        kwargs = {'archive_url': archive_url,
1387                  'artifacts': artifacts,
1388                  'files': files}
1389        return self._call_and_wait(call_name, error_message,
1390                                   expected_response, **kwargs)
1391
1392
1393    @remote_devserver_call()
1394    def stage_artifacts(self, image=None, artifacts=None, files='',
1395                        archive_url=None):
1396        """Tell the devserver to download and stage |artifacts| from |image|.
1397
1398         This is the main call point for staging any specific artifacts for a
1399        given build. To see the list of artifacts one can stage see:
1400
1401        ~src/platfrom/dev/artifact_info.py.
1402
1403        This is maintained along with the actual devserver code.
1404
1405        @param image: the image to fetch and stage.
1406        @param artifacts: A list of artifacts.
1407        @param files: A list of files to stage.
1408        @param archive_url: Optional parameter that has the archive_url to stage
1409                this artifact from. Default is specified in autotest config +
1410                image.
1411
1412        @raise DevServerException upon any return code that's not HTTP OK.
1413        """
1414        if not artifacts and not files:
1415            raise DevServerException('Must specify something to stage.')
1416        image = self.translate(image)
1417        self._stage_artifacts(image, artifacts, files, archive_url)
1418
1419
1420    @remote_devserver_call(timeout_min=DEVSERVER_SSH_TIMEOUT_MINS)
1421    def list_image_dir(self, image):
1422        """List the contents of the image stage directory, on the devserver.
1423
1424        @param image: The image name, eg: <board>-<branch>/<Milestone>-<build>.
1425
1426        @raise DevServerException upon any return code that's not HTTP OK.
1427        """
1428        image = self.translate(image)
1429        logging.info('Requesting contents from devserver %s for image %s',
1430                     self.url(), image)
1431        archive_url = _get_storage_server_for_artifacts() + image
1432        call = self.build_call('list_image_dir', archive_url=archive_url)
1433        response = self.run_call(call, readline=True)
1434        for line in response:
1435            logging.info(line)
1436
1437
1438    def trigger_download(self, image, synchronous=True):
1439        """Tell the devserver to download and stage |image|.
1440
1441        Tells the devserver to fetch |image| from the image storage server
1442        named by _get_image_storage_server().
1443
1444        If |synchronous| is True, waits for the entire download to finish
1445        staging before returning. Otherwise only the artifacts necessary
1446        to start installing images onto DUT's will be staged before returning.
1447        A caller can then call finish_download to guarantee the rest of the
1448        artifacts have finished staging.
1449
1450        @param image: the image to fetch and stage.
1451        @param synchronous: if True, waits until all components of the image are
1452               staged before returning.
1453
1454        @raise DevServerException upon any return code that's not HTTP OK.
1455
1456        """
1457        image = self.translate(image)
1458        artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE
1459        self._trigger_download(image, artifacts, files='',
1460                               synchronous=synchronous)
1461
1462
1463    @remote_devserver_call()
1464    def setup_telemetry(self, build):
1465        """Tell the devserver to setup telemetry for this build.
1466
1467        The devserver will stage autotest and then extract the required files
1468        for telemetry.
1469
1470        @param build: the build to setup telemetry for.
1471
1472        @returns path on the devserver that telemetry is installed to.
1473        """
1474        build = self.translate(build)
1475        archive_url = _get_image_storage_server() + build
1476        call = self.build_call('setup_telemetry', archive_url=archive_url)
1477        try:
1478            response = self.run_call(call)
1479        except httplib.BadStatusLine as e:
1480            logging.error(e)
1481            raise DevServerException('Received Bad Status line, Devserver %s '
1482                                     'might have gone down while handling '
1483                                     'the call: %s' % (self.url(), call))
1484        return response
1485
1486
1487    def finish_download(self, image):
1488        """Tell the devserver to finish staging |image|.
1489
1490        If trigger_download is called with synchronous=False, it will return
1491        before all artifacts have been staged. This method contacts the
1492        devserver and blocks until all staging is completed and should be
1493        called after a call to trigger_download.
1494
1495        @param image: the image to fetch and stage.
1496        @raise DevServerException upon any return code that's not HTTP OK.
1497        """
1498        image = self.translate(image)
1499        artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST
1500        self._finish_download(image, artifacts, files='')
1501
1502
1503    def get_update_url(self, image):
1504        """Returns the url that should be passed to the updater.
1505
1506        @param image: the image that was fetched.
1507        """
1508        image = self.translate(image)
1509        url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
1510                                              type=str)
1511        return (url_pattern % (self.url(), image))
1512
1513
1514    def get_staged_file_url(self, filename, image):
1515        """Returns the url of a staged file for this image on the devserver."""
1516        return '/'.join([self._get_image_url(image), filename])
1517
1518
1519    def get_full_payload_url(self, image):
1520        """Returns a URL to a staged full payload.
1521
1522        @param image: the image that was fetched.
1523
1524        @return A fully qualified URL that can be used for downloading the
1525                payload.
1526
1527        """
1528        return self._get_image_url(image) + '/update.gz'
1529
1530
1531    def get_test_image_url(self, image):
1532        """Returns a URL to a staged test image.
1533
1534        @param image: the image that was fetched.
1535
1536        @return A fully qualified URL that can be used for downloading the
1537                image.
1538
1539        """
1540        return self._get_image_url(image) + '/chromiumos_test_image.bin'
1541
1542
1543    @remote_devserver_call()
1544    def get_dependencies_file(self, build):
1545        """Ask the dev server for the contents of the suite dependencies file.
1546
1547        Ask the dev server at |self._dev_server| for the contents of the
1548        pre-processed suite dependencies file (at DEPENDENCIES_FILE)
1549        for |build|.
1550
1551        @param build: The build (e.g. x86-mario-release/R21-2333.0.0)
1552                      whose dependencies the caller is interested in.
1553        @return The contents of the dependencies file, which should eval to
1554                a dict of dicts, as per bin_utils/suite_preprocessor.py.
1555        @raise DevServerException upon any return code that's not HTTP OK.
1556        """
1557        build = self.translate(build)
1558        call = self.build_call('controlfiles',
1559                               build=build, control_path=DEPENDENCIES_FILE)
1560        return self.run_call(call)
1561
1562
1563    @remote_devserver_call()
1564    def get_latest_build_in_gs(self, board):
1565        """Ask the devservers for the latest offical build in Google Storage.
1566
1567        @param board: The board for who we want the latest official build.
1568        @return A string of the returned build rambi-release/R37-5868.0.0
1569        @raise DevServerException upon any return code that's not HTTP OK.
1570        """
1571        call = self.build_call(
1572                'xbuddy_translate/remote/%s/latest-official' % board,
1573                image_dir=_get_image_storage_server())
1574        image_name = self.run_call(call)
1575        return os.path.dirname(image_name)
1576
1577
1578    def translate(self, build_name):
1579        """Translate the build name if it's in LATEST format.
1580
1581        If the build name is in the format [builder]/LATEST, return the latest
1582        build in Google Storage otherwise return the build name as is.
1583
1584        @param build_name: build_name to check.
1585
1586        @return The actual build name to use.
1587        """
1588        match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I)
1589        if not match:
1590            return build_name
1591        translated_build = self.get_latest_build_in_gs(match.groups()[0])
1592        logging.debug('Translated relative build %s to %s', build_name,
1593                      translated_build)
1594        return translated_build
1595
1596
1597    @classmethod
1598    @remote_devserver_call()
1599    def get_latest_build(cls, target, milestone=''):
1600        """Ask all the devservers for the latest build for a given target.
1601
1602        @param target: The build target, typically a combination of the board
1603                       and the type of build e.g. x86-mario-release.
1604        @param milestone:  For latest build set to '', for builds only in a
1605                           specific milestone set to a str of format Rxx
1606                           (e.g. R16). Default: ''. Since we are dealing with a
1607                           webserver sending an empty string, '', ensures that
1608                           the variable in the URL is ignored as if it was set
1609                           to None.
1610        @return A string of the returned build e.g. R20-2226.0.0.
1611        @raise DevServerException upon any return code that's not HTTP OK.
1612        """
1613        calls = cls.build_all_calls('latestbuild', target=target,
1614                                    milestone=milestone)
1615        latest_builds = []
1616        for call in calls:
1617            latest_builds.append(cls.run_call(call))
1618
1619        return max(latest_builds, key=version.LooseVersion)
1620
1621
1622    @remote_devserver_call()
1623    def _kill_au_process_for_host(self, **kwargs):
1624        """Kill the triggerred auto_update process if error happens in cros_au.
1625
1626        @param kwargs: Arguments to make kill_au_proc devserver call.
1627        """
1628        call = self.build_call('kill_au_proc', **kwargs)
1629        response = self.run_call(call)
1630        if not response == 'True':
1631            raise DevServerException(
1632                    'Failed to kill the triggerred CrOS auto_update process'
1633                    'on devserver %s, the response is %s' % (
1634                            self.url(), response))
1635
1636
1637    def kill_au_process_for_host(self, host_name, pid):
1638        """Kill the triggerred auto_update process if error happens.
1639
1640        Usually this function is used to clear all potential left au processes
1641        of the given host name.
1642
1643        If pid is specified, the devserver will further check the given pid to
1644        make sure the process is killed. This is used for the case that the au
1645        process has started in background, but then provision fails due to
1646        some unknown issues very fast. In this case, when 'kill_au_proc' is
1647        called, there's no corresponding background track log created for this
1648        ongoing au process, which prevents this RPC call from killing this au
1649        process.
1650
1651        @param host_name: The DUT's hostname.
1652        @param pid: The ongoing au process's pid.
1653
1654        @return: True if successfully kill the auto-update process for host.
1655        """
1656        kwargs = {'host_name': host_name, 'pid': pid}
1657        try:
1658            self._kill_au_process_for_host(**kwargs)
1659        except DevServerException:
1660            return False
1661
1662        return True
1663
1664
1665    @remote_devserver_call()
1666    def _clean_track_log(self, **kwargs):
1667        """Clean track log for the current auto-update process."""
1668        call = self.build_call('handler_cleanup', **kwargs)
1669        self.run_call(call)
1670
1671
1672    def clean_track_log(self, host_name, pid):
1673        """Clean track log for the current auto-update process.
1674
1675        @param host_name: The host name to be updated.
1676        @param pid: The auto-update process id.
1677
1678        @return: True if track log is successfully cleaned, False otherwise.
1679        """
1680        if not pid:
1681            return False
1682
1683        kwargs = {'host_name': host_name, 'pid': pid}
1684        try:
1685            self._clean_track_log(**kwargs)
1686        except DevServerException as e:
1687            logging.debug('Failed to clean track_status_file on '
1688                          'devserver for host %s and process id %s: %s',
1689                          host_name, pid, str(e))
1690            return False
1691
1692        return True
1693
1694
1695    def _get_au_log_filename(self, log_dir, host_name, pid):
1696        """Return the auto-update log's filename."""
1697        return os.path.join(log_dir, CROS_AU_LOG_FILENAME % (
1698                    host_name, pid))
1699
1700    @remote_devserver_call()
1701    def _collect_au_log(self, log_dir, **kwargs):
1702        """Collect logs from devserver after cros-update process is finished.
1703
1704        Collect the logs that recording the whole cros-update process, and
1705        write it to sysinfo path of a job.
1706
1707        The example log file name that is stored is like:
1708            '1220-repair/sysinfo/CrOS_update_host_name_pid.log'
1709
1710        @param host_name: the DUT's hostname.
1711        @param pid: the auto-update process id on devserver.
1712        @param log_dir: The directory to save the cros-update process log
1713                        retrieved from devserver.
1714        """
1715        call = self.build_call('collect_cros_au_log', **kwargs)
1716        response = self.run_call(call)
1717        if not os.path.exists(log_dir):
1718            os.mkdir(log_dir)
1719        write_file = self._get_au_log_filename(
1720                log_dir, kwargs['host_name'], kwargs['pid'])
1721        logging.debug('Saving auto-update logs into %s', write_file)
1722        try:
1723            with open(write_file, 'w') as out_log:
1724                out_log.write(response)
1725        except:
1726            raise DevServerException('Failed to write auto-update logs into '
1727                                     '%s' % write_file)
1728
1729
1730    def collect_au_log(self, host_name, pid, log_dir):
1731        """Collect logs from devserver after cros-update process is finished.
1732
1733        @param host_name: the DUT's hostname.
1734        @param pid: the auto-update process id on devserver.
1735        @param log_dir: The directory to save the cros-update process log
1736                        retrieved from devserver.
1737
1738        @return: True if auto-update log is successfully collected, False
1739          otherwise.
1740        """
1741        if not pid:
1742            return False
1743
1744        kwargs = {'host_name': host_name, 'pid': pid}
1745        try:
1746            self._collect_au_log(log_dir, **kwargs)
1747        except DevServerException as e:
1748            logging.debug('Failed to collect auto-update log on '
1749                          'devserver for host %s and process id %s: %s',
1750                          host_name, pid, str(e))
1751            return False
1752
1753        return True
1754
1755
1756    @remote_devserver_call()
1757    def _trigger_auto_update(self, **kwargs):
1758        """Trigger auto-update by calling devserver.cros_au.
1759
1760        @param kwargs:  Arguments to make cros_au devserver call.
1761
1762        @return: a tuple indicates whether the RPC call cros_au succeeds and
1763          the auto-update process id running on devserver.
1764        """
1765        host_name = kwargs['host_name']
1766        call = self.build_call('cros_au', async=True, **kwargs)
1767        try:
1768            response = self.run_call(call)
1769            logging.info(
1770                'Received response from devserver for cros_au call: %r',
1771                response)
1772        except httplib.BadStatusLine as e:
1773            logging.error(e)
1774            raise DevServerException('Received Bad Status line, Devserver %s '
1775                                     'might have gone down while handling '
1776                                     'the call: %s' % (self.url(), call))
1777
1778        return response
1779
1780
1781    def _wait_for_auto_update_finished(self, pid, **kwargs):
1782        """Polling devserver.get_au_status to get current auto-update status.
1783
1784        The current auto-update status is used to identify whether the update
1785        process is finished.
1786
1787        @param pid:    The background process id for auto-update in devserver.
1788        @param kwargs: keyword arguments to make get_au_status devserver call.
1789
1790        @return: True if auto-update is finished for a given dut.
1791        """
1792        logging.debug('Check the progress for auto-update process %r', pid)
1793        kwargs['pid'] = pid
1794        call = self.build_call('get_au_status', **kwargs)
1795
1796        def all_finished():
1797            """Call devserver.get_au_status rpc to check if auto-update
1798               is finished.
1799
1800            @return: True if auto-update is finished for a given dut. False
1801                     otherwise.
1802            @rasies  DevServerException, the exception is a wrapper of all
1803                     exceptions that were raised when devserver tried to
1804                     download the artifacts. devserver raises an HTTPError or
1805                     a CmdError when an exception was raised in the code. Such
1806                     exception should be re-raised here to stop the caller from
1807                     waiting. If the call to devserver failed for connection
1808                     issue, a URLError exception is raised, and caller should
1809                     retry the call to avoid such network flakiness.
1810
1811            """
1812            try:
1813                au_status = self.run_call(call)
1814                response = json.loads(au_status)
1815                # This is a temp fix to fit both dict and tuple returning
1816                # values. The dict check will be removed after a corresponding
1817                # devserver CL is deployed.
1818                if isinstance(response, dict):
1819                    if response.get('detailed_error_msg'):
1820                        raise DevServerException(
1821                                response.get('detailed_error_msg'))
1822
1823                    if response.get('finished'):
1824                        logging.debug('CrOS auto-update is finished')
1825                        return True
1826                    else:
1827                        logging.debug('Current CrOS auto-update status: %s',
1828                                      response.get('status'))
1829                        return False
1830
1831                if not response[0]:
1832                    logging.debug('Current CrOS auto-update status: %s',
1833                                  response[1])
1834                    return False
1835                else:
1836                    logging.debug('CrOS auto-update is finished')
1837                    return True
1838            except urllib2.HTTPError as e:
1839                error_markup = e.read()
1840                raise DevServerException(_strip_http_message(error_markup))
1841            except urllib2.URLError as e:
1842                # Could be connection issue, retry it.
1843                # For example: <urlopen error [Errno 111] Connection refused>
1844                logging.warning('URLError (%r): Retrying connection to '
1845                                'devserver to check auto-update status.', e)
1846                return False
1847            except error.CmdError:
1848                # Retry if SSH failed to connect to the devserver.
1849                logging.warning('CmdError: Retrying SSH connection to check '
1850                                'auto-update status.')
1851                return False
1852            except socket.error as e:
1853                # Could be some temporary devserver connection issues.
1854                logging.warning('Socket Error (%r): Retrying connection to '
1855                                'devserver to check auto-update status.', e)
1856                return False
1857            except ValueError as e:
1858                raise DevServerException(
1859                        '%s (Got AU status: %r)' % (str(e), au_status))
1860
1861        bin_utils.poll_for_condition(
1862                all_finished,
1863                exception=bin_utils.TimeoutError(),
1864                timeout=DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN * 60,
1865                sleep_interval=CROS_AU_POLLING_INTERVAL)
1866
1867        return True
1868
1869
1870    def wait_for_auto_update_finished(self, response, **kwargs):
1871        """Processing response of 'cros_au' and polling for auto-update status.
1872
1873        Will wait for the whole auto-update process is finished.
1874
1875        @param response: The response from RPC 'cros_au'
1876        @param kwargs: keyword arguments to make get_au_status devserver call.
1877
1878        @return: a tuple includes two elements.
1879          raised_error: None if everything works well or the raised error.
1880          pid: the auto-update process id on devserver.
1881        """
1882
1883        pid = 0
1884        raised_error = None
1885        try:
1886            response = json.loads(response)
1887            if response[0]:
1888                pid = response[1]
1889                logging.debug('start process %r for auto_update in devserver',
1890                              pid)
1891                self._wait_for_auto_update_finished(pid, **kwargs)
1892        except Exception as e:
1893            logging.debug('Failed to trigger auto-update process on devserver')
1894            raised_error = e
1895        finally:
1896            return raised_error, pid
1897
1898
1899    def _parse_AU_error(self, response):
1900        """Parse auto_update error returned from devserver."""
1901        return re.split('\n', response)[-1]
1902
1903
1904    def _classify_exceptions(self, error_list):
1905        """Parse the error that was raised from auto_update.
1906
1907        @param error_list: The list of errors (string) happened in auto-update
1908
1909        @return: A classified exception type (string) from _EXCEPTION_PATTERNS
1910          or 'Unknown exception'. Current patterns in _EXCEPTION_PATTERNS are
1911          very specific so that errors cannot match more than one pattern.
1912        """
1913        raised_error = ''
1914        if not error_list:
1915            return raised_error
1916        else:
1917            target_error = error_list[0]
1918
1919        for err_pattern, classification in _EXCEPTION_PATTERNS:
1920            match = re.match(err_pattern, target_error)
1921            if match:
1922                return classification
1923
1924        return '(0) Unknown exception'
1925
1926    def _is_retryable(self, error_msg):
1927        """Detect whether we will retry auto-update based on error_msg.
1928
1929        @param error_msg: The given error message.
1930
1931        @return A boolean variable which indicates whether we will retry
1932            auto_update with another devserver based on the given error_msg.
1933        """
1934        # For now we just hard-code the error message we think it's suspicious.
1935        # When we get more date about what's the json response when devserver
1936        # is overloaded, we can update this part.
1937        retryable_errors = ['No JSON object could be decoded',
1938                            'is not pingable']
1939        for err in retryable_errors:
1940            if err in error_msg:
1941                return True
1942
1943        return False
1944
1945
1946    def _parse_buildname_safely(self, build_name):
1947        """Parse a given buildname safely.
1948
1949        @param build_name: the build name to be parsed.
1950
1951        @return: a tuple (board, build_type, milestone)
1952        """
1953        try:
1954            board, build_type, milestone, _ = server_utils.ParseBuildName(
1955                    build_name)
1956        except server_utils.ParseBuildNameException:
1957            logging.warning('Unable to parse build name %s for metrics. '
1958                            'Continuing anyway.', build_name)
1959            board, build_type, milestone = ('', '', '')
1960
1961        return board, build_type, milestone
1962
1963
1964    def auto_update(self, host_name, build_name, original_board=None,
1965                    original_release_version=None, log_dir=None,
1966                    force_update=False, full_update=False):
1967        """Auto-update a CrOS host.
1968
1969        @param host_name: The hostname of the DUT to auto-update.
1970        @param build_name:  The build name to be auto-updated on the DUT.
1971        @param original_board: The original board of the DUT to auto-update.
1972        @param original_release_version: The release version of the DUT's
1973            current build.
1974        @param log_dir: The log directory to store auto-update logs from
1975            devserver.
1976        @param force_update: Force an update even if the version installed
1977                             is the same. Default: False.
1978        @param full_update:  If True, do not run stateful update, directly
1979                             force a full reimage. If False, try stateful
1980                             update first if the dut is already installed
1981                             with the same version.
1982
1983        @return A set (is_success, is_retryable) in which:
1984            1. is_success indicates whether this auto_update succeeds.
1985            2. is_retryable indicates whether we should retry auto_update if
1986               if it fails.
1987
1988        @raise DevServerException if auto_update fails and is not retryable.
1989        """
1990        kwargs = {'host_name': host_name,
1991                  'build_name': build_name,
1992                  'force_update': force_update,
1993                  'full_update': full_update}
1994
1995        error_msg = 'CrOS auto-update failed for host %s: %s'
1996        error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s'
1997        is_au_success = False
1998        au_log_dir = os.path.join(log_dir,
1999                                  AUTO_UPDATE_LOG_DIR) if log_dir else None
2000        error_list = []
2001        retry_with_another_devserver = False
2002        board, build_type, milestone = self._parse_buildname_safely(build_name)
2003
2004        for au_attempt in range(AU_RETRY_LIMIT):
2005            logging.debug('Start CrOS auto-update for host %s at %d time(s).',
2006                          host_name, au_attempt + 1)
2007            # No matter _trigger_auto_update succeeds or fails, the auto-update
2008            # track_status_file should be cleaned, and the auto-update execute
2009            # log should be collected to directory sysinfo. Also, the error
2010            # raised by _trigger_auto_update should be displayed.
2011            try:
2012                # Try update with stateful.tgz of old release version in the
2013                # last try of auto-update.
2014                if (au_attempt > 0 and au_attempt  == AU_RETRY_LIMIT - 1 and
2015                    original_release_version):
2016                    # Monitor this case in monarch
2017                    original_build = '%s/%s' % (original_board,
2018                                                original_release_version)
2019                    c = metrics.Counter(
2020                            'chromeos/autotest/provision/'
2021                            'cros_update_with_original_build')
2022                    f = {'dev_server': self.resolved_hostname,
2023                         'board': board,
2024                         'build_type': build_type,
2025                         'milestone': milestone,
2026                         'original_build': original_build}
2027                    c.increment(fields=f)
2028
2029                    logging.debug('Try updating stateful partition of the '
2030                                  'host with the same version of its current '
2031                                  'rootfs partition: %s', original_build)
2032                    response = self._trigger_auto_update(
2033                            original_build=original_build, **kwargs)
2034                else:
2035                    response = self._trigger_auto_update(**kwargs)
2036            except DevServerException as e:
2037                logging.debug(error_msg_attempt, au_attempt+1, str(e))
2038                error_list.append(str(e))
2039            else:
2040                raised_error, pid = self.wait_for_auto_update_finished(response,
2041                                                                       **kwargs)
2042                # Error happens in _clean_track_log won't be raised. Auto-update
2043                # process will be retried.
2044                # TODO(xixuan): Change kwargs['host_name'] back to host_name
2045                # if crbug.com/651974 is fixed: host_name represents the host
2046                # name of the host, and kwargs['host_name'] could be host_name
2047                # or the IP of this host.
2048                is_clean_success = self.clean_track_log(kwargs['host_name'], pid)
2049                # Error happens in _collect_au_log won't be raised. Auto-update
2050                # process will be retried.
2051                if au_log_dir:
2052                    is_collect_success = self.collect_au_log(
2053                            kwargs['host_name'], pid, au_log_dir)
2054                else:
2055                    is_collect_success = True
2056                # If any error is raised previously, log it and retry
2057                # auto-update. Otherwise, claim a success CrOS auto-update.
2058                if not raised_error and is_clean_success and is_collect_success:
2059                    logging.debug('CrOS auto-update succeed for host %s',
2060                                  host_name)
2061                    is_au_success = True
2062                    break
2063                else:
2064                    if not self.kill_au_process_for_host(kwargs['host_name'],
2065                                                         pid):
2066                        logging.debug('Failed to kill auto_update process %d',
2067                                      pid)
2068                    if raised_error:
2069                        logging.debug(error_msg_attempt, au_attempt+1,
2070                                      str(raised_error))
2071                        if au_log_dir:
2072                            logging.debug('Please see error details in log %s',
2073                                          self._get_au_log_filename(
2074                                                  au_log_dir,
2075                                                  kwargs['host_name'],
2076                                                  pid))
2077                        error_list.append(self._parse_AU_error(str(raised_error)))
2078                        if self._is_retryable(str(raised_error)):
2079                            retry_with_another_devserver = True
2080
2081            finally:
2082                if retry_with_another_devserver:
2083                    break
2084
2085                if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1:
2086                    time.sleep(CROS_AU_RETRY_INTERVAL)
2087                    # TODO(kevcheng): Remove this once crbug.com/651974 is
2088                    # fixed.
2089                    # DNS is broken in the cassandra lab, so use the IP of the
2090                    # hostname instead if it fails. Not rename host_name here
2091                    # for error msg reporting.
2092                    host_name_ip = socket.gethostbyname(host_name)
2093                    kwargs['host_name'] = host_name_ip
2094                    logging.debug(
2095                            'AU failed, trying IP instead of hostname: %s',
2096                            host_name_ip)
2097
2098        # Note: To avoid reaching or exceeding the monarch field cardinality
2099        # limit, we avoid a metric that includes both dut hostname and other
2100        # high cardinality fields.
2101        # Per-devserver cros_update metric.
2102        c = metrics.Counter(
2103                'chromeos/autotest/provision/cros_update_by_devserver')
2104        # Add a field |error| here. Current error's pattern is manually
2105        # specified in _EXCEPTION_PATTERNS.
2106        raised_error = self._classify_exceptions(error_list)
2107        f = {'dev_server': self.resolved_hostname,
2108             'success': is_au_success,
2109             'board': board,
2110             'build_type': build_type,
2111             'milestone': milestone,
2112             'error': raised_error}
2113        c.increment(fields=f)
2114
2115        # Per-DUT cros_update metric.
2116        c = metrics.Counter('chromeos/autotest/provision/cros_update_per_dut')
2117        f = {'success': is_au_success,
2118             'board': board,
2119             'error': raised_error,
2120             'dut_host_name': host_name}
2121        c.increment(fields=f)
2122
2123        if is_au_success or retry_with_another_devserver:
2124            return (is_au_success, retry_with_another_devserver)
2125
2126        # If errors happen in the CrOS AU process, report the first error
2127        # since the following errors might be caused by the first error.
2128        # If error happens in RPCs of cleaning track log, collecting
2129        # auto-update logs, or killing auto-update processes, just report
2130        # them together.
2131        if error_list:
2132            raise DevServerException(error_msg % (host_name, error_list[0]))
2133        else:
2134            raise DevServerException(error_msg % (
2135                        host_name, ('RPC calls after the whole auto-update '
2136                                    'process failed.')))
2137
2138
class AndroidBuildServer(ImageServerBase):
    """Class for DevServer that handles RPCs related to Android builds.

    The calls to devserver to stage artifacts, including stage and download, are
    made in async mode. That is, when a caller makes an RPC |stage| to request
    devserver to stage certain artifacts, devserver handles the call and starts
    staging artifacts in a new thread, and returns |Success| without waiting
    for staging to complete. When the caller receives message |Success|, it
    polls devserver's is_staged call until all artifacts are staged.
    This mechanism is designed to prevent devserver from running out of
    cherrypy threads, as staging artifacts might take a long time, and cherrypy
    starts with a fixed number of threads that handle devserver rpc.
    """

    def wait_for_artifacts_staged(self, target, build_id, branch,
                                  archive_url=None, artifacts='', files=''):
        """Polling devserver.is_staged until all artifacts are staged.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param archive_url: Google Storage URL for the build. Only forwarded
                            to the devserver when it is not empty.
        @param artifacts: Comma separated list of artifacts to download.
        @param files: Comma separated list of files to download.

        @return: True if all artifacts are staged in devserver.
        """
        kwargs = {'target': target,
                  'build_id': build_id,
                  'branch': branch,
                  'artifacts': artifacts,
                  'files': files,
                  'os_type': 'android'}
        if archive_url:
            kwargs['archive_url'] = archive_url
        return self._poll_is_staged(**kwargs)


    @remote_devserver_call()
    def call_and_wait(self, call_name, target, build_id, branch, archive_url,
                      artifacts, files, error_message,
                      expected_response=SUCCESS):
        """Helper method to make a urlopen call, and wait for artifacts staged.

        @param call_name: name of devserver rpc call.
        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param archive_url: Google Storage URL for the build. Only forwarded
                            to the devserver when it is not empty.
        @param artifacts: Comma separated list of artifacts to download.
        @param files: Comma separated list of files to download.
        @param error_message: Error message to be thrown if response does not
                              match expected_response.
        @param expected_response: Expected response from rpc, default to
                                  |Success|. If it's set to None, do not compare
                                  the actual response. Any response is
                                  considered to be good.

        @return: The response from rpc.
        @raise DevServerException upon any return code that's not
               expected_response.

        """
        kwargs = {'target': target,
                  'build_id': build_id,
                  'branch': branch,
                  'artifacts': artifacts,
                  'files': files,
                  'os_type': 'android'}
        if archive_url:
            kwargs['archive_url'] = archive_url
        return self._call_and_wait(call_name, error_message, expected_response,
                                   **kwargs)


    @remote_devserver_call()
    def stage_artifacts(self, target=None, build_id=None, branch=None,
                        image=None, artifacts=None, files='', archive_url=None):
        """Tell the devserver to download and stage |artifacts| from |image|.

        This is the main call point for staging any specific artifacts for a
        given build. To see the list of artifacts one can stage see:

        ~src/platform/dev/artifact_info.py.

        This is maintained along with the actual devserver code.

        @param target: Target of the android build to stage, e.g.,
                               shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param image: Name of a build to test, in the format of
                      branch/target/build_id. Only parsed when none of
                      target, build_id and branch is given.
        @param artifacts: A list of artifacts.
        @param files: A list of files to stage.
        @param archive_url: Optional parameter that has the archive_url to stage
                this artifact from. Default is specified in autotest config +
                image.

        @raise DevServerException if the build info is incomplete, if there
               is nothing to stage, or upon any return code that's not HTTP OK.
        """
        # |image| is only a fallback: explicit build info always wins.
        if image and not target and not build_id and not branch:
            branch, target, build_id = utils.parse_launch_control_build(image)
        if not target or not build_id or not branch:
            raise DevServerException('Must specify all build info (target, '
                                     'build_id and branch) to stage.')
        if not artifacts and not files:
            raise DevServerException('Must specify something to stage.')

        # All three values are guaranteed to be non-empty at this point, so no
        # further validation of the build info is needed.
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        self._stage_artifacts(build, artifacts, files, archive_url,
                              **android_build_info)

    def get_pull_url(self, target, build_id, branch):
        """Get the url to pull files from the devserver.

        @param target: Target of the android build, e.g., shamu_userdebug
        @param build_id: Build id of the android build.
        @param branch: Branch of the android build.

        @return A url to pull files from the dev server given a specific
                android build.
        """
        return os.path.join(self.url(), 'static', branch, target, build_id)


    def trigger_download(self, target, build_id, branch, artifacts=None,
                         files='', os='android', synchronous=True):
        """Tell the devserver to download and stage an Android build.

        Tells the devserver to fetch an Android build from the image storage
        server named by _get_image_storage_server().

        If |synchronous| is True, waits for the entire download to finish
        staging before returning. Otherwise only the artifacts necessary
        to start installing images onto DUT's will be staged before returning.
        A caller can then call finish_download to guarantee the rest of the
        artifacts have finished staging.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param artifacts: A string of artifacts separated by comma. If None,
               use the default artifacts for Android or Brillo build.
        @param files: String of files separated by commas.
        @param os: OS artifacts to download (android/brillo). Note that this
               name shadows the `os` module inside this method.
        @param synchronous: if True, waits until all components of the image are
               staged before returning.

        @raise DevServerException upon any return code that's not HTTP OK.

        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        if not artifacts:
            # The board is the part of the target before the first dash, e.g.
            # 'shamu' for 'shamu-userdebug'.
            board = target.split('-')[0]
            artifacts = (
                android_utils.AndroidArtifacts.get_artifacts_for_reimage(
                        board, os))
        self._trigger_download(build, artifacts, files=files,
                               synchronous=synchronous, **android_build_info)


    def finish_download(self, target, build_id, branch, os='android'):
        """Tell the devserver to finish staging an Android build.

        If trigger_download is called with synchronous=False, it will return
        before all artifacts have been staged. This method contacts the
        devserver and blocks until all staging is completed and should be
        called after a call to trigger_download.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param os: OS artifacts to download (android/brillo). Note that this
               name shadows the `os` module inside this method.

        @raise DevServerException upon any return code that's not HTTP OK.
        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        board = target.split('-')[0]
        # Forward |os| so that the artifact list is looked up the same way as
        # in trigger_download when no explicit artifacts are given there.
        artifacts = (
                android_utils.AndroidArtifacts.get_artifacts_for_reimage(
                        board, os))
        self._finish_download(build, artifacts, files='', **android_build_info)


    def get_staged_file_url(self, filename, target, build_id, branch):
        """Returns the url of a staged file for this image on the devserver.

        @param filename: Name of the file.
        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.

        @return: The url of a staged file for this image on the devserver.
        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch,
                              'os_type': 'android'}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        return '/'.join([self._get_image_url(build), filename])


    @remote_devserver_call()
    def translate(self, build_name):
        """Translate the build name if it's in LATEST format.

        If the build name is in the format [branch]/[target]/LATEST, return the
        latest build in Launch Control otherwise return the build name as is.
        The check for LATEST is case-insensitive.

        @param build_name: build_name to check.

        @return The actual build name to use.
        """
        branch, target, build_id = utils.parse_launch_control_build(build_name)
        if build_id.upper() != 'LATEST':
            return build_name
        # Ask the devserver for the build id that LATEST currently refers to.
        call = self.build_call('latestbuild', branch=branch, target=target,
                               os_type='android')
        translated_build_id = self.run_call(call)
        translated_build = (ANDROID_BUILD_NAME_PATTERN %
                            {'branch': branch,
                             'target': target,
                             'build_id': translated_build_id})
        logging.debug('Translated relative build %s to %s', build_name,
                      translated_build)
        return translated_build
2383
2384
def _is_load_healthy(load):
    """Tell whether a devserver's load stays within the allowed thresholds.

    The CPU load is checked first and the network IO second. The first metric
    found above its DevServer.MAX_* limit is reported in a debug log and makes
    the check fail.

    @param load: The devserver's load stats to check. The values stored under
                 DevServer.CPU_LOAD and DevServer.NETWORK_IO are compared with
                 the limits; the 'devserver' entry is only used for logging.

    @return: False if the CPU load or the network IO is higher than its
             threshold. Return True otherwise.

    """
    cpu_load = load[DevServer.CPU_LOAD]
    if cpu_load > DevServer.MAX_CPU_LOAD:
        logging.debug(
                'CPU load of devserver %s is at %s%%, which is higher '
                'than the threshold of %s%%',
                load['devserver'], cpu_load, DevServer.MAX_CPU_LOAD)
        return False

    # The network stats are only looked at once the CPU check has passed.
    network_io = load[DevServer.NETWORK_IO]
    if network_io > DevServer.MAX_NETWORK_IO:
        logging.debug(
                'Network IO of devserver %s is at %i Bps, which is '
                'higher than the threshold of %i bytes per second.',
                load['devserver'], network_io, DevServer.MAX_NETWORK_IO)
        return False

    return True
2407
2408
def _compare_load(devserver1, devserver2):
    """Comparator function ordering two devservers by their disk IO.

    Only the value stored under DevServer.DISK_IO is taken into account.

    @param devserver1: A dictionary of devserver load stats to be compared.
    @param devserver2: A dictionary of devserver load stats to be compared.

    @return: The disk IO of `devserver1` minus the disk IO of `devserver2`,
             converted to int: negative if `devserver1` has the lower disk
             IO, positive if it has the higher one, and zero when the
             difference truncates to 0.

    """
    disk_io_delta = (devserver1[DevServer.DISK_IO] -
                     devserver2[DevServer.DISK_IO])
    return int(disk_io_delta)
2420
2421
def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None):
    """Get the devserver with the least load.

    Iterate through all devservers and get the one with least load.

    TODO(crbug.com/486278): Devserver with required build already staged should
    take higher priority. This will need check_health call to be able to verify
    existence of a given build/artifact. Also, in case all devservers are
    overloaded, the logic here should fall back to the old behavior that randomly
    selects a devserver based on the hash of the image name/url.

    @param devserver_type: Type of devserver to select from. Default is set to
                           ImageServer.
    @param hostname: Hostname of the dut that the devserver is used for. The
            picked devserver needs to respect the location of the host if
            `prefer_local_devserver` is set to True or `restricted_subnets` is
            set.

    @return: Name of the devserver with the least load. None if no devserver
             can be selected, i.e., no healthy devserver is available for the
             host, no load stats could be retrieved, or every devserver is
             above the load thresholds.

    """
    devservers, can_retry = devserver_type.get_available_devservers(
            hostname)
    # If no healthy devservers available and can_retry is False, return None.
    # Otherwise, relax the constraint on hostname, allow all devservers to be
    # available.
    if not devserver_type.get_healthy_devserver('', devservers):
        if not can_retry:
            return None
        devservers, _ = devserver_type.get_available_devservers()

    # get_devserver_load call needs to be made in a new process to allow force
    # timeout using signal.
    output = multiprocessing.Queue()
    processes = [
            multiprocessing.Process(
                    target=devserver_type.get_devserver_load_wrapper,
                    args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output))
            for devserver in devservers]

    for p in processes:
        p.start()
    # Drain the queue before joining: a child that has put data on a
    # multiprocessing.Queue does not terminate until that data has been
    # flushed to the pipe, so joining first can deadlock. One result is
    # expected from each process.
    loads = [output.get() for _ in processes]
    for p in processes:
        p.join()

    # Filter out any load failed to be retrieved or does not support load check.
    loads = [load for load in loads if load and DevServer.CPU_LOAD in load and
             DevServer.is_free_disk_ok(load) and
             DevServer.is_apache_client_count_ok(load)]
    if not loads:
        logging.debug('Failed to retrieve load stats from any devserver. No '
                      'load balancing can be applied.')
        return None
    loads = [load for load in loads if _is_load_healthy(load)]
    if not loads:
        logging.error('No devserver has the capacity to be selected.')
        return None
    # The least loaded devserver comes first after sorting.
    loads = sorted(loads, cmp=_compare_load)
    return loads[0]['devserver']
2482
2483
def resolve(build, hostname=None, ban_list=None):
    """Pick a devserver that can be used for the given build and hostname.

    Launch Control builds are resolved through AndroidBuildServer, any other
    build through ImageServer.

    @param build: Name of a build to stage on devserver, e.g.,
                  ChromeOS build: daisy-release/R50-1234.0.0
                  Launch Control build: git_mnc_release/shamu-eng
    @param hostname: Hostname of the host the devserver is used for, default
            is None, which means devserver is not restricted by the network
            location of the host.
    @param ban_list: The list of devservers that shouldn't be chosen. It is
            only passed on when the build is not a Launch Control build.

    @return: A DevServer instance that can be used to stage given build for the
             given host.
    """
    if not utils.is_launch_control_build(build):
        return ImageServer.resolve(build, hostname, ban_list=ban_list)
    return AndroidBuildServer.resolve(build, hostname)
2501