dev_server.py revision 1e916724238baf58a8df0587a9886f35be794262
1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5from distutils import version
6import cStringIO
7import HTMLParser
8import httplib
9import json
10import logging
11import multiprocessing
12import os
13import re
14import socket
15import time
16import urllib2
17import urlparse
18
19from autotest_lib.client.bin import utils as bin_utils
20from autotest_lib.client.common_lib import android_utils
21from autotest_lib.client.common_lib import error
22from autotest_lib.client.common_lib import global_config
23from autotest_lib.client.common_lib import utils
24from autotest_lib.client.common_lib.cros import retry
25from autotest_lib.server import utils as server_utils
26# TODO(cmasone): redo this class using requests module; http://crosbug.com/30107
27
28try:
29    from chromite.lib import metrics
30except ImportError:
31    metrics = utils.metrics_mock
32
33
# Shorthand for global_config.global_config, used for the config lookups in
# this module.
CONFIG = global_config.global_config
# This file is generated at build time and specifies, per suite and per test,
# the DEPENDENCIES list specified in each control file.  It's a dict of dicts:
# {'bvt':   {'/path/to/autotest/control/site_tests/test1/control': ['dep1']}
#  'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']}
#  'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'],
#            '/path/to/autotest/control/site_tests/test3/control': ['dep3']}
# }
DEPENDENCIES_FILE = 'test_suites/dependency_info'
# Number of seconds a caller waits between polls of the devserver's is_staged
# call when checking whether artifacts are staged.
_ARTIFACT_STAGE_POLLING_INTERVAL = 5
# Artifacts that should be staged when a client calls the devserver RPC to
# stage an image.  The value is a single comma-separated string.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('full_payload,test_suites,stateful,'
                                     'quick_provision')
# Artifacts that should be staged when a client calls the devserver RPC to
# stage an image together with the autotest artifacts.  The value is a single
# comma-separated string.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,'
                                                   'control_files,stateful,'
                                                   'autotest_packages,'
                                                   'quick_provision')
# Artifacts that should be staged when a client calls the devserver RPC to
# stage an Android build.  Despite the parentheses this is a plain
# comma-separated string, not a tuple.
_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions')
# When True, the devserver health checks in this module (free disk space,
# Apache client count) are skipped and treated as passing.
SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value(
        'CROS', 'skip_devserver_health_check', type=bool)
# Number of seconds for the call to get devserver load to time out.
TIMEOUT_GET_DEVSERVER_LOAD = 2.0
63
# Android artifact path in devserver.  Backslashes in the configured value
# are removed.
ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value(
        'CROS', 'android_build_name_pattern', type=str).replace('\\', '')

# Return value from a devserver RPC indicating the call succeeded.
SUCCESS = 'Success'

# The timeout, in minutes, for a given devserver ssh call.
DEVSERVER_SSH_TIMEOUT_MINS = 1

# Error messages for an invalid devserver response and for a devserver that
# is down.
ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error'
ERR_MSG_FOR_DOWN_DEVSERVER = 'Service Unavailable'

# Error message fragment that identifies a devserver call that timed out.
ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout'

# The timeout, in minutes, for waiting on a devserver staging call.
DEVSERVER_IS_STAGING_RETRY_MIN = 100

# The timeout, in minutes, for waiting for a DUT auto-update to finish.
DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100

# The total number of times a devserver triggers CrOS auto-update.
AU_RETRY_LIMIT = 2

# Number of seconds for caller to poll devserver's get_au_status call to
# check if cros auto-update is finished.
CROS_AU_POLLING_INTERVAL = 10

# Number of seconds between retries of auto-update calls.
CROS_AU_RETRY_INTERVAL = 20

# The file name template for auto-update logs; it takes two string
# substitutions.
CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log'
99
# Provision error patterns.
# Each entry is a (regular expression, classification string) pair.
# Do NOT change the classification strings: they are used for monitoring
# provision failures, and any change may mess up the stats.
_EXCEPTION_PATTERNS = [
        # Raised when devserver portfile does not exist on host.
        (r".*Devserver portfile does not exist!.*$",
         '(1) Devserver portfile does not exist on host'),
        # Raised when devserver cannot copy packages to host.
        (r".*Could not copy .* to device.*$",
         '(2) Cannot copy packages to host'),
        # Raised when devserver fails to run specific commands on host.
        (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$",
         '(3) Fail to run specific command on host'),
        # Raised when new build fails to boot on the host.
        (r'.*RootfsUpdateError: Build .* failed to boot on.*$',
         '(4) Build failed to boot on host'),
        # Raised when the auto-update process is timed out.
        (r'.*The CrOS auto-update process is timed out, '
         'thus will be terminated.*$',
         '(5) Auto-update is timed out'),
        # Raised when the host is not pingable.
        (r".*DeviceNotPingableError.*$",
         '(6) Host is not pingable during auto-update'),
        # Raised when hosts have unexpected status after rootfs update.
        (r'.*Update failed with unexpected update status: '
         'UPDATE_STATUS_IDLE.*$',
         '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs '
         'update'),
        # Raised when devserver returns non-json response to shard/drone.
        (r'.*No JSON object could be decoded.*$',
         '(8) Devserver returned non-json object'),
        # Raised when devserver loses the host's ssh connection.
        (r'.*SSHConnectionError\: .* port 22\: Connection timed out.*$',
         "(9) Devserver lost host's ssh connection"),
        # Raised when an error happens while writing files to the host.
        (r'.*Write failed\: Broken pipe.*$',
         "(10) Broken pipe while writing or connecting to host")]
138
# Default for the prefer_local_devserver argument of
# DevServer.get_available_devservers(); False when not configured.
PREFER_LOCAL_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'prefer_local_devserver', type=bool, default=False)

# Read from CROS/enable_ssh_connection_for_devserver; False when not
# configured.
ENABLE_SSH_CONNECTION_FOR_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'enable_ssh_connection_for_devserver', type=bool,
        default=False)

# Directory to save auto-update logs
AUTO_UPDATE_LOG_DIR = 'autoupdate_logs'

# Default number of subnet mask bits used when looking for devservers in the
# same subnet as a DUT.
DEFAULT_SUBNET_MASKBIT = 19
150
151
class DevServerException(Exception):
    """Signals that a devserver call failed.

    Raised, for instance, when the dev server returns a non-200 HTTP
    response.
    """
155
class RetryableProvisionException(DevServerException):
    """Signals a provision failure whose cause is retryable."""
159
class DevServerOverloadException(Exception):
    """Signals that the dev server returned a 502 HTTP response."""
163
class DevServerFailToLocateException(Exception):
    """Signals that no devserver could be located."""
167
class MarkupStripper(HTMLParser.HTMLParser):
    """HTML parser that keeps only the text content of a document.

    Works by, basically, not doing anything for any tags, and only recording
    the content of text nodes in an internal list.  Only handle_data() is
    overridden, so with Python 2's HTMLParser coded characters such as
    &amp; are dropped along with the tags.
    """
    def __init__(self):
        # Run the base-class initializer (which itself calls reset()) rather
        # than calling reset() directly, so that every piece of parser state
        # the base class sets up exists before feed() is used.  HTMLParser
        # is an old-style class on Python 2, hence no super().
        HTMLParser.HTMLParser.__init__(self)
        # Text chunks collected by handle_data(), in document order.
        self.fed = []


    def handle_data(self, d):
        """Consume content of text nodes, store it away.

        @param d: The text of one text node.
        """
        self.fed.append(d)


    def get_data(self):
        """Concatenate and return all stored data.

        @return: A single string made of every text chunk fed so far.
        """
        return ''.join(self.fed)
187
188
def _strip_http_message(message):
    """Remove the HTML markup from an HTTP message.

    @param message: A string returned by an HTTP call.

    @return: The text content of |message| with the markup stripped.
    """
    stripper = MarkupStripper()
    try:
        decoded = message.decode('utf_32')
        stripper.feed(decoded)
    except UnicodeDecodeError:
        # The message could not be decoded; parse it undecoded instead.
        # NOTE(review): 'utf_32' is an unusual codec for an HTTP body, so
        # this fallback is presumably the common path -- confirm the intent
        # before changing the codec, as that would change the return type.
        stripper.feed(message)
    return stripper.get_data()
202
203
def _get_image_storage_server():
    """Read the image storage server from the global config.

    @return: The image_storage_server value of the CROS section.
    """
    storage_server = CONFIG.get_config_value(
            'CROS', 'image_storage_server', type=str)
    return storage_server
206
207
def _get_canary_channel_server():
    """Read the url of the canary-channel server from the global config.

    The url looks like:
    gsutil://chromeos-releases/canary-channel/<board>/<release>

    @return: The url to the canary channel server.
    """
    canary_server = CONFIG.get_config_value(
            'CROS', 'canary_channel_server', type=str)
    return canary_server
216
217
def _get_storage_server_for_artifacts(artifacts=None):
    """Pick the storage server that hosts the given artifacts.

    @param artifacts: A list of artifacts we need to stage.
    @return: The canary channel server when the configured factory artifact
             is among |artifacts|; otherwise (including when no artifacts
             are specified) the default image storage server.
    """
    factory_artifact = global_config.global_config.get_config_value(
            'CROS', 'factory_artifact', type=str, default='')
    needs_canary_server = (artifacts and factory_artifact and
                           factory_artifact in artifacts)
    if not needs_canary_server:
        return _get_image_storage_server()
    return _get_canary_channel_server()
230
231
def _reverse_lookup_from_config(address):
    """Find the hostname configured for the given IP address.

    The lookup goes through the hostname-address map from the config file.

    If multiple hostnames map to the same IP address, the first match in
    the map's iteration order is returned.

    @param address: IP address string
    @returns: hostname string, or original input if not found
    """
    hostname_addr_map = _get_hostname_addr_map()
    matches = (name for name, mapped_addr in hostname_addr_map.iteritems()
               if mapped_addr == address)
    # Fall back to the input itself when no hostname maps to it.
    return next(matches, address)
247
248
def _get_hostname_addr_map():
    """Read the HOSTNAME_ADDR_MAP section of the global config.

    @return: dict mapping server hostnames to addresses
    """
    hostname_addr_map = CONFIG.get_section_as_dict('HOSTNAME_ADDR_MAP')
    return hostname_addr_map
255
256
def _get_dev_server_list():
    """Read the dev_server list from the CROS section of the global config.

    @return: The configured list, or an empty list when it is not set.
    """
    dev_servers = CONFIG.get_config_value(
            'CROS', 'dev_server', type=list, default=[])
    return dev_servers
259
260
def _get_crash_server_list():
    """Read the crash_server list from the CROS section of the global config.

    @return: The configured list, or an empty list when it is not set.
    """
    crash_servers = CONFIG.get_config_value(
            'CROS', 'crash_server', type=list, default=[])
    return crash_servers
264
265
def remote_devserver_call(timeout_min=DEVSERVER_IS_STAGING_RETRY_MIN,
                          exception_to_raise=DevServerException):
    """A decorator to use with remote devserver calls.

    This decorator converts urllib2.HTTPErrors into DevServerExceptions
    with any embedded error info converted into plain text. The decorated
    call is wrapped with retry.retry for urllib2.URLError, error.CmdError
    and DevServerOverloadException to avoid devserver flakiness.

    When the call finally fails with an exception whose text contains
    ERR_MSG_FOR_TIMED_OUT_CALL, the call_timeout counter is incremented
    before the exception is re-raised.

    The returned wrapper keeps the name and docstring of the decorated
    method when the method provides them.

    @param timeout_min: Timeout in minutes, handed to retry.retry.
    @param exception_to_raise: Exception class, handed to retry.retry.

    @return: A decorator for methods that make devserver calls.
    """
    #pylint: disable=C0111
    # Imported locally: only this decorator needs functools.
    import functools

    def inner_decorator(method):
        label = method.__name__ if hasattr(method, '__name__') else None
        def metrics_wrapper(*args, **kwargs):
            @retry.retry((urllib2.URLError, error.CmdError,
                          DevServerOverloadException),
                         timeout_min=timeout_min,
                         exception_to_raise=exception_to_raise,
                         label=label)
            def wrapper():
                """This wrapper actually catches the HTTPError."""
                try:
                    return method(*args, **kwargs)
                except urllib2.HTTPError as e:
                    error_markup = e.read()
                    raise DevServerException(_strip_http_message(error_markup))

            try:
                return wrapper()
            except Exception as e:
                if ERR_MSG_FOR_TIMED_OUT_CALL in str(e):
                    # Work out which devserver was called: either the
                    # DevServer instance the method is bound to, or an
                    # explicit 'devserver' keyword argument.
                    dev_server = None
                    if args and isinstance(args[0], DevServer):
                        dev_server = args[0].hostname
                    elif 'devserver' in kwargs:
                        dev_server = get_hostname(kwargs['devserver'])

                    logging.debug('RPC call %s has timed out on devserver %s.',
                                  label, dev_server)
                    c = metrics.Counter(
                            'chromeos/autotest/devserver/call_timeout')
                    # NOTE(review): the method name is reported under the
                    # 'healthy' field key.  Kept as-is because monitoring
                    # may depend on the existing field name -- confirm.
                    c.increment(fields={'dev_server': dev_server,
                                        'healthy': label})

                raise

        try:
            # Carry over __name__/__doc__/__module__ so the decorated method
            # stays identifiable in logs and introspection.
            functools.update_wrapper(metrics_wrapper, method)
        except AttributeError:
            # |method| is a callable object lacking some of those
            # attributes; keep the plain wrapper in that case.
            pass
        return metrics_wrapper

    return inner_decorator
314
315
def get_hostname(url):
    """Extract the hostname portion of a URL.

    For a URL shaped like schema://hostname:port/path only the hostname
    part is returned.

    @param url: a Url string
    @return: a hostname string, as parsed by urlparse
    """
    parsed_url = urlparse.urlparse(url)
    return parsed_url.hostname
325
326
class DevServer(object):
    """Base class for all DevServer-like server stubs.

    This is the base class for interacting with all Dev Server-like servers.
    A caller should obtain an instance of a sub-class of DevServer either
    from a known devserver url or by resolving a build:

    server = SubClassServer(devserver_url)
    server = SubClassServer.resolve(build)
    """
    # Minimum free disk space, in GB, accepted by is_free_disk_ok().
    _MIN_FREE_DISK_SPACE_GB = 20
    # Maximum Apache client count accepted by is_apache_client_count_ok().
    _MAX_APACHE_CLIENT_COUNT = 75
    # Threshold for the CPU load percentage for a devserver to be selected.
    MAX_CPU_LOAD = 80.0
    # Threshold for the network IO, set to 80MB/s
    MAX_NETWORK_IO = 1024 * 1024 * 80
    # Keys of the load dictionary returned by get_devserver_load().
    # NOTE(review): FREE_DISK, AU_PROCESS and APACHE_CLIENT_COUNT are read by
    # the health checks of this class; the remaining keys are presumably
    # used by code elsewhere -- confirm.
    DISK_IO = 'disk_total_bytes_per_second'
    NETWORK_IO = 'network_total_bytes_per_second'
    CPU_LOAD = 'cpu_percent'
    FREE_DISK = 'free_disk'
    AU_PROCESS = 'au_process_count'
    STAGING_THREAD_COUNT = 'staging_thread_count'
    APACHE_CLIENT_COUNT = 'apache_client_count'
349
350
351    def __init__(self, devserver):
352        self._devserver = devserver
353
354
355    def url(self):
356        """Returns the url for this devserver."""
357        return self._devserver
358
359
360    @property
361    def hostname(self):
362        """Return devserver hostname parsed from the devserver URL.
363
364        Note that this is likely parsed from the devserver URL from
365        shadow_config.ini, meaning that the "hostname" part of the
366        devserver URL is actually an IP address.
367
368        @return hostname string
369        """
370        return get_hostname(self.url())
371
372
373    @property
374    def resolved_hostname(self):
375        """Return devserver hostname, resolved from its IP address.
376
377        Unlike the hostname property, this property attempts to look up
378        the proper hostname from the devserver IP address.  If lookup
379        fails, then fall back to whatever the hostname property would
380        have returned.
381
382        @return hostname string
383        """
384        return _reverse_lookup_from_config(self.hostname)
385
386
387    @staticmethod
388    def get_server_url(url):
389        """Get the devserver url from a repo url, which includes build info.
390
391        @param url: A job repo url.
392
393        @return A devserver url, e.g., http://127.0.0.10:8080
394        """
395        res = urlparse.urlparse(url)
396        if res.netloc:
397            return res.scheme + '://' + res.netloc
398
399
400    @classmethod
401    def get_devserver_load_wrapper(cls, devserver, timeout_sec, output):
402        """A wrapper function to call get_devserver_load in parallel.
403
404        @param devserver: url of the devserver.
405        @param timeout_sec: Number of seconds before time out the devserver
406                            call.
407        @param output: An output queue to save results to.
408        """
409        load = cls.get_devserver_load(devserver, timeout_min=timeout_sec/60.0)
410        if load:
411            load['devserver'] = devserver
412        output.put(load)
413
414
415    @classmethod
416    def get_devserver_load(cls, devserver,
417                           timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
418        """Returns True if the |devserver| is healthy to stage build.
419
420        @param devserver: url of the devserver.
421        @param timeout_min: How long to wait in minutes before deciding the
422                            the devserver is not up (float).
423
424        @return: A dictionary of the devserver's load.
425
426        """
427        call = cls._build_call(devserver, 'check_health')
428        @remote_devserver_call(timeout_min=timeout_min)
429        def get_load(devserver=devserver):
430            """Inner method that makes the call."""
431            return cls.run_call(call, timeout=timeout_min*60)
432
433        try:
434            return json.load(cStringIO.StringIO(get_load(devserver=devserver)))
435        except Exception as e:
436            logging.error('Devserver call failed: "%s", timeout: %s seconds,'
437                          ' Error: %s', call, timeout_min * 60, e)
438
439
440    @classmethod
441    def is_free_disk_ok(cls, load):
442        """Check if a devserver has enough free disk.
443
444        @param load: A dict of the load of the devserver.
445
446        @return: True if the devserver has enough free disk or disk check is
447                 skipped in global config.
448
449        """
450        if SKIP_DEVSERVER_HEALTH_CHECK:
451            logging.debug('devserver health check is skipped.')
452        elif load[cls.FREE_DISK] < cls._MIN_FREE_DISK_SPACE_GB:
453            return False
454
455        return True
456
457
458    @classmethod
459    def is_apache_client_count_ok(cls, load):
460        """Check if a devserver has enough Apache connections available.
461
462        Apache server by default has maximum of 150 concurrent connections. If
463        a devserver has too many live connections, it likely indicates the
464        server is busy handling many long running download requests, e.g.,
465        downloading stateful partitions. It is better not to add more requests
466        to it.
467
468        @param load: A dict of the load of the devserver.
469
470        @return: True if the devserver has enough Apache connections available,
471                 or disk check is skipped in global config.
472
473        """
474        if SKIP_DEVSERVER_HEALTH_CHECK:
475            logging.debug('devserver health check is skipped.')
476        elif cls.APACHE_CLIENT_COUNT not in load:
477            logging.debug('Apache client count is not collected from devserver.')
478        elif (load[cls.APACHE_CLIENT_COUNT] >
479              cls._MAX_APACHE_CLIENT_COUNT):
480            return False
481
482        return True
483
484
485    @classmethod
486    def devserver_healthy(cls, devserver,
487                          timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
488        """Returns True if the |devserver| is healthy to stage build.
489
490        @param devserver: url of the devserver.
491        @param timeout_min: How long to wait in minutes before deciding the
492                            the devserver is not up (float).
493
494        @return: True if devserver is healthy. Return False otherwise.
495
496        """
497        c = metrics.Counter('chromeos/autotest/devserver/devserver_healthy')
498        reason = ''
499        healthy = False
500        load = cls.get_devserver_load(devserver, timeout_min=timeout_min)
501        try:
502            if not load:
503                # Failed to get the load of devserver.
504                reason = '(1) Failed to get load.'
505                return False
506
507            apache_ok = cls.is_apache_client_count_ok(load)
508            if not apache_ok:
509                reason = '(2) Apache client count too high.'
510                logging.error('Devserver check_health failed. Live Apache client '
511                              'count is too high: %d.',
512                              load[cls.APACHE_CLIENT_COUNT])
513                return False
514
515            disk_ok = cls.is_free_disk_ok(load)
516            if not disk_ok:
517                reason = '(3) Disk space too low.'
518                logging.error('Devserver check_health failed. Free disk space is '
519                              'low. Only %dGB is available.',
520                              load[cls.FREE_DISK])
521            healthy = bool(disk_ok)
522            return disk_ok
523        finally:
524            c.increment(fields={'dev_server': cls(devserver).resolved_hostname,
525                                'healthy': healthy,
526                                'reason': reason})
527            # Monitor how many AU processes the devserver is currently running.
528            if load is not None and load.get(DevServer.AU_PROCESS):
529                c_au = metrics.Gauge(
530                        'chromeos/autotest/devserver/devserver_au_count')
531                c_au.set(
532                    load.get(DevServer.AU_PROCESS),
533                    fields={'dev_server': cls(devserver).resolved_hostname})
534
535
536    @staticmethod
537    def _build_call(host, method, **kwargs):
538        """Build a URL to |host| that calls |method|, passing |kwargs|.
539
540        Builds a URL that calls |method| on the dev server defined by |host|,
541        passing a set of key/value pairs built from the dict |kwargs|.
542
543        @param host: a string that is the host basename e.g. http://server:90.
544        @param method: the dev server method to call.
545        @param kwargs: a dict mapping arg names to arg values.
546        @return the URL string.
547        """
548        argstr = '&'.join(map(lambda x: "%s=%s" % x, kwargs.iteritems()))
549        return "%(host)s/%(method)s?%(argstr)s" % dict(
550                host=host, method=method, argstr=argstr)
551
552
553    def build_call(self, method, **kwargs):
554        """Builds a devserver RPC string that is used by 'run_call()'.
555
556        @param method: remote devserver method to call.
557        """
558        return self._build_call(self._devserver, method, **kwargs)
559
560
561    @classmethod
562    def build_all_calls(cls, method, **kwargs):
563        """Builds a list of URLs that makes RPC calls on all devservers.
564
565        Build a URL that calls |method| on the dev server, passing a set
566        of key/value pairs built from the dict |kwargs|.
567
568        @param method: the dev server method to call.
569        @param kwargs: a dict mapping arg names to arg values
570
571        @return the URL string
572        """
573        calls = []
574        # Note we use cls.servers as servers is class specific.
575        for server in cls.servers():
576            if cls.devserver_healthy(server):
577                calls.append(cls._build_call(server, method, **kwargs))
578
579        return calls
580
581
582    @classmethod
583    def run_call(cls, call, readline=False, timeout=None):
584        """Invoke a given devserver call using urllib.open.
585
586        Open the URL with HTTP, and return the text of the response. Exceptions
587        may be raised as for urllib2.urlopen().
588
589        @param call: a url string that calls a method to a devserver.
590        @param readline: whether read http response line by line.
591        @param timeout: The timeout seconds for this urlopen call.
592
593        @return the results of this call.
594        """
595        if timeout is not None:
596            return utils.urlopen_socket_timeout(
597                    call, timeout=timeout).read()
598        elif readline:
599            response = urllib2.urlopen(call)
600            return [line.rstrip() for line in response]
601        else:
602            return urllib2.urlopen(call).read()
603
604
605    @staticmethod
606    def servers():
607        """Returns a list of servers that can serve as this type of server."""
608        raise NotImplementedError()
609
610
611    @classmethod
612    def get_devservers_in_same_subnet(cls, ip, mask_bits=DEFAULT_SUBNET_MASKBIT,
613                                      unrestricted_only=False):
614        """Get the devservers in the same subnet of the given ip.
615
616        @param ip: The IP address of a dut to look for devserver.
617        @param mask_bits: Number of mask bits. Default is 19.
618        @param unrestricted_only: Set to True to select from devserver in
619                unrestricted subnet only. Default is False.
620
621        @return: A list of devservers in the same subnet of the given ip.
622
623        """
624        # server from cls.servers() is a URL, e.g., http://10.1.1.10:8082, so
625        # we need a dict to return the full devserver path once the IPs are
626        # filtered in get_servers_in_same_subnet.
627        server_names = {}
628        all_devservers = []
629        devservers = (cls.get_unrestricted_devservers() if unrestricted_only
630                      else cls.servers())
631        for server in devservers:
632            server_name = get_hostname(server)
633            server_names[server_name] = server
634            all_devservers.append(server_name)
635        if not all_devservers:
636            devserver_type = 'unrestricted only' if unrestricted_only else 'all'
637            raise DevServerFailToLocateException(
638                'Fail to locate a devserver for dut %s in %s devservers'
639                % (ip, devserver_type))
640
641        devservers = utils.get_servers_in_same_subnet(ip, mask_bits,
642                                                      all_devservers)
643        return [server_names[s] for s in devservers]
644
645
646    @classmethod
647    def get_unrestricted_devservers(
648                cls, restricted_subnets=utils.RESTRICTED_SUBNETS):
649        """Get the devservers not in any restricted subnet specified in
650        restricted_subnets.
651
652        @param restricted_subnets: A list of restriected subnets.
653
654        @return: A list of devservers not in any restricted subnet.
655
656        """
657        if not restricted_subnets:
658            return cls.servers()
659
660        devservers = []
661        for server in cls.servers():
662            server_name = get_hostname(server)
663            if not utils.get_restricted_subnet(server_name, restricted_subnets):
664                devservers.append(server)
665        return devservers
666
667
668    @classmethod
669    def get_healthy_devserver(cls, build, devservers, ban_list=None):
670        """"Get a healthy devserver instance from the list of devservers.
671
672        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
673        @param devservers: The devserver list to be chosen out a healthy one.
674        @param ban_list: The blacklist of devservers we don't want to choose.
675                Default is None.
676
677        @return: A DevServer object of a healthy devserver. Return None if no
678                healthy devserver is found.
679
680        """
681        logging.debug('Pick one healthy devserver from %r', devservers)
682        while devservers:
683            hash_index = hash(build) % len(devservers)
684            devserver = devservers.pop(hash_index)
685            logging.debug('Check health for %s', devserver)
686            if ban_list and devserver in ban_list:
687                continue
688
689            if cls.devserver_healthy(devserver):
690                logging.debug('Pick %s', devserver)
691                return cls(devserver)
692
693
694    @classmethod
695    def get_available_devservers(cls, hostname=None,
696                                 prefer_local_devserver=PREFER_LOCAL_DEVSERVER,
697                                 restricted_subnets=utils.RESTRICTED_SUBNETS):
698        """Get devservers in the same subnet of the given hostname.
699
700        @param hostname: Hostname of a DUT to choose devserver for.
701
702        @return: A tuple of (devservers, can_retry), devservers is a list of
703                 devservers that's available for the given hostname. can_retry
704                 is a flag that indicate if caller can retry the selection of
705                 devserver if no devserver in the returned devservers can be
706                 used. For example, if hostname is in a restricted subnet,
707                 can_retry will be False.
708        """
709        logging.info('Getting devservers for host: %s',  hostname)
710        host_ip = None
711        if hostname:
712            host_ip = bin_utils.get_ip_address(hostname)
713            if not host_ip:
714                logging.error('Failed to get IP address of %s. Will pick a '
715                              'devserver without subnet constraint.', hostname)
716
717        if not host_ip:
718            return cls.get_unrestricted_devservers(restricted_subnets), False
719
720        # Go through all restricted subnet settings and check if the DUT is
721        # inside a restricted subnet. If so, only return the devservers in the
722        # restricted subnet and doesn't allow retry.
723        if host_ip and restricted_subnets:
724            subnet_ip, mask_bits = _get_subnet_for_host_ip(
725                    host_ip, restricted_subnets=restricted_subnets)
726            if subnet_ip:
727                logging.debug('The host %s (%s) is in a restricted subnet. '
728                              'Try to locate a devserver inside subnet '
729                              '%s:%d.', hostname, host_ip, subnet_ip,
730                              mask_bits)
731                devservers = cls.get_devservers_in_same_subnet(
732                        subnet_ip, mask_bits)
733                return devservers, False
734
735        # If prefer_local_devserver is set to True and the host is not in
736        # restricted subnet, pick a devserver in the same subnet if possible.
737        # Set can_retry to True so it can pick a different devserver if all
738        # devservers in the same subnet are down.
739        if prefer_local_devserver:
740            return (cls.get_devservers_in_same_subnet(
741                    host_ip, DEFAULT_SUBNET_MASKBIT, True), True)
742
743        return cls.get_unrestricted_devservers(restricted_subnets), False
744
745
746    @classmethod
747    def resolve(cls, build, hostname=None, ban_list=None):
748        """"Resolves a build to a devserver instance.
749
750        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
751        @param hostname: The hostname of dut that requests a devserver. It's
752                         used to make sure a devserver in the same subnet is
753                         preferred.
754        @param ban_list: The blacklist of devservers shouldn't be chosen.
755
756        @raise DevServerException: If no devserver is available.
757        """
758        tried_devservers = set()
759        devservers, can_retry = cls.get_available_devservers(hostname)
760        if devservers:
761            tried_devservers |= set(devservers)
762
763        devserver = cls.get_healthy_devserver(build, devservers,
764                                              ban_list=ban_list)
765
766        if not devserver and can_retry:
767            # Find available devservers without dut location constrain.
768            devservers, _ = cls.get_available_devservers()
769            devserver = cls.get_healthy_devserver(build, devservers,
770                                                  ban_list=ban_list)
771            if devservers:
772                tried_devservers |= set(devservers)
773        if devserver:
774            return devserver
775        else:
776            subnet = 'unrestricted subnet'
777            if hostname is not None:
778                host_ip = bin_utils.get_ip_address(hostname)
779                if host_ip:
780                    subnet_ip, mask_bits = _get_subnet_for_host_ip(host_ip)
781                    subnet = '%s/%s' % (str(subnet_ip), str(mask_bits))
782
783            error_msg = ('All devservers in subnet: %s are currently down: '
784                         '%s. (dut hostname: %s)' %
785                         (subnet, tried_devservers, hostname))
786            logging.error(error_msg)
787            c = metrics.Counter(
788                    'chromeos/autotest/devserver/subnet_without_devservers')
789            c.increment(fields={'subnet': subnet, 'hostname': str(hostname)})
790            raise DevServerException(error_msg)
791
792
793    @classmethod
794    def random(cls):
795        """Return a random devserver that's available.
796
797        Devserver election in `resolve` method is based on a hash of the
798        build that a caller wants to stage. The purpose is that different
799        callers requesting for the same build can get the same devserver,
800        while the lab is able to distribute different builds across all
801        devservers. That helps to reduce the duplication of builds across
802        all devservers.
803        This function returns a random devserver, by passing a random
804        pseudo build name to `resolve `method.
805        """
806        return cls.resolve(build=str(time.time()))
807
808
class CrashServer(DevServer):
    """Class of DevServer that symbolicates crash dumps."""

    @staticmethod
    def servers():
        """Return the crash servers reported by _get_crash_server_list()."""
        return _get_crash_server_list()


    @remote_devserver_call()
    def symbolicate_dump(self, minidump_path, build):
        """Ask the devserver to symbolicate the dump at minidump_path.

        Posts the minidump to the devserver's `symbolicate_dump` call, passing
        the archive url of |build| so the matching debug symbols can be used.

        @param minidump_path: the on-disk path of the minidump.
        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                      whose debug symbols are needed for symbolication.
        @return The text of the devserver response (the stack trace), or an
                empty string if the `requests` module cannot be imported.
        @raise urllib2.HTTPError upon any status code that's not HTTP OK.
               NOTE(review): the remote_devserver_call decorator may translate
               this into a DevServerException - confirm against its definition.
        """
        try:
            import requests
        except ImportError:
            logging.warning("Can't 'import requests' to connect to dev server.")
            return ''
        f = {'dev_server': self.resolved_hostname}
        c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump')
        c.increment(fields=f)
        # Symbolicate minidump.
        m = 'chromeos/autotest/crashserver/symbolicate_dump_duration'
        with metrics.SecondsTimer(m, fields=f):
            call = self.build_call('symbolicate_dump',
                                   archive_url=_get_image_storage_server() + build)
            # Close the minidump as soon as the upload is done instead of
            # leaking the file handle until garbage collection.
            with open(minidump_path, 'rb') as minidump:
                request = requests.post(call, files={'minidump': minidump})
            if request.status_code == requests.codes.OK:
                return request.text

        # Any other status is surfaced as an HTTPError carrying the body.
        error_fd = cStringIO.StringIO(request.text)
        raise urllib2.HTTPError(
                call, request.status_code, request.text, request.headers,
                error_fd)


    @classmethod
    def get_available_devservers(cls, hostname=None):
        """Get all available crash servers.

        Crash server election doesn't need to count the location of hostname.

        @param hostname: Hostname of a DUT to choose devserver for. Unused;
                         optional so the method can also be called without
                         arguments, as `resolve` does when it retries.

        @return: A tuple of (all crash servers, False). can_retry is set to
                 False, as all crash servers are returned. There is no point to
                 retry.
        """
        return cls.servers(), False
867
868
869class ImageServerBase(DevServer):
870    """Base class for devservers used to stage builds.
871
872    CrOS and Android builds are staged in different ways as they have different
873    sets of artifacts. This base class abstracts the shared functions between
874    the two types of ImageServer.
875    """
876
877    @classmethod
878    def servers(cls):
879        """Returns a list of servers that can serve as a desired type of
880        devserver.
881        """
882        return _get_dev_server_list()
883
884
885    def _get_image_url(self, image):
886        """Returns the url of the directory for this image on the devserver.
887
888        @param image: the image that was fetched.
889        """
890        image = self.translate(image)
891        url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
892                                              type=str)
893        return (url_pattern % (self.url(), image)).replace('update', 'static')
894
895
896    @staticmethod
897    def create_metadata(server_name, image, artifacts=None, files=None):
898        """Create a metadata dictionary given the staged items.
899
900        The metadata can be send to metadata db along with stats.
901
902        @param server_name: name of the devserver, e.g 172.22.33.44.
903        @param image: The name of the image.
904        @param artifacts: A list of artifacts.
905        @param files: A list of files.
906
907        @return A metadata dictionary.
908
909        """
910        metadata = {'devserver': server_name,
911                    'image': image,
912                    '_type': 'devserver'}
913        if artifacts:
914            metadata['artifacts'] = ' '.join(artifacts)
915        if files:
916            metadata['files'] = ' '.join(files)
917        return metadata
918
919
920    @classmethod
921    def run_ssh_call(cls, call, readline=False, timeout=None):
922        """Construct an ssh-based rpc call, and execute it.
923
924        @param call: a url string that calls a method to a devserver.
925        @param readline: whether read http response line by line.
926        @param timeout: The timeout seconds for ssh call.
927
928        @return the results of this call.
929        """
930        hostname = get_hostname(call)
931        ssh_call = 'ssh %s \'curl "%s"\'' % (hostname, utils.sh_escape(call))
932        timeout_seconds = timeout if timeout else DEVSERVER_SSH_TIMEOUT_MINS*60
933        try:
934            result = utils.run(ssh_call, timeout=timeout_seconds)
935        except error.CmdError as e:
936            logging.debug('Error occurred with exit_code %d when executing the '
937                          'ssh call: %s.', e.result_obj.exit_status,
938                          e.result_obj.stderr)
939            c = metrics.Counter('chromeos/autotest/devserver/ssh_failure')
940            c.increment(fields={'dev_server': hostname})
941            raise
942        response = result.stdout
943
944        # If the curl command's returned HTTP response contains certain
945        # exception string, raise the DevServerException of the response.
946        if 'DownloaderException' in response:
947            raise DevServerException(_strip_http_message(response))
948
949        if readline:
950            # Remove line terminators and trailing whitespace
951            response = response.splitlines()
952            return [line.rstrip() for line in response]
953
954        return response
955
956
957    @classmethod
958    def run_call(cls, call, readline=False, timeout=None):
959        """Invoke a given devserver call using urllib.open or ssh.
960
961        Open the URL with HTTP or SSH-based HTTP, and return the text of the
962        response. Exceptions may be raised as for urllib2.urlopen() or
963        utils.run().
964
965        @param call: a url string that calls a method to a devserver.
966        @param readline: whether read http response line by line.
967        @param timeout: The timeout seconds for urlopen call or ssh call.
968
969        @return the results of this call.
970        """
971        server_name = get_hostname(call)
972        is_in_restricted_subnet = utils.get_restricted_subnet(
973                server_name, utils.RESTRICTED_SUBNETS)
974        _EMPTY_SENTINEL_VALUE = object()
975        def kickoff_call():
976            """Invoke a given devserver call using urllib.open or ssh.
977
978            @param call: a url string that calls a method to a devserver.
979            @param is_in_restricted_subnet: whether the devserver is in subnet.
980            @param readline: whether read http response line by line.
981            @param timeout: The timeout seconds for urlopen call or ssh call.
982            """
983            if (not ENABLE_SSH_CONNECTION_FOR_DEVSERVER or
984                not is_in_restricted_subnet):
985                response = super(ImageServerBase, cls).run_call(
986                        call, readline=readline, timeout=timeout)
987            else:
988                response = cls.run_ssh_call(
989                        call, readline=readline, timeout=timeout)
990            # Retry if devserver service is temporarily down, e.g. in a
991            # devserver push.
992            if ERR_MSG_FOR_DOWN_DEVSERVER in response:
993                return False
994
995            # Don't return response directly since it may be empty string,
996            # which causes poll_for_condition to retry.
997            return _EMPTY_SENTINEL_VALUE if not response else response
998
999        try:
1000            response = bin_utils.poll_for_condition(
1001                    kickoff_call,
1002                    exception=bin_utils.TimeoutError(),
1003                    timeout=60,
1004                    sleep_interval=5)
1005            return '' if response is _EMPTY_SENTINEL_VALUE else response
1006        except bin_utils.TimeoutError:
1007            return ERR_MSG_FOR_DOWN_DEVSERVER
1008
1009
1010    @classmethod
1011    def download_file(cls, remote_file, local_file, timeout=None):
1012        """Download file from devserver.
1013
1014        The format of remote_file should be:
1015            http://devserver_ip:8082/static/board/...
1016
1017        @param remote_file: The URL of the file on devserver that need to be
1018            downloaded.
1019        @param local_file: The path of the file saved to local.
1020        @param timeout: The timeout seconds for this call.
1021        """
1022        response = cls.run_call(remote_file, timeout=timeout)
1023        with open(local_file, 'w') as out_log:
1024            out_log.write(response)
1025
1026
1027    def _poll_is_staged(self, **kwargs):
1028        """Polling devserver.is_staged until all artifacts are staged.
1029
1030        @param kwargs: keyword arguments to make is_staged devserver call.
1031
1032        @return: True if all artifacts are staged in devserver.
1033        """
1034        call = self.build_call('is_staged', **kwargs)
1035
1036        def all_staged():
1037            """Call devserver.is_staged rpc to check if all files are staged.
1038
1039            @return: True if all artifacts are staged in devserver. False
1040                     otherwise.
1041            @rasies DevServerException, the exception is a wrapper of all
1042                    exceptions that were raised when devserver tried to download
1043                    the artifacts. devserver raises an HTTPError or a CmdError
1044                    when an exception was raised in the code. Such exception
1045                    should be re-raised here to stop the caller from waiting.
1046                    If the call to devserver failed for connection issue, a
1047                    URLError exception is raised, and caller should retry the
1048                    call to avoid such network flakiness.
1049
1050            """
1051            try:
1052                result = self.run_call(call)
1053                logging.debug('whether artifact is staged: %r', result)
1054                return result == 'True'
1055            except urllib2.HTTPError as e:
1056                error_markup = e.read()
1057                raise DevServerException(_strip_http_message(error_markup))
1058            except urllib2.URLError as e:
1059                # Could be connection issue, retry it.
1060                # For example: <urlopen error [Errno 111] Connection refused>
1061                logging.error('URLError happens in is_stage: %r', e)
1062                return False
1063            except error.CmdError as e:
1064                # Retry if SSH failed to connect to the devserver.
1065                logging.warning('CmdError happens in is_stage: %r, will retry', e)
1066                return False
1067
1068        bin_utils.poll_for_condition(
1069                all_staged,
1070                exception=bin_utils.TimeoutError(),
1071                timeout=DEVSERVER_IS_STAGING_RETRY_MIN * 60,
1072                sleep_interval=_ARTIFACT_STAGE_POLLING_INTERVAL)
1073
1074        return True
1075
1076
1077    def _call_and_wait(self, call_name, error_message,
1078                       expected_response=SUCCESS, **kwargs):
1079        """Helper method to make a urlopen call, and wait for artifacts staged.
1080
1081        @param call_name: name of devserver rpc call.
1082        @param error_message: Error message to be thrown if response does not
1083                              match expected_response.
1084        @param expected_response: Expected response from rpc, default to
1085                                  |Success|. If it's set to None, do not compare
1086                                  the actual response. Any response is consider
1087                                  to be good.
1088        @param kwargs: keyword arguments to make is_staged devserver call.
1089
1090        @return: The response from rpc.
1091        @raise DevServerException upon any return code that's expected_response.
1092
1093        """
1094        call = self.build_call(call_name, async=True, **kwargs)
1095        try:
1096            response = self.run_call(call)
1097            logging.debug('response for RPC: %r', response)
1098            if ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE in response:
1099                logging.debug('Proxy error happens in RPC call, '
1100                              'will retry in 30 seconds')
1101                time.sleep(30)
1102                raise DevServerOverloadException()
1103        except httplib.BadStatusLine as e:
1104            logging.error(e)
1105            raise DevServerException('Received Bad Status line, Devserver %s '
1106                                     'might have gone down while handling '
1107                                     'the call: %s' % (self.url(), call))
1108
1109        if expected_response and not response == expected_response:
1110                raise DevServerException(error_message)
1111
1112        # `os_type` is needed in build a devserver call, but not needed for
1113        # wait_for_artifacts_staged, since that method is implemented by
1114        # each ImageServerBase child class.
1115        if 'os_type' in kwargs:
1116            del kwargs['os_type']
1117        self.wait_for_artifacts_staged(**kwargs)
1118        return response
1119
1120
1121    def _stage_artifacts(self, build, artifacts, files, archive_url, **kwargs):
1122        """Tell the devserver to download and stage |artifacts| from |image|
1123        specified by kwargs.
1124
1125        This is the main call point for staging any specific artifacts for a
1126        given build. To see the list of artifacts one can stage see:
1127
1128        ~src/platfrom/dev/artifact_info.py.
1129
1130        This is maintained along with the actual devserver code.
1131
1132        @param artifacts: A list of artifacts.
1133        @param files: A list of files to stage.
1134        @param archive_url: Optional parameter that has the archive_url to stage
1135                this artifact from. Default is specified in autotest config +
1136                image.
1137        @param kwargs: keyword arguments that specify the build information, to
1138                make stage devserver call.
1139
1140        @raise DevServerException upon any return code that's not HTTP OK.
1141        """
1142        if not archive_url:
1143            archive_url = _get_storage_server_for_artifacts(artifacts) + build
1144
1145        artifacts_arg = ','.join(artifacts) if artifacts else ''
1146        files_arg = ','.join(files) if files else ''
1147        error_message = ("staging %s for %s failed;"
1148                         "HTTP OK not accompanied by 'Success'." %
1149                         ('artifacts=%s files=%s ' % (artifacts_arg, files_arg),
1150                          build))
1151
1152        staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' %
1153                        (build, artifacts, files, archive_url))
1154        logging.info('Staging artifacts on devserver %s: %s',
1155                     self.url(), staging_info)
1156        success = False
1157        try:
1158            arguments = {'archive_url': archive_url,
1159                         'artifacts': artifacts_arg,
1160                         'files': files_arg}
1161            if kwargs:
1162                arguments.update(kwargs)
1163            # TODO(akeshet): canonicalize artifacts_arg before using it as a
1164            # metric field (as it stands it is a not-very-well-controlled
1165            # string).
1166            f = {'artifacts': artifacts_arg,
1167                 'dev_server': self.resolved_hostname}
1168            with metrics.SecondsTimer(
1169                    'chromeos/autotest/devserver/stage_artifact_duration',
1170                    fields=f):
1171                self.call_and_wait(call_name='stage', error_message=error_message,
1172                                   **arguments)
1173            logging.info('Finished staging artifacts: %s', staging_info)
1174            success = True
1175        except (bin_utils.TimeoutError, error.TimeoutException):
1176            logging.error('stage_artifacts timed out: %s', staging_info)
1177            raise DevServerException(
1178                    'stage_artifacts timed out: %s' % staging_info)
1179        finally:
1180            f = {'success': success,
1181                 'artifacts': artifacts_arg,
1182                 'dev_server': self.resolved_hostname}
1183            metrics.Counter('chromeos/autotest/devserver/stage_artifact'
1184                            ).increment(fields=f)
1185
1186
1187    def call_and_wait(self, *args, **kwargs):
1188        """Helper method to make a urlopen call, and wait for artifacts staged.
1189
1190        This method needs to be overridden in the subclass to implement the
1191        logic to call _call_and_wait.
1192        """
1193        raise NotImplementedError
1194
1195
1196    def _trigger_download(self, build, artifacts, files, synchronous=True,
1197                          **kwargs_build_info):
1198        """Tell the devserver to download and stage image specified in
1199        kwargs_build_info.
1200
1201        Tells the devserver to fetch |image| from the image storage server
1202        named by _get_image_storage_server().
1203
1204        If |synchronous| is True, waits for the entire download to finish
1205        staging before returning. Otherwise only the artifacts necessary
1206        to start installing images onto DUT's will be staged before returning.
1207        A caller can then call finish_download to guarantee the rest of the
1208        artifacts have finished staging.
1209
1210        @param synchronous: if True, waits until all components of the image are
1211               staged before returning.
1212        @param kwargs_build_info: Dictionary of build information.
1213                For CrOS, it is None as build is the CrOS image name.
1214                For Android, it is {'target': target,
1215                                    'build_id': build_id,
1216                                    'branch': branch}
1217
1218        @raise DevServerException upon any return code that's not HTTP OK.
1219
1220        """
1221        if kwargs_build_info:
1222            archive_url = None
1223        else:
1224            archive_url = _get_image_storage_server() + build
1225        error_message = ("trigger_download for %s failed;"
1226                         "HTTP OK not accompanied by 'Success'." % build)
1227        kwargs = {'archive_url': archive_url,
1228                  'artifacts': artifacts,
1229                  'files': files,
1230                  'error_message': error_message}
1231        if kwargs_build_info:
1232            kwargs.update(kwargs_build_info)
1233
1234        logging.info('trigger_download starts for %s', build)
1235        try:
1236            response = self.call_and_wait(call_name='stage', **kwargs)
1237            logging.info('trigger_download finishes for %s', build)
1238        except (bin_utils.TimeoutError, error.TimeoutException):
1239            logging.error('trigger_download timed out for %s.', build)
1240            raise DevServerException(
1241                    'trigger_download timed out for %s.' % build)
1242        was_successful = response == SUCCESS
1243        if was_successful and synchronous:
1244            self._finish_download(build, artifacts, files, **kwargs_build_info)
1245
1246
1247    def _finish_download(self, build, artifacts, files, **kwargs_build_info):
1248        """Tell the devserver to finish staging image specified in
1249        kwargs_build_info.
1250
1251        If trigger_download is called with synchronous=False, it will return
1252        before all artifacts have been staged. This method contacts the
1253        devserver and blocks until all staging is completed and should be
1254        called after a call to trigger_download.
1255
1256        @param kwargs_build_info: Dictionary of build information.
1257                For CrOS, it is None as build is the CrOS image name.
1258                For Android, it is {'target': target,
1259                                    'build_id': build_id,
1260                                    'branch': branch}
1261
1262        @raise DevServerException upon any return code that's not HTTP OK.
1263        """
1264        archive_url = _get_image_storage_server() + build
1265        error_message = ("finish_download for %s failed;"
1266                         "HTTP OK not accompanied by 'Success'." % build)
1267        kwargs = {'archive_url': archive_url,
1268                  'artifacts': artifacts,
1269                  'files': files,
1270                  'error_message': error_message}
1271        if kwargs_build_info:
1272            kwargs.update(kwargs_build_info)
1273        try:
1274            self.call_and_wait(call_name='stage', **kwargs)
1275        except (bin_utils.TimeoutError, error.TimeoutException):
1276            logging.error('finish_download timed out for %s', build)
1277            raise DevServerException(
1278                    'finish_download timed out for %s.' % build)
1279
1280
1281    @remote_devserver_call()
1282    def locate_file(self, file_name, artifacts, build, build_info):
1283        """Locate a file with the given file_name on devserver.
1284
1285        This method calls devserver RPC `locate_file` to look up a file with
1286        the given file name inside specified build artifacts.
1287
1288        @param file_name: Name of the file to look for a file.
1289        @param artifacts: A list of artifact names to search for the file.
1290        @param build: Name of the build. For Android, it's None as build_info
1291                should be used.
1292        @param build_info: Dictionary of build information.
1293                For CrOS, it is None as build is the CrOS image name.
1294                For Android, it is {'target': target,
1295                                    'build_id': build_id,
1296                                    'branch': branch}
1297
1298        @return: A devserver url to the file.
1299        @raise DevServerException upon any return code that's not HTTP OK.
1300        """
1301        if not build and not build_info:
1302            raise DevServerException('You must specify build information to '
1303                                     'look for file %s in artifacts %s.' %
1304                                     (file_name, artifacts))
1305        kwargs = {'file_name': file_name,
1306                  'artifacts': artifacts}
1307        if build_info:
1308            build_path = '%(branch)s/%(target)s/%(build_id)s' % build_info
1309            kwargs.update(build_info)
1310            # Devserver treats Android and Brillo build in the same way as they
1311            # are both retrieved from Launch Control and have similar build
1312            # artifacts. Therefore, os_type for devserver calls is `android` for
1313            # both Android and Brillo builds.
1314            kwargs['os_type'] = 'android'
1315        else:
1316            build_path = build
1317            kwargs['build'] = build
1318        call = self.build_call('locate_file', async=False, **kwargs)
1319        try:
1320            file_path = self.run_call(call)
1321            return os.path.join(self.url(), 'static', build_path, file_path)
1322        except httplib.BadStatusLine as e:
1323            logging.error(e)
1324            raise DevServerException('Received Bad Status line, Devserver %s '
1325                                     'might have gone down while handling '
1326                                     'the call: %s' % (self.url(), call))
1327
1328
1329    @remote_devserver_call()
1330    def list_control_files(self, build, suite_name=''):
1331        """Ask the devserver to list all control files for |build|.
1332
1333        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
1334                      whose control files the caller wants listed.
1335        @param suite_name: The name of the suite for which we require control
1336                           files.
1337        @return None on failure, or a list of control file paths
1338                (e.g. server/site_tests/autoupdate/control)
1339        @raise DevServerException upon any return code that's not HTTP OK.
1340        """
1341        build = self.translate(build)
1342        call = self.build_call('controlfiles', build=build,
1343                               suite_name=suite_name)
1344        return self.run_call(call, readline=True)
1345
1346
1347    @remote_devserver_call()
1348    def get_control_file(self, build, control_path):
1349        """Ask the devserver for the contents of a control file.
1350
1351        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
1352                      whose control file the caller wants to fetch.
1353        @param control_path: The file to fetch
1354                             (e.g. server/site_tests/autoupdate/control)
1355        @return The contents of the desired file.
1356        @raise DevServerException upon any return code that's not HTTP OK.
1357        """
1358        build = self.translate(build)
1359        call = self.build_call('controlfiles', build=build,
1360                               control_path=control_path)
1361        return self.run_call(call)
1362
1363
1364    @remote_devserver_call()
1365    def list_suite_controls(self, build, suite_name=''):
1366        """Ask the devserver to list contents of all control files for |build|.
1367
1368        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
1369                      whose control files' contents the caller wants returned.
1370        @param suite_name: The name of the suite for which we require control
1371                           files.
1372        @return None on failure, or a dict of contents of all control files
1373            (e.g. {'path1': "#Copyright controls ***", ...,
1374                pathX': "#Copyright controls ***"}
1375        @raise DevServerException upon any return code that's not HTTP OK.
1376        """
1377        build = self.translate(build)
1378        call = self.build_call('list_suite_controls', build=build,
1379                               suite_name=suite_name)
1380        return json.load(cStringIO.StringIO(self.run_call(call)))
1381
1382
1383class ImageServer(ImageServerBase):
1384    """Class for DevServer that handles RPCs related to CrOS images.
1385
1386    The calls to devserver to stage artifacts, including stage and download, are
1387    made in async mode. That is, when caller makes an RPC |stage| to request
1388    devserver to stage certain artifacts, devserver handles the call and starts
1389    staging artifacts in a new thread, and return |Success| without waiting for
1390    staging being completed. When caller receives message |Success|, it polls
1391    devserver's is_staged call until all artifacts are staged.
1392    Such mechanism is designed to prevent cherrypy threads in devserver being
1393    running out, as staging artifacts might take long time, and cherrypy starts
1394    with a fixed number of threads that handle devserver rpc.
1395    """
1396
1397    class ArtifactUrls(object):
1398        """A container for URLs of staged artifacts.
1399
1400        Attributes:
1401            full_payload: URL for downloading a staged full release update
1402            mton_payload: URL for downloading a staged M-to-N release update
1403            nton_payload: URL for downloading a staged N-to-N release update
1404
1405        """
1406        def __init__(self, full_payload=None, mton_payload=None,
1407                     nton_payload=None):
1408            self.full_payload = full_payload
1409            self.mton_payload = mton_payload
1410            self.nton_payload = nton_payload
1411
1412
1413    def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''):
1414        """Polling devserver.is_staged until all artifacts are staged.
1415
1416        @param archive_url: Google Storage URL for the build.
1417        @param artifacts: Comma separated list of artifacts to download.
1418        @param files: Comma separated list of files to download.
1419        @return: True if all artifacts are staged in devserver.
1420        """
1421        kwargs = {'archive_url': archive_url,
1422                  'artifacts': artifacts,
1423                  'files': files}
1424        return self._poll_is_staged(**kwargs)
1425
1426
1427    @remote_devserver_call()
1428    def call_and_wait(self, call_name, archive_url, artifacts, files,
1429                      error_message, expected_response=SUCCESS):
1430        """Helper method to make a urlopen call, and wait for artifacts staged.
1431
1432        @param call_name: name of devserver rpc call.
1433        @param archive_url: Google Storage URL for the build..
1434        @param artifacts: Comma separated list of artifacts to download.
1435        @param files: Comma separated list of files to download.
1436        @param expected_response: Expected response from rpc, default to
1437                                  |Success|. If it's set to None, do not compare
1438                                  the actual response. Any response is consider
1439                                  to be good.
1440        @param error_message: Error message to be thrown if response does not
1441                              match expected_response.
1442
1443        @return: The response from rpc.
1444        @raise DevServerException upon any return code that's expected_response.
1445
1446        """
1447        kwargs = {'archive_url': archive_url,
1448                  'artifacts': artifacts,
1449                  'files': files}
1450        return self._call_and_wait(call_name, error_message,
1451                                   expected_response, **kwargs)
1452
1453
1454    @remote_devserver_call()
1455    def stage_artifacts(self, image=None, artifacts=None, files='',
1456                        archive_url=None):
1457        """Tell the devserver to download and stage |artifacts| from |image|.
1458
1459         This is the main call point for staging any specific artifacts for a
1460        given build. To see the list of artifacts one can stage see:
1461
1462        ~src/platfrom/dev/artifact_info.py.
1463
1464        This is maintained along with the actual devserver code.
1465
1466        @param image: the image to fetch and stage.
1467        @param artifacts: A list of artifacts.
1468        @param files: A list of files to stage.
1469        @param archive_url: Optional parameter that has the archive_url to stage
1470                this artifact from. Default is specified in autotest config +
1471                image.
1472
1473        @raise DevServerException upon any return code that's not HTTP OK.
1474        """
1475        if not artifacts and not files:
1476            raise DevServerException('Must specify something to stage.')
1477        image = self.translate(image)
1478        self._stage_artifacts(image, artifacts, files, archive_url)
1479
1480
1481    @remote_devserver_call(timeout_min=DEVSERVER_SSH_TIMEOUT_MINS)
1482    def list_image_dir(self, image):
1483        """List the contents of the image stage directory, on the devserver.
1484
1485        @param image: The image name, eg: <board>-<branch>/<Milestone>-<build>.
1486
1487        @raise DevServerException upon any return code that's not HTTP OK.
1488        """
1489        image = self.translate(image)
1490        logging.info('Requesting contents from devserver %s for image %s',
1491                     self.url(), image)
1492        archive_url = _get_storage_server_for_artifacts() + image
1493        call = self.build_call('list_image_dir', archive_url=archive_url)
1494        response = self.run_call(call, readline=True)
1495        for line in response:
1496            logging.info(line)
1497
1498
1499    def trigger_download(self, image, synchronous=True):
1500        """Tell the devserver to download and stage |image|.
1501
1502        Tells the devserver to fetch |image| from the image storage server
1503        named by _get_image_storage_server().
1504
1505        If |synchronous| is True, waits for the entire download to finish
1506        staging before returning. Otherwise only the artifacts necessary
1507        to start installing images onto DUT's will be staged before returning.
1508        A caller can then call finish_download to guarantee the rest of the
1509        artifacts have finished staging.
1510
1511        @param image: the image to fetch and stage.
1512        @param synchronous: if True, waits until all components of the image are
1513               staged before returning.
1514
1515        @raise DevServerException upon any return code that's not HTTP OK.
1516
1517        """
1518        image = self.translate(image)
1519        artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE
1520        self._trigger_download(image, artifacts, files='',
1521                               synchronous=synchronous)
1522
1523
1524    @remote_devserver_call()
1525    def setup_telemetry(self, build):
1526        """Tell the devserver to setup telemetry for this build.
1527
1528        The devserver will stage autotest and then extract the required files
1529        for telemetry.
1530
1531        @param build: the build to setup telemetry for.
1532
1533        @returns path on the devserver that telemetry is installed to.
1534        """
1535        build = self.translate(build)
1536        archive_url = _get_image_storage_server() + build
1537        call = self.build_call('setup_telemetry', archive_url=archive_url)
1538        try:
1539            response = self.run_call(call)
1540        except httplib.BadStatusLine as e:
1541            logging.error(e)
1542            raise DevServerException('Received Bad Status line, Devserver %s '
1543                                     'might have gone down while handling '
1544                                     'the call: %s' % (self.url(), call))
1545        return response
1546
1547
1548    def finish_download(self, image):
1549        """Tell the devserver to finish staging |image|.
1550
1551        If trigger_download is called with synchronous=False, it will return
1552        before all artifacts have been staged. This method contacts the
1553        devserver and blocks until all staging is completed and should be
1554        called after a call to trigger_download.
1555
1556        @param image: the image to fetch and stage.
1557        @raise DevServerException upon any return code that's not HTTP OK.
1558        """
1559        image = self.translate(image)
1560        artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST
1561        self._finish_download(image, artifacts, files='')
1562
1563
1564    def get_update_url(self, image):
1565        """Returns the url that should be passed to the updater.
1566
1567        @param image: the image that was fetched.
1568        """
1569        image = self.translate(image)
1570        url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
1571                                              type=str)
1572        return (url_pattern % (self.url(), image))
1573
1574
1575    def get_staged_file_url(self, filename, image):
1576        """Returns the url of a staged file for this image on the devserver."""
1577        return '/'.join([self._get_image_url(image), filename])
1578
1579
1580    def get_full_payload_url(self, image):
1581        """Returns a URL to a staged full payload.
1582
1583        @param image: the image that was fetched.
1584
1585        @return A fully qualified URL that can be used for downloading the
1586                payload.
1587
1588        """
1589        return self._get_image_url(image) + '/update.gz'
1590
1591
1592    def get_test_image_url(self, image):
1593        """Returns a URL to a staged test image.
1594
1595        @param image: the image that was fetched.
1596
1597        @return A fully qualified URL that can be used for downloading the
1598                image.
1599
1600        """
1601        return self._get_image_url(image) + '/chromiumos_test_image.bin'
1602
1603
1604    @remote_devserver_call()
1605    def get_dependencies_file(self, build):
1606        """Ask the dev server for the contents of the suite dependencies file.
1607
1608        Ask the dev server at |self._dev_server| for the contents of the
1609        pre-processed suite dependencies file (at DEPENDENCIES_FILE)
1610        for |build|.
1611
1612        @param build: The build (e.g. x86-mario-release/R21-2333.0.0)
1613                      whose dependencies the caller is interested in.
1614        @return The contents of the dependencies file, which should eval to
1615                a dict of dicts, as per bin_utils/suite_preprocessor.py.
1616        @raise DevServerException upon any return code that's not HTTP OK.
1617        """
1618        build = self.translate(build)
1619        call = self.build_call('controlfiles',
1620                               build=build, control_path=DEPENDENCIES_FILE)
1621        return self.run_call(call)
1622
1623
1624    @remote_devserver_call()
1625    def get_latest_build_in_gs(self, board):
1626        """Ask the devservers for the latest offical build in Google Storage.
1627
1628        @param board: The board for who we want the latest official build.
1629        @return A string of the returned build rambi-release/R37-5868.0.0
1630        @raise DevServerException upon any return code that's not HTTP OK.
1631        """
1632        call = self.build_call(
1633                'xbuddy_translate/remote/%s/latest-official' % board,
1634                image_dir=_get_image_storage_server())
1635        image_name = self.run_call(call)
1636        return os.path.dirname(image_name)
1637
1638
1639    def translate(self, build_name):
1640        """Translate the build name if it's in LATEST format.
1641
1642        If the build name is in the format [builder]/LATEST, return the latest
1643        build in Google Storage otherwise return the build name as is.
1644
1645        @param build_name: build_name to check.
1646
1647        @return The actual build name to use.
1648        """
1649        match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I)
1650        if not match:
1651            return build_name
1652        translated_build = self.get_latest_build_in_gs(match.groups()[0])
1653        logging.debug('Translated relative build %s to %s', build_name,
1654                      translated_build)
1655        return translated_build
1656
1657
1658    @classmethod
1659    @remote_devserver_call()
1660    def get_latest_build(cls, target, milestone=''):
1661        """Ask all the devservers for the latest build for a given target.
1662
1663        @param target: The build target, typically a combination of the board
1664                       and the type of build e.g. x86-mario-release.
1665        @param milestone:  For latest build set to '', for builds only in a
1666                           specific milestone set to a str of format Rxx
1667                           (e.g. R16). Default: ''. Since we are dealing with a
1668                           webserver sending an empty string, '', ensures that
1669                           the variable in the URL is ignored as if it was set
1670                           to None.
1671        @return A string of the returned build e.g. R20-2226.0.0.
1672        @raise DevServerException upon any return code that's not HTTP OK.
1673        """
1674        calls = cls.build_all_calls('latestbuild', target=target,
1675                                    milestone=milestone)
1676        latest_builds = []
1677        for call in calls:
1678            latest_builds.append(cls.run_call(call))
1679
1680        return max(latest_builds, key=version.LooseVersion)
1681
1682
1683    @remote_devserver_call()
1684    def _kill_au_process_for_host(self, **kwargs):
1685        """Kill the triggerred auto_update process if error happens in cros_au.
1686
1687        @param kwargs: Arguments to make kill_au_proc devserver call.
1688        """
1689        call = self.build_call('kill_au_proc', **kwargs)
1690        response = self.run_call(call)
1691        if not response == 'True':
1692            raise DevServerException(
1693                    'Failed to kill the triggerred CrOS auto_update process'
1694                    'on devserver %s, the response is %s' % (
1695                            self.url(), response))
1696
1697
1698    def kill_au_process_for_host(self, host_name, pid):
1699        """Kill the triggerred auto_update process if error happens.
1700
1701        Usually this function is used to clear all potential left au processes
1702        of the given host name.
1703
1704        If pid is specified, the devserver will further check the given pid to
1705        make sure the process is killed. This is used for the case that the au
1706        process has started in background, but then provision fails due to
1707        some unknown issues very fast. In this case, when 'kill_au_proc' is
1708        called, there's no corresponding background track log created for this
1709        ongoing au process, which prevents this RPC call from killing this au
1710        process.
1711
1712        @param host_name: The DUT's hostname.
1713        @param pid: The ongoing au process's pid.
1714
1715        @return: True if successfully kill the auto-update process for host.
1716        """
1717        kwargs = {'host_name': host_name, 'pid': pid}
1718        try:
1719            self._kill_au_process_for_host(**kwargs)
1720        except DevServerException:
1721            return False
1722
1723        return True
1724
1725
1726    @remote_devserver_call()
1727    def _clean_track_log(self, **kwargs):
1728        """Clean track log for the current auto-update process."""
1729        call = self.build_call('handler_cleanup', **kwargs)
1730        self.run_call(call)
1731
1732
1733    def clean_track_log(self, host_name, pid):
1734        """Clean track log for the current auto-update process.
1735
1736        @param host_name: The host name to be updated.
1737        @param pid: The auto-update process id.
1738
1739        @return: True if track log is successfully cleaned, False otherwise.
1740        """
1741        if not pid:
1742            return False
1743
1744        kwargs = {'host_name': host_name, 'pid': pid}
1745        try:
1746            self._clean_track_log(**kwargs)
1747        except DevServerException as e:
1748            logging.debug('Failed to clean track_status_file on '
1749                          'devserver for host %s and process id %s: %s',
1750                          host_name, pid, str(e))
1751            return False
1752
1753        return True
1754
1755
1756    def _get_au_log_filename(self, log_dir, host_name, pid):
1757        """Return the auto-update log's filename."""
1758        return os.path.join(log_dir, CROS_AU_LOG_FILENAME % (
1759                    host_name, pid))
1760
1761    def _read_json_response_from_devserver(self, response):
1762        """Reads the json response from the devserver.
1763
1764        This is extracted to its own function so that it can be easily mocked.
1765        @param response: the response for a devserver.
1766        """
1767        try:
1768            return json.loads(response)
1769        except ValueError as e:
1770            logging.debug('Failed to load json response: %s', response)
1771            raise DevServerException(e)
1772
1773
1774    @remote_devserver_call()
1775    def _collect_au_log(self, log_dir, **kwargs):
1776        """Collect logs from devserver after cros-update process is finished.
1777
1778        Collect the logs that recording the whole cros-update process, and
1779        write it to sysinfo path of a job.
1780
1781        The example log file name that is stored is like:
1782            '1220-repair/sysinfo/CrOS_update_host_name_pid.log'
1783
1784        @param host_name: the DUT's hostname.
1785        @param pid: the auto-update process id on devserver.
1786        @param log_dir: The directory to save the cros-update process log
1787                        retrieved from devserver.
1788        """
1789        call = self.build_call('collect_cros_au_log', **kwargs)
1790        response = self.run_call(call)
1791        if not os.path.exists(log_dir):
1792            os.mkdir(log_dir)
1793        write_file = self._get_au_log_filename(
1794                log_dir, kwargs['host_name'], kwargs['pid'])
1795        logging.debug('Saving auto-update logs into %s', write_file)
1796
1797        au_logs = self._read_json_response_from_devserver(response)
1798
1799        try:
1800            for k, v in au_logs['host_logs'].items():
1801                log_name = '%s_%s_%s' % (k, kwargs['host_name'], kwargs['pid'])
1802                log_path = os.path.join(log_dir, log_name)
1803                with open(log_path, 'w') as out_log:
1804                    out_log.write(v)
1805        except IOError as e:
1806            raise DevServerException('Failed to write auto-update hostlogs: '
1807                                     '%s' % e)
1808
1809        try:
1810            with open(write_file, 'w') as out_log:
1811                out_log.write(au_logs['cros_au_log'])
1812        except:
1813            raise DevServerException('Failed to write auto-update logs into '
1814                                     '%s' % write_file)
1815
1816
1817    def collect_au_log(self, host_name, pid, log_dir):
1818        """Collect logs from devserver after cros-update process is finished.
1819
1820        @param host_name: the DUT's hostname.
1821        @param pid: the auto-update process id on devserver.
1822        @param log_dir: The directory to save the cros-update process log
1823                        retrieved from devserver.
1824
1825        @return: True if auto-update log is successfully collected, False
1826          otherwise.
1827        """
1828        if not pid:
1829            return False
1830
1831        kwargs = {'host_name': host_name, 'pid': pid}
1832        try:
1833            self._collect_au_log(log_dir, **kwargs)
1834        except DevServerException as e:
1835            logging.debug('Failed to collect auto-update log on '
1836                          'devserver for host %s and process id %s: %s',
1837                          host_name, pid, str(e))
1838            return False
1839
1840        return True
1841
1842
1843    @remote_devserver_call()
1844    def _trigger_auto_update(self, **kwargs):
1845        """Trigger auto-update by calling devserver.cros_au.
1846
1847        @param kwargs:  Arguments to make cros_au devserver call.
1848
1849        @return: a tuple indicates whether the RPC call cros_au succeeds and
1850          the auto-update process id running on devserver.
1851        """
1852        host_name = kwargs['host_name']
1853        call = self.build_call('cros_au', async=True, **kwargs)
1854        try:
1855            response = self.run_call(call)
1856            logging.info(
1857                'Received response from devserver for cros_au call: %r',
1858                response)
1859        except httplib.BadStatusLine as e:
1860            logging.error(e)
1861            raise DevServerException('Received Bad Status line, Devserver %s '
1862                                     'might have gone down while handling '
1863                                     'the call: %s' % (self.url(), call))
1864
1865        return response
1866
1867
1868    def _check_for_auto_update_finished(self, pid, wait=True, **kwargs):
1869        """Polling devserver.get_au_status to get current auto-update status.
1870
1871        The current auto-update status is used to identify whether the update
1872        process is finished.
1873
1874        @param pid:    The background process id for auto-update in devserver.
1875        @param kwargs: keyword arguments to make get_au_status devserver call.
1876        @param wait:   Should the check wait for completion.
1877
1878        @return: True if auto-update is finished for a given dut.
1879        """
1880        logging.debug('Check the progress for auto-update process %r', pid)
1881        kwargs['pid'] = pid
1882        call = self.build_call('get_au_status', **kwargs)
1883
1884        def all_finished():
1885            """Call devserver.get_au_status rpc to check if auto-update
1886               is finished.
1887
1888            @return: True if auto-update is finished for a given dut. False
1889                     otherwise.
1890            @rasies  DevServerException, the exception is a wrapper of all
1891                     exceptions that were raised when devserver tried to
1892                     download the artifacts. devserver raises an HTTPError or
1893                     a CmdError when an exception was raised in the code. Such
1894                     exception should be re-raised here to stop the caller from
1895                     waiting. If the call to devserver failed for connection
1896                     issue, a URLError exception is raised, and caller should
1897                     retry the call to avoid such network flakiness.
1898
1899            """
1900            try:
1901                au_status = self.run_call(call)
1902                response = json.loads(au_status)
1903                # This is a temp fix to fit both dict and tuple returning
1904                # values. The dict check will be removed after a corresponding
1905                # devserver CL is deployed.
1906                if isinstance(response, dict):
1907                    if response.get('detailed_error_msg'):
1908                        raise DevServerException(
1909                                response.get('detailed_error_msg'))
1910
1911                    if response.get('finished'):
1912                        logging.debug('CrOS auto-update is finished')
1913                        return True
1914                    else:
1915                        logging.debug('Current CrOS auto-update status: %s',
1916                                      response.get('status'))
1917                        return False
1918
1919                if not response[0]:
1920                    logging.debug('Current CrOS auto-update status: %s',
1921                                  response[1])
1922                    return False
1923                else:
1924                    logging.debug('CrOS auto-update is finished')
1925                    return True
1926            except urllib2.HTTPError as e:
1927                error_markup = e.read()
1928                raise DevServerException(_strip_http_message(error_markup))
1929            except urllib2.URLError as e:
1930                # Could be connection issue, retry it.
1931                # For example: <urlopen error [Errno 111] Connection refused>
1932                logging.warning('URLError (%r): Retrying connection to '
1933                                'devserver to check auto-update status.', e)
1934                return False
1935            except error.CmdError:
1936                # Retry if SSH failed to connect to the devserver.
1937                logging.warning('CmdError: Retrying SSH connection to check '
1938                                'auto-update status.')
1939                return False
1940            except socket.error as e:
1941                # Could be some temporary devserver connection issues.
1942                logging.warning('Socket Error (%r): Retrying connection to '
1943                                'devserver to check auto-update status.', e)
1944                return False
1945            except ValueError as e:
1946                raise DevServerException(
1947                        '%s (Got AU status: %r)' % (str(e), au_status))
1948
1949        if wait:
1950            bin_utils.poll_for_condition(
1951                    all_finished,
1952                    exception=bin_utils.TimeoutError(),
1953                    timeout=DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN * 60,
1954                    sleep_interval=CROS_AU_POLLING_INTERVAL)
1955
1956            return True
1957        else:
1958            return all_finished()
1959
1960
1961    def check_for_auto_update_finished(self, response, wait=True, **kwargs):
1962        """Processing response of 'cros_au' and polling for auto-update status.
1963
1964        Will wait for the whole auto-update process is finished.
1965
1966        @param response: The response from RPC 'cros_au'
1967        @param kwargs: keyword arguments to make get_au_status devserver call.
1968
1969        @return: a tuple includes two elements.
1970          finished: True if the operation has completed.
1971          raised_error: None if everything works well or the raised error.
1972          pid: the auto-update process id on devserver.
1973        """
1974
1975        pid = 0
1976        raised_error = None
1977        finished = False
1978        try:
1979            response = json.loads(response)
1980            if response[0]:
1981                pid = response[1]
1982                # If provision is kicked off asynchronously, pid will be -1.
1983                # If provision is not successfully kicked off , pid continues
1984                # to be 0.
1985                if pid > 0:
1986                    logging.debug('start process %r for auto_update in '
1987                                  'devserver', pid)
1988                    finished = self._check_for_auto_update_finished(
1989                            pid, wait=wait, **kwargs)
1990        except Exception as e:
1991            logging.debug('Failed to trigger auto-update process on devserver')
1992            finished = True
1993            raised_error = e
1994        finally:
1995            return finished, raised_error, pid
1996
1997
1998    def _parse_AU_error(self, response):
1999        """Parse auto_update error returned from devserver."""
2000        return re.split('\n', response)[-1]
2001
2002
2003    def _classify_exceptions(self, target_error):
2004        """Parse the error that was raised from auto_update.
2005
2006        @param target_error: A single string representing one time provision
2007            error happened in auto_update().
2008
2009        @return: If target_error is empty, return None. Otherwise, return a
2010            classified exception type (string) from _EXCEPTION_PATTERNS
2011            or 'Unknown exception'. Current patterns in _EXCEPTION_PATTERNS are
2012            very specific so that errors cannot match more than one pattern.
2013        """
2014        if not target_error:
2015            return None
2016
2017        for err_pattern, classification in _EXCEPTION_PATTERNS:
2018            match = re.match(err_pattern, target_error)
2019            if match:
2020                return classification
2021
2022        return '(0) Unknown exception'
2023
2024
2025    def _check_error_message(self, error_patterns_to_check, error_msg):
2026        """Detect whether specific error pattern exist in error message.
2027
2028        @param error_patterns_to_check: the error patterns to check
2029        @param error_msg: the error message which may include any error
2030                          pattern.
2031
2032        @return A boolean variable, True if error_msg contains any error
2033            pattern in error_patterns_to_check, False otherwise.
2034        """
2035        for err in error_patterns_to_check:
2036            if err in error_msg:
2037                return True
2038
2039        return False
2040
2041
2042    def _is_retryable(self, error_msg):
2043        """Detect whether we will retry auto-update based on error_msg.
2044
2045        @param error_msg: The given error message.
2046
2047        @return A boolean variable which indicates whether we will retry
2048            auto_update with another devserver based on the given error_msg.
2049        """
2050        # For now we just hard-code the error message we think it's suspicious.
2051        # When we get more date about what's the json response when devserver
2052        # is overloaded, we can update this part.
2053        retryable_error_patterns = [ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE,
2054                                    'is not pingable']
2055        return self._check_error_message(retryable_error_patterns, error_msg)
2056
2057
2058    def _should_use_original_payload(self, error_msg):
2059        devserver_error_patterns = ['DevserverCannotStartError']
2060        return self._check_error_message(devserver_error_patterns, error_msg)
2061
2062
2063    def _parse_buildname_safely(self, build_name):
2064        """Parse a given buildname safely.
2065
2066        @param build_name: the build name to be parsed.
2067
2068        @return: a tuple (board, build_type, milestone)
2069        """
2070        try:
2071            board, build_type, milestone, _ = server_utils.ParseBuildName(
2072                    build_name)
2073        except server_utils.ParseBuildNameException:
2074            logging.warning('Unable to parse build name %s for metrics. '
2075                            'Continuing anyway.', build_name)
2076            board, build_type, milestone = ('', '', '')
2077
2078        return board, build_type, milestone
2079
2080
2081    def _emit_auto_update_metrics(self, error_list, duration_list,
2082                                  is_au_success, board, build_type, milestone,
2083                                  dut_host_name, is_aue2etest):
2084        """Send metrics for auto_update.
2085
2086        Please note: to avoid reaching or exceeding the monarch field
2087        cardinality limit, we avoid a metric that includes both dut hostname
2088        and other high cardinality fields.
2089
2090        @param error_list: a list of errors happened in provision. Usually it
2091            contains 1 ~ AU_RETRY_LIMIT errors since we only retry provision
2092            for several times.
2093        @param duration_list: a list of provision duration time, counted by
2094            seconds.
2095        @param is_au_success: a field in metrics, representing whether this
2096            auto_update succeeds or not.
2097        @param board: a field in metrics representing which board this
2098            auto_update tries to update.
2099        @param build_type: a field in metrics representing which build type this
2100            auto_update tries to update.
2101        @param milestone: a field in metrics representing which milestone this
2102            auto_update tries to update.
2103        @param dut_host_name: a field in metrics representing which DUT this
2104            auto_update tries to update.
2105        @param is_aue2etest: a field in metrics representing if provision was
2106            done as part of the autoupdate_EndToEndTest.
2107        """
2108        # Per-devserver cros_update metric.
2109        c1 = metrics.Counter(
2110                'chromeos/autotest/provision/cros_update_per_devserver')
2111        f1 = {'dev_server': self.resolved_hostname,
2112              'success': is_au_success,
2113              'board': board,
2114              'build_type': build_type,
2115              'milestone': milestone,
2116              'error': '',
2117              'is_aue2etest': is_aue2etest}
2118        # Per-DUT cros_update metric.
2119        c2 = metrics.Counter('chromeos/autotest/provision/cros_update_by_dut')
2120        f2 = {'success': is_au_success,
2121              'board': board,
2122              'dut_host_name': dut_host_name,
2123              'error': '',
2124              'is_aue2etest': is_aue2etest}
2125        # Per auto_update metric.
2126        c3 = metrics.Counter(
2127                'chromeos/autotest/provision/cros_update_failure_by_devserver')
2128        f3 = {'dev_server': self.resolved_hostname,
2129              'board': board,
2130              'build_type': build_type,
2131              'milestone': milestone,
2132              'error': '',
2133              'is_aue2etest': is_aue2etest}
2134        # Per auto_update duration metric.
2135        c4 = metrics.Counter(
2136                'chromeos/autotest/provision/auto_update_duration_by_devserver')
2137        c5 = metrics.Counter(
2138                'chromeos/autotest/provision/provision_duration_by_devserver')
2139        f_duration = {'dev_server': self.resolved_hostname,
2140                      'success': is_au_success,
2141                      'board': board,
2142                      'build_type': build_type,
2143                      'milestone': milestone,
2144                      'duration_seconds': 0,
2145                      'is_aue2etest': is_aue2etest}
2146
2147        # Add a field |error| here. Current error's pattern is manually
2148        # specified in _EXCEPTION_PATTERNS.
2149        if not error_list:
2150            c1.increment(fields=f1)
2151            c2.increment(fields=f2)
2152        else:
2153            # In metrics, use the first error as the real provision errors.
2154            raised_error = str(self._classify_exceptions(error_list[0]))
2155            f1['error'] = raised_error
2156            c1.increment(fields=f1)
2157            f2['error'] = raised_error
2158            c2.increment(fields=f2)
2159
2160            # Record all errors in metrics cros_update_failure_by_devserver,
2161            # to make it truly reflect whether there're some particular errors
2162            # hit lab frequently. Previously, all errors raised in the second
2163            # try of auto_update will be eaten.
2164            for err in error_list:
2165                f3['error'] = str(self._classify_exceptions(err))
2166                c3.increment(fields=f3)
2167
2168        total_provision_duration = 0
2169        for per_au_duration in duration_list:
2170            total_provision_duration += per_au_duration
2171            f_duration['duration_seconds'] = per_au_duration
2172            c4.increment(fields=f_duration)
2173
2174        f_duration['duration_seconds'] = total_provision_duration
2175        c5.increment(fields=f_duration)
2176
2177    def _parse_buildname_from_gs_uri(self, uri):
2178        """Get parameters needed for AU metrics when build_name is not known.
2179
2180        autoupdate_EndToEndTest is run with two Google Storage URIs from the
2181        gs://chromeos-releases bucket. URIs in this bucket do not have the
2182        build_name in the format samus-release/R60-0000.0.0.
2183
2184        We can get the milestone and board by checking the instructions.json
2185        file contained in the bucket with the payloads.
2186
2187        @param uri: The partial uri we received from autoupdate_EndToEndTest.
2188        """
2189        try:
2190            # Get the instructions file that contains info about the build.
2191            gs_file = 'gs://chromeos-releases/' + uri + '/*instructions.json'
2192            files = bin_utils.gs_ls(gs_file)
2193            for f in files:
2194                gs_folder, _, instruction_file = f.rpartition('/')
2195                self.stage_artifacts(image=uri,
2196                                     files=[instruction_file],
2197                                     archive_url=gs_folder)
2198                json_file = self.get_staged_file_url(instruction_file, uri)
2199                response = urllib2.urlopen(json_file)
2200                data = json.load(response)
2201                return data['board'], 'release', data['version']['milestone']
2202        except (ValueError, error.CmdError, urllib2.URLError) as e:
2203            logging.debug('Problem getting values for metrics: %s', e)
2204            logging.warning('Unable to parse build name %s from AU test for '
2205                            'metrics. Continuing anyway.', uri)
2206
2207        return '', '', ''
2208
2209
2210    def auto_update(self, host_name, build_name, original_board=None,
2211                    original_release_version=None, log_dir=None,
2212                    force_update=False, full_update=False,
2213                    payload_filename=None, force_original=False,
2214                    clobber_stateful=True, quick_provision=False):
2215        """Auto-update a CrOS host.
2216
2217        @param host_name: The hostname of the DUT to auto-update.
2218        @param build_name:  The build name to be auto-updated on the DUT.
2219        @param original_board: The original board of the DUT to auto-update.
2220        @param original_release_version: The release version of the DUT's
2221            current build.
2222        @param log_dir: The log directory to store auto-update logs from
2223            devserver.
2224        @param force_update: Force an update even if the version installed
2225                             is the same. Default: False.
2226        @param full_update:  If True, do not run stateful update, directly
2227                             force a full reimage. If False, try stateful
2228                             update first if the dut is already installed
2229                             with the same version.
2230        @param payload_filename: Used to specify the exact file to
2231                                 use for autoupdating. If None, the payload
2232                                 will be determined by build_name. You
2233                                 must have already staged this file before
2234                                 passing it in here.
2235        @param force_original: Whether to force stateful update with the
2236                               original payload.
2237        @param clobber_stateful: If True do a clean install of stateful.
2238        @param quick_provision: Attempt to use quick provision path first.
2239
2240        @return A set (is_success, pid) in which:
2241            1. is_success indicates whether this auto_update succeeds.
2242            2. pid is the process id of the successful autoupdate run.
2243
2244        @raise DevServerException if auto_update fails and is not retryable.
2245        @raise RetryableProvisionException if it fails and is retryable.
2246        """
2247        kwargs = {'host_name': host_name,
2248                  'build_name': build_name,
2249                  'force_update': force_update,
2250                  'full_update': full_update,
2251                  'clobber_stateful': clobber_stateful,
2252                  'quick_provision': quick_provision}
2253
2254        is_aue2etest = payload_filename is not None
2255
2256        if is_aue2etest:
2257            kwargs['payload_filename'] = payload_filename
2258
2259        error_msg = 'CrOS auto-update failed for host %s: %s'
2260        error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s'
2261        is_au_success = False
2262        au_log_dir = os.path.join(log_dir,
2263                                  AUTO_UPDATE_LOG_DIR) if log_dir else None
2264        error_list = []
2265        retry_with_another_devserver = False
2266        duration_list = []
2267
2268        if is_aue2etest:
2269            board, build_type, milestone = self._parse_buildname_from_gs_uri(
2270                build_name)
2271        else:
2272            board, build_type, milestone = self._parse_buildname_safely(
2273                build_name)
2274
2275        for au_attempt in range(AU_RETRY_LIMIT):
2276            logging.debug('Start CrOS auto-update for host %s at %d time(s).',
2277                          host_name, au_attempt + 1)
2278            start_time = time.time()
2279            # No matter _trigger_auto_update succeeds or fails, the auto-update
2280            # track_status_file should be cleaned, and the auto-update execute
2281            # log should be collected to directory sysinfo. Also, the error
2282            # raised by _trigger_auto_update should be displayed.
2283            try:
2284                # Try update with stateful.tgz of old release version in the
2285                # last try of auto-update.
2286                if force_original and original_release_version:
2287                    # Monitor this case in monarch
2288                    original_build = '%s/%s' % (original_board,
2289                                                original_release_version)
2290                    c = metrics.Counter(
2291                            'chromeos/autotest/provision/'
2292                            'cros_update_with_original_build')
2293                    f = {'dev_server': self.resolved_hostname,
2294                         'board': board,
2295                         'build_type': build_type,
2296                         'milestone': milestone,
2297                         'original_build': original_build}
2298                    c.increment(fields=f)
2299
2300                    logging.debug('Try updating stateful partition of the '
2301                                  'host with the same version of its current '
2302                                  'rootfs partition: %s', original_build)
2303                    response = self._trigger_auto_update(
2304                            original_build=original_build, **kwargs)
2305                else:
2306                    response = self._trigger_auto_update(**kwargs)
2307            except DevServerException as e:
2308                logging.debug(error_msg_attempt, au_attempt+1, str(e))
2309                error_list.append(str(e))
2310            else:
2311                _, raised_error, pid = self.check_for_auto_update_finished(
2312                        response, **kwargs)
2313
2314                # Error happens in _collect_au_log won't be raised.
2315                if au_log_dir:
2316                    is_collect_success = self.collect_au_log(
2317                            kwargs['host_name'], pid, au_log_dir)
2318                else:
2319                    is_collect_success = True
2320
2321                # Error happens in _clean_track_log won't be raised.
2322                if pid >= 0:
2323                    is_clean_success = self.clean_track_log(
2324                            kwargs['host_name'], pid)
2325                else:
2326                    is_clean_success = True
2327
2328                # If any error is raised previously, log it and retry
2329                # auto-update. Otherwise, claim a successful CrOS auto-update.
2330                if (not raised_error and is_clean_success and
2331                    is_collect_success):
2332                    logging.debug('CrOS auto-update succeed for host %s',
2333                                  host_name)
2334                    is_au_success = True
2335                    break
2336                else:
2337                    if not self.kill_au_process_for_host(kwargs['host_name'],
2338                                                         pid):
2339                        logging.debug('Failed to kill auto_update process %d',
2340                                      pid)
2341                    if raised_error:
2342                        logging.debug(error_msg_attempt, au_attempt+1,
2343                                      str(raised_error))
2344                        if au_log_dir:
2345                            logging.debug('Please see error details in log %s',
2346                                          self._get_au_log_filename(
2347                                                  au_log_dir,
2348                                                  kwargs['host_name'],
2349                                                  pid))
2350                        error_list.append(self._parse_AU_error(str(raised_error)))
2351                        if self._is_retryable(str(raised_error)):
2352                            retry_with_another_devserver = True
2353
2354                        if self._should_use_original_payload(str(raised_error)):
2355                            force_original = True
2356
2357            finally:
2358                duration_list.append(int(time.time() - start_time))
2359                if retry_with_another_devserver:
2360                    break
2361
2362                if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1:
2363                    time.sleep(CROS_AU_RETRY_INTERVAL)
2364                    # Use the IP of DUT if the hostname failed.
2365                    host_name_ip = socket.gethostbyname(host_name)
2366                    kwargs['host_name'] = host_name_ip
2367                    logging.debug(
2368                            'AU failed, trying IP instead of hostname: %s',
2369                            host_name_ip)
2370
2371        self._emit_auto_update_metrics(error_list, duration_list, is_au_success,
2372                                       board, build_type, milestone, host_name,
2373                                       is_aue2etest)
2374
2375        if is_au_success:
2376            return (is_au_success, pid)
2377
2378        # If errors happen in the CrOS AU process, report the concatenation
2379        # of the errors happening in first & second provision.
2380        # If error happens in RPCs of cleaning track log, collecting
2381        # auto-update logs, or killing auto-update processes, just report a
2382        # common error here.
2383        if error_list:
2384            real_error = ''
2385            for i in range(len(error_list)):
2386                real_error += '%d) %s, ' % (
2387                        i, ' '.join(error_list[i].splitlines()))
2388            if retry_with_another_devserver:
2389                raise RetryableProvisionException(
2390                        error_msg % (host_name, real_error))
2391            else:
2392                raise DevServerException(
2393                        error_msg % (host_name, real_error))
2394        else:
2395            raise DevServerException(error_msg % (
2396                        host_name, ('RPC calls after the whole auto-update '
2397                                    'process failed.')))
2398
2399
class AndroidBuildServer(ImageServerBase):
    """Class for DevServer that handles RPCs related to Android builds.

    The calls to devserver to stage artifacts, including stage and download, are
    made in async mode. That is, when caller makes an RPC |stage| to request
    devserver to stage certain artifacts, devserver handles the call and starts
    staging artifacts in a new thread, and return |Success| without waiting for
    staging being completed. When caller receives message |Success|, it polls
    devserver's is_staged call until all artifacts are staged.
    Such mechanism is designed to prevent cherrypy threads in devserver being
    running out, as staging artifacts might take long time, and cherrypy starts
    with a fixed number of threads that handle devserver rpc.
    """

    def wait_for_artifacts_staged(self, target, build_id, branch,
                                  archive_url=None, artifacts='', files=''):
        """Polling devserver.is_staged until all artifacts are staged.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param archive_url: Google Storage URL for the build.
        @param artifacts: Comma separated list of artifacts to download.
        @param files: Comma separated list of files to download.

        @return: True if all artifacts are staged in devserver.
        """
        kwargs = {'target': target,
                  'build_id': build_id,
                  'branch': branch,
                  'artifacts': artifacts,
                  'files': files,
                  'os_type': 'android'}
        # archive_url is only sent when the caller supplied one.
        if archive_url:
            kwargs['archive_url'] = archive_url
        return self._poll_is_staged(**kwargs)


    @remote_devserver_call()
    def call_and_wait(self, call_name, target, build_id, branch, archive_url,
                      artifacts, files, error_message,
                      expected_response=SUCCESS):
        """Helper method to make a urlopen call, and wait for artifacts staged.

        @param call_name: name of devserver rpc call.
        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param archive_url: Google Storage URL for the CrOS build.
        @param artifacts: Comma separated list of artifacts to download.
        @param files: Comma separated list of files to download.
        @param expected_response: Expected response from rpc, default to
                                  |Success|. If it's set to None, do not compare
                                  the actual response. Any response is consider
                                  to be good.
        @param error_message: Error message to be thrown if response does not
                              match expected_response.

        @return: The response from rpc.
        @raise DevServerException upon any return code that does not match
               expected_response.

        """
        kwargs = {'target': target,
                  'build_id': build_id,
                  'branch': branch,
                  'artifacts': artifacts,
                  'files': files,
                  'os_type': 'android'}
        # archive_url is only sent when the caller supplied one.
        if archive_url:
            kwargs['archive_url'] = archive_url
        return self._call_and_wait(call_name, error_message, expected_response,
                                   **kwargs)


    @remote_devserver_call()
    def stage_artifacts(self, target=None, build_id=None, branch=None,
                        image=None, artifacts=None, files='', archive_url=None):
        """Tell the devserver to download and stage |artifacts| from |image|.

        This is the main call point for staging any specific artifacts for a
        given build. To see the list of artifacts one can stage see:

        ~src/platform/dev/artifact_info.py.

        This is maintained along with the actual devserver code.

        @param target: Target of the android build to stage, e.g.,
                               shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param image: Name of a build to test, in the format of
                      branch/target/build_id. Only parsed when none of
                      target, build_id and branch is given.
        @param artifacts: A list of artifacts.
        @param files: A list of files to stage.
        @param archive_url: Optional parameter that has the archive_url to stage
                this artifact from. Default is specified in autotest config +
                image.

        @raise DevServerException if the build info is incomplete, if there
               is nothing to stage, or upon any return code that's not
               HTTP OK.
        """
        if image and not target and not build_id and not branch:
            branch, target, build_id = utils.parse_launch_control_build(image)
        if not target or not build_id or not branch:
            raise DevServerException('Must specify all build info (target, '
                                     'build_id and branch) to stage.')
        if not artifacts and not files:
            raise DevServerException('Must specify something to stage.')

        # All three values are known to be non-empty at this point.
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        self._stage_artifacts(build, artifacts, files, archive_url,
                              **android_build_info)

    def get_pull_url(self, target, build_id, branch):
        """Get the url to pull files from the devserver.

        @param target: Target of the android build, e.g., shamu_userdebug
        @param build_id: Build id of the android build.
        @param branch: Branch of the android build.

        @return A url to pull files from the dev server given a specific
                android build.
        """
        return os.path.join(self.url(), 'static', branch, target, build_id)


    def trigger_download(self, target, build_id, branch, artifacts=None,
                         files='', os='android', synchronous=True):
        """Tell the devserver to download and stage an Android build.

        Tells the devserver to fetch an Android build from the image storage
        server named by _get_image_storage_server().

        If |synchronous| is True, waits for the entire download to finish
        staging before returning. Otherwise only the artifacts necessary
        to start installing images onto DUT's will be staged before returning.
        A caller can then call finish_download to guarantee the rest of the
        artifacts have finished staging.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param artifacts: A string of artifacts separated by comma. If None,
               use the default artifacts for Android or Brillo build.
        @param files: String of file separated by commas.
        @param os: OS artifacts to download (android/brillo). Note that this
               parameter shadows the os module inside this method.
        @param synchronous: if True, waits until all components of the image are
               staged before returning.

        @raise DevServerException upon any return code that's not HTTP OK.

        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        if not artifacts:
            # The board is the part of the target before the first dash,
            # e.g. 'shamu' for 'shamu-userdebug'.
            board = target.split('-')[0]
            artifacts = (
                android_utils.AndroidArtifacts.get_artifacts_for_reimage(
                        board, os))
        self._trigger_download(build, artifacts, files=files,
                               synchronous=synchronous, **android_build_info)


    def finish_download(self, target, build_id, branch, os='android'):
        """Tell the devserver to finish staging an Android build.

        If trigger_download is called with synchronous=False, it will return
        before all artifacts have been staged. This method contacts the
        devserver and blocks until all staging is completed and should be
        called after a call to trigger_download.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param os: OS artifacts to download (android/brillo). Note that this
               parameter shadows the os module inside this method.

        @raise DevServerException upon any return code that's not HTTP OK.
        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        board = target.split('-')[0]
        # Pass |os| through so the artifact list matches the one
        # trigger_download picks by default for the same OS.
        artifacts = (
                android_utils.AndroidArtifacts.get_artifacts_for_reimage(
                        board, os))
        self._finish_download(build, artifacts, files='', **android_build_info)


    def get_staged_file_url(self, filename, target, build_id, branch):
        """Returns the url of a staged file for this image on the devserver.

        @param filename: Name of the file.
        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.

        @return: The url of a staged file for this image on the devserver.
        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch,
                              'os_type': 'android'}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        return '/'.join([self._get_image_url(build), filename])


    @remote_devserver_call()
    def translate(self, build_name):
        """Translate the build name if it's in LATEST format.

        If the build name is in the format [branch]/[target]/LATEST, return the
        latest build in Launch Control otherwise return the build name as is.

        @param build_name: build_name to check.

        @return The actual build name to use.
        """
        branch, target, build_id = utils.parse_launch_control_build(build_name)
        # The LATEST marker is matched case-insensitively.
        if build_id.upper() != 'LATEST':
            return build_name
        call = self.build_call('latestbuild', branch=branch, target=target,
                               os_type='android')
        translated_build_id = self.run_call(call)
        translated_build = (ANDROID_BUILD_NAME_PATTERN %
                            {'branch': branch,
                             'target': target,
                             'build_id': translated_build_id})
        logging.debug('Translated relative build %s to %s', build_name,
                      translated_build)
        return translated_build
2644
2645
def _is_load_healthy(load):
    """Tell whether a devserver's load is within the allowed thresholds.

    The CPU load is checked first, then the network IO. The first value
    found above its DevServer maximum is logged and fails the check.

    @param load: The devserver's load stats to check.

    @return: False if the CPU load or the network IO exceeds its threshold,
             True otherwise.

    """
    cpu_load = load[DevServer.CPU_LOAD]
    if cpu_load > DevServer.MAX_CPU_LOAD:
        logging.debug('CPU load of devserver %s is at %s%%, which is higher '
                      'than the threshold of %s%%',
                      load['devserver'], cpu_load, DevServer.MAX_CPU_LOAD)
        return False

    # Network IO is only looked up once the CPU check has passed.
    network_io = load[DevServer.NETWORK_IO]
    if network_io > DevServer.MAX_NETWORK_IO:
        logging.debug('Network IO of devserver %s is at %i Bps, which is '
                      'higher than the threshold of %i bytes per second.',
                      load['devserver'], network_io,
                      DevServer.MAX_NETWORK_IO)
        return False

    return True
2668
2669
def _compare_load(devserver1, devserver2):
    """Comparator ordering two devservers by their disk IO load.

    @param devserver1: A dictionary of devserver load stats to be compared.
    @param devserver2: A dictionary of devserver load stats to be compared.

    @return: The disk IO of `devserver1` minus the disk IO of `devserver2`,
             converted with int(): negative if `devserver1` is less loaded,
             positive if it is more loaded, and 0 when the difference
             truncates to zero.

    """
    disk_io_delta = (devserver1[DevServer.DISK_IO] -
                     devserver2[DevServer.DISK_IO])
    return int(disk_io_delta)
2681
2682
def _get_subnet_for_host_ip(host_ip,
                            restricted_subnets=utils.RESTRICTED_SUBNETS):
    """Find the restricted subnet that contains a given host IP.

    The subnets are checked in order and the first match wins.

    @param host_ip: the IP of a DUT.
    @param restricted_subnets: A list of restricted subnets, each one a
            (subnet_ip, mask_bits) pair.

    @return: a (subnet_ip, mask_bits) tuple. If no matched subnet for the
             host_ip, return (None, None).
    """
    for candidate in restricted_subnets:
        subnet_ip, mask_bits = candidate
        if not utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits):
            continue
        return subnet_ip, mask_bits

    return None, None
2698
2699
def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None):
    """Get the devserver with the least load.

    Iterate through all devservers and get the one with least load.

    TODO(crbug.com/486278): Devserver with required build already staged should
    take higher priority. This will need check_health call to be able to verify
    existence of a given build/artifact. Also, in case all devservers are
    overloaded, the logic here should fall back to the old behavior that randomly
    selects a devserver based on the hash of the image name/url.

    @param devserver_type: Type of devserver to select from. Default is set to
                           ImageServer.
    @param hostname: Hostname of the dut that the devserver is used for. The
            picked devserver needs to respect the location of the host if
            `prefer_local_devserver` is set to True or `restricted_subnets` is
            set.

    @return: Name of the devserver with the least load, or None if no
             devserver can be selected (none is healthy and retry is not
             allowed, no load stats could be retrieved, or every devserver
             is over its load thresholds).

    """
    logging.debug('Get the least loaded %r', devserver_type)
    devservers, can_retry = devserver_type.get_available_devservers(
            hostname)
    # If no healthy devservers available and can_retry is False, return None.
    # Otherwise, relax the constrain on hostname, allow all devservers to be
    # available.
    if not devserver_type.get_healthy_devserver('', devservers):
        if not can_retry:
            return None
        else:
            devservers, _ = devserver_type.get_available_devservers()

    # get_devserver_load call needs to be made in a new process to allow force
    # timeout using signal.
    output = multiprocessing.Queue()
    processes = []
    for devserver in devservers:
        processes.append(multiprocessing.Process(
                target=devserver_type.get_devserver_load_wrapper,
                args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output)))

    for p in processes:
        p.start()
    # Drain the queue before joining the workers: a process that has put
    # data on a multiprocessing.Queue may not exit until that data has been
    # consumed, so joining first can deadlock.
    # NOTE(review): like the original code, this assumes every worker puts
    # exactly one result on the queue -- confirm in
    # get_devserver_load_wrapper.
    loads = [output.get() for _ in processes]
    for p in processes:
        p.join()
    # Filter out any load failed to be retrieved or does not support load check.
    loads = [load for load in loads if load and DevServer.CPU_LOAD in load and
             DevServer.is_free_disk_ok(load) and
             DevServer.is_apache_client_count_ok(load)]
    if not loads:
        logging.debug('Failed to retrieve load stats from any devserver. No '
                      'load balancing can be applied.')
        return None
    loads = [load for load in loads if _is_load_healthy(load)]
    if not loads:
        logging.error('No devserver has the capacity to be selected.')
        return None
    # Sort by the _compare_load comparator and pick the first entry.
    loads = sorted(loads, cmp=_compare_load)
    return loads[0]['devserver']
2761
2762
def resolve(build, hostname=None, ban_list=None):
    """Resolve a devserver that can be used for the given build and hostname.

    Launch Control builds are resolved through AndroidBuildServer, every
    other build through ImageServer. ban_list is only forwarded in the
    ImageServer case.

    @param build: Name of a build to stage on devserver, e.g.,
                  ChromeOS build: daisy-release/R50-1234.0.0
                  Launch Control build: git_mnc_release/shamu-eng
    @param hostname: Hostname of the host the devserver is resolved for.
            Default is None, which means devserver is not restricted by the
            network location of the host.
    @param ban_list: The list of devservers that shouldn't be chosen.

    @return: A DevServer instance that can be used to stage given build for the
             given host.
    """
    if not utils.is_launch_control_build(build):
        return ImageServer.resolve(build, hostname, ban_list=ban_list)
    return AndroidBuildServer.resolve(build, hostname)
2780