dev_server.py revision 1e916724238baf58a8df0587a9886f35be794262
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from distutils import version
import cStringIO
import functools
import HTMLParser
import httplib
import json
import logging
import multiprocessing
import os
import re
import socket
import time
import urllib2
import urlparse

from autotest_lib.client.bin import utils as bin_utils
from autotest_lib.client.common_lib import android_utils
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.server import utils as server_utils
# TODO(cmasone): redo this class using requests module; http://crosbug.com/30107

try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


CONFIG = global_config.global_config
# This file is generated at build time and specifies, per suite and per test,
# the DEPENDENCIES list specified in each control file.  It's a dict of dicts:
# {'bvt':   {'/path/to/autotest/control/site_tests/test1/control': ['dep1']}
#  'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']}
#  'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'],
#            '/path/to/autotest/control/site_tests/test3/control': ['dep3']}
# }
DEPENDENCIES_FILE = 'test_suites/dependency_info'
# Number of seconds for caller to poll devserver's is_staged call to check if
# artifacts are staged.
_ARTIFACT_STAGE_POLLING_INTERVAL = 5
# Artifacts that should be staged when client calls devserver RPC to stage an
# image.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('full_payload,test_suites,stateful,'
                                     'quick_provision')
# Artifacts that should be staged when client calls devserver RPC to stage an
# image with autotest artifact.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,'
                                                   'control_files,stateful,'
                                                   'autotest_packages,'
                                                   'quick_provision')
# Artifacts that should be staged when client calls devserver RPC to stage an
# Android build.
_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions')
SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value(
        'CROS', 'skip_devserver_health_check', type=bool)
# Number of seconds for the call to get devserver load to time out.
TIMEOUT_GET_DEVSERVER_LOAD = 2.0

# Android artifact path in devserver
ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value(
        'CROS', 'android_build_name_pattern', type=str).replace('\\', '')

# Return value from a devserver RPC indicating the call succeeded.
SUCCESS = 'Success'

# The timeout minutes for a given devserver ssh call.
DEVSERVER_SSH_TIMEOUT_MINS = 1

# Error message for invalid devserver response.
ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error'
ERR_MSG_FOR_DOWN_DEVSERVER = 'Service Unavailable'

# Error message for a devserver call that timed out.
ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout'

# The timeout minutes for waiting a devserver staging.
DEVSERVER_IS_STAGING_RETRY_MIN = 100

# The timeout minutes for waiting a DUT auto-update finished.
DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100

# The total times of devserver triggering CrOS auto-update.
AU_RETRY_LIMIT = 2

# Number of seconds for caller to poll devserver's get_au_status call to
# check if cros auto-update is finished.
CROS_AU_POLLING_INTERVAL = 10

# Number of seconds for intervals between retrying auto-update calls.
CROS_AU_RETRY_INTERVAL = 20

# The file name for auto-update logs.
CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log'

# Provision error patterns.
# People who see this should know that they shouldn't change these
# classification strings. These strings are used for monitoring provision
# failures. Any changes may mess up the stats.
# Each entry is a (regex, classification string) pair.
_EXCEPTION_PATTERNS = [
        # Raised when devserver portfile does not exist on host.
        (r".*Devserver portfile does not exist!.*$",
         '(1) Devserver portfile does not exist on host'),
        # Raised when devserver cannot copy packages to host.
        (r".*Could not copy .* to device.*$",
         '(2) Cannot copy packages to host'),
        # Raised when devserver fails to run specific commands on host.
        (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$",
         '(3) Fail to run specific command on host'),
        # Raised when new build fails to boot on the host.
        (r'.*RootfsUpdateError: Build .* failed to boot on.*$',
         '(4) Build failed to boot on host'),
        # Raised when the auto-update process is timed out.
        (r'.*The CrOS auto-update process is timed out, '
         'thus will be terminated.*$',
         '(5) Auto-update is timed out'),
        # Raised when the host is not pingable.
        (r".*DeviceNotPingableError.*$",
         '(6) Host is not pingable during auto-update'),
        # Raised when hosts have unexpected status after rootfs update.
        (r'.*Update failed with unexpected update status: '
         'UPDATE_STATUS_IDLE.*$',
         '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs '
         'update'),
        # Raised when devserver returns non-json response to shard/drone.
        (r'.*No JSON object could be decoded.*$',
         '(8) Devserver returned non-json object'),
        # Raised when devserver loses host's ssh connection
        (r'.*SSHConnectionError\: .* port 22\: Connection timed out.*$',
         "(9) Devserver lost host's ssh connection"),
        # Raised when error happens in writing files to host
        (r'.*Write failed\: Broken pipe.*$',
         "(10) Broken pipe while writing or connecting to host")]

PREFER_LOCAL_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'prefer_local_devserver', type=bool, default=False)

ENABLE_SSH_CONNECTION_FOR_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'enable_ssh_connection_for_devserver', type=bool,
        default=False)

# Directory to save auto-update logs
AUTO_UPDATE_LOG_DIR = 'autoupdate_logs'

DEFAULT_SUBNET_MASKBIT = 19


class DevServerException(Exception):
    """Raised when the dev server returns a non-200 HTTP response."""
    pass


class RetryableProvisionException(DevServerException):
    """Raised when provision fails due to a retryable reason."""
    pass


class DevServerOverloadException(Exception):
    """Raised when the dev server returns a 502 HTTP response."""
    pass


class DevServerFailToLocateException(Exception):
    """Raised when fail to locate any devserver."""
    pass


class MarkupStripper(HTMLParser.HTMLParser):
    """HTML parser that strips HTML tags and coded characters like &amp;

    Works by, basically, not doing anything for any tags, and only recording
    the content of text nodes in an internal data structure.
    """
    def __init__(self):
        # Run the base-class initializer explicitly (HTMLParser is an
        # old-style class in Python 2, so super() is not usable).  It resets
        # the parser, which is all the previous hand-written reset() call
        # did, and it keeps working if the base class ever grows more state.
        HTMLParser.HTMLParser.__init__(self)
        self.fed = []


    def handle_data(self, d):
        """Consume content of text nodes, store it away.

        @param d: The text of one text node.
        """
        self.fed.append(d)


    def get_data(self):
        """Concatenate and return all stored data.

        @return: A single string made of every text node seen so far.
        """
        return ''.join(self.fed)


def _strip_http_message(message):
    """Strip the HTML markup from an HTTP message.

    @param message: A string returned by an HTTP call.

    @return: A string with the markup stripped.
    """
    strip = MarkupStripper()
    try:
        strip.feed(message.decode('utf_32'))
    except UnicodeDecodeError:
        # Not valid UTF-32: feed the message to the parser undecoded.
        strip.feed(message)
    return strip.get_data()


def _get_image_storage_server():
    """Get the image storage server from the global config.

    @return: The value of CROS/image_storage_server.
    """
    return CONFIG.get_config_value('CROS', 'image_storage_server', type=str)


def _get_canary_channel_server():
    """
    Get the url of the canary-channel server,
    eg: gsutil://chromeos-releases/canary-channel/<board>/<release>

    @return: The url to the canary channel server.
    """
    return CONFIG.get_config_value('CROS', 'canary_channel_server', type=str)


def _get_storage_server_for_artifacts(artifacts=None):
    """Gets the appropriate storage server for the given artifacts.

    @param artifacts: A list of artifacts we need to stage.
    @return: The address of the storage server that has these artifacts.
             The default image storage server if no artifacts are specified.
    """
    # CONFIG is the module-level alias of global_config.global_config.
    factory_artifact = CONFIG.get_config_value(
            'CROS', 'factory_artifact', type=str, default='')
    if artifacts and factory_artifact and factory_artifact in artifacts:
        return _get_canary_channel_server()
    return _get_image_storage_server()


def _reverse_lookup_from_config(address):
    """Look up hostname for the given IP address.

    This uses the hostname-address map from the config file.

    If multiple hostnames map to the same IP address, the first one
    defined in the configuration file takes precedence.

    @param address: IP address string
    @returns: hostname string, or original input if not found
    """
    for hostname, addr in _get_hostname_addr_map().iteritems():
        if addr == address:
            return hostname
    return address


def _get_hostname_addr_map():
    """Get hostname address mapping from config.

    @return: dict mapping server hostnames to addresses
    """
    return CONFIG.get_section_as_dict('HOSTNAME_ADDR_MAP')


def _get_dev_server_list():
    """Get the configured devservers.

    @return: The list in CROS/dev_server, or an empty list if it is unset.
    """
    return CONFIG.get_config_value('CROS', 'dev_server', type=list, default=[])


def _get_crash_server_list():
    """Get the configured crash servers.

    @return: The list in CROS/crash_server, or an empty list if it is unset.
    """
    return CONFIG.get_config_value('CROS', 'crash_server', type=list,
                                   default=[])


def remote_devserver_call(timeout_min=DEVSERVER_IS_STAGING_RETRY_MIN,
                          exception_to_raise=DevServerException):
    """A decorator to use with remote devserver calls.

    This decorator converts urllib2.HTTPErrors into DevServerExceptions
    with any embedded error info converted into plain text. The method
    retries on urllib2.URLError, error.CmdError or DevServerOverloadException
    to avoid devserver flakiness.

    When the call finally fails with an exception whose text contains
    ERR_MSG_FOR_TIMED_OUT_CALL, a call_timeout metric is incremented before
    the exception is re-raised.

    @param timeout_min: Passed to retry.retry as its timeout_min.
    @param exception_to_raise: Passed to retry.retry as its
                               exception_to_raise.

    @return: A decorator taking the method to wrap.
    """
    #pylint: disable=C0111

    def inner_decorator(method):
        label = getattr(method, '__name__', None)

        def metrics_wrapper(*args, **kwargs):
            @retry.retry((urllib2.URLError, error.CmdError,
                          DevServerOverloadException),
                         timeout_min=timeout_min,
                         exception_to_raise=exception_to_raise,
                         label=label)
            def wrapper():
                """This wrapper actually catches the HTTPError."""
                try:
                    return method(*args, **kwargs)
                except urllib2.HTTPError as e:
                    error_markup = e.read()
                    raise DevServerException(_strip_http_message(error_markup))

            try:
                return wrapper()
            except Exception as e:
                if ERR_MSG_FOR_TIMED_OUT_CALL in str(e):
                    # Work out which devserver was being called: either the
                    # bound DevServer instance or an explicit 'devserver'
                    # keyword argument.
                    dev_server = None
                    if args and isinstance(args[0], DevServer):
                        dev_server = args[0].hostname
                    elif 'devserver' in kwargs:
                        dev_server = get_hostname(kwargs['devserver'])

                    logging.debug('RPC call %s has timed out on devserver %s.',
                                  label, dev_server)
                    c = metrics.Counter(
                            'chromeos/autotest/devserver/call_timeout')
                    # NOTE(review): the RPC name is reported under the field
                    # 'healthy'.  That looks like a copy/paste slip, but the
                    # field name is kept because monitoring may depend on it.
                    c.increment(fields={'dev_server': dev_server,
                                        'healthy': label})

                raise

        if label is not None:
            # Keep the wrapped method's name and docstring on the returned
            # function.  Skipped for callables without __name__, for which
            # Python 2's functools.wraps would raise AttributeError.
            metrics_wrapper = functools.wraps(method)(metrics_wrapper)
        return metrics_wrapper

    return inner_decorator

316def get_hostname(url): 317 """Get the hostname portion of a URL 318 319 schema://hostname:port/path 320 321 @param url: a Url string 322 @return: a hostname string 323 """ 324 return urlparse.urlparse(url).hostname 325 326 327class DevServer(object): 328 """Base class for all DevServer-like server stubs. 329 330 This is the base class for interacting with all Dev Server-like servers. 331 A caller should instantiate a sub-class of DevServer with: 332 333 host = SubClassServer.resolve(build) 334 server = SubClassServer(host) 335 """ 336 _MIN_FREE_DISK_SPACE_GB = 20 337 _MAX_APACHE_CLIENT_COUNT = 75 338 # Threshold for the CPU load percentage for a devserver to be selected. 339 MAX_CPU_LOAD = 80.0 340 # Threshold for the network IO, set to 80MB/s 341 MAX_NETWORK_IO = 1024 * 1024 * 80 342 DISK_IO = 'disk_total_bytes_per_second' 343 NETWORK_IO = 'network_total_bytes_per_second' 344 CPU_LOAD = 'cpu_percent' 345 FREE_DISK = 'free_disk' 346 AU_PROCESS = 'au_process_count' 347 STAGING_THREAD_COUNT = 'staging_thread_count' 348 APACHE_CLIENT_COUNT = 'apache_client_count' 349 350 351 def __init__(self, devserver): 352 self._devserver = devserver 353 354 355 def url(self): 356 """Returns the url for this devserver.""" 357 return self._devserver 358 359 360 @property 361 def hostname(self): 362 """Return devserver hostname parsed from the devserver URL. 363 364 Note that this is likely parsed from the devserver URL from 365 shadow_config.ini, meaning that the "hostname" part of the 366 devserver URL is actually an IP address. 367 368 @return hostname string 369 """ 370 return get_hostname(self.url()) 371 372 373 @property 374 def resolved_hostname(self): 375 """Return devserver hostname, resolved from its IP address. 376 377 Unlike the hostname property, this property attempts to look up 378 the proper hostname from the devserver IP address. If lookup 379 fails, then fall back to whatever the hostname property would 380 have returned. 
381 382 @return hostname string 383 """ 384 return _reverse_lookup_from_config(self.hostname) 385 386 387 @staticmethod 388 def get_server_url(url): 389 """Get the devserver url from a repo url, which includes build info. 390 391 @param url: A job repo url. 392 393 @return A devserver url, e.g., http://127.0.0.10:8080 394 """ 395 res = urlparse.urlparse(url) 396 if res.netloc: 397 return res.scheme + '://' + res.netloc 398 399 400 @classmethod 401 def get_devserver_load_wrapper(cls, devserver, timeout_sec, output): 402 """A wrapper function to call get_devserver_load in parallel. 403 404 @param devserver: url of the devserver. 405 @param timeout_sec: Number of seconds before time out the devserver 406 call. 407 @param output: An output queue to save results to. 408 """ 409 load = cls.get_devserver_load(devserver, timeout_min=timeout_sec/60.0) 410 if load: 411 load['devserver'] = devserver 412 output.put(load) 413 414 415 @classmethod 416 def get_devserver_load(cls, devserver, 417 timeout_min=DEVSERVER_SSH_TIMEOUT_MINS): 418 """Returns True if the |devserver| is healthy to stage build. 419 420 @param devserver: url of the devserver. 421 @param timeout_min: How long to wait in minutes before deciding the 422 the devserver is not up (float). 423 424 @return: A dictionary of the devserver's load. 425 426 """ 427 call = cls._build_call(devserver, 'check_health') 428 @remote_devserver_call(timeout_min=timeout_min) 429 def get_load(devserver=devserver): 430 """Inner method that makes the call.""" 431 return cls.run_call(call, timeout=timeout_min*60) 432 433 try: 434 return json.load(cStringIO.StringIO(get_load(devserver=devserver))) 435 except Exception as e: 436 logging.error('Devserver call failed: "%s", timeout: %s seconds,' 437 ' Error: %s', call, timeout_min * 60, e) 438 439 440 @classmethod 441 def is_free_disk_ok(cls, load): 442 """Check if a devserver has enough free disk. 443 444 @param load: A dict of the load of the devserver. 
445 446 @return: True if the devserver has enough free disk or disk check is 447 skipped in global config. 448 449 """ 450 if SKIP_DEVSERVER_HEALTH_CHECK: 451 logging.debug('devserver health check is skipped.') 452 elif load[cls.FREE_DISK] < cls._MIN_FREE_DISK_SPACE_GB: 453 return False 454 455 return True 456 457 458 @classmethod 459 def is_apache_client_count_ok(cls, load): 460 """Check if a devserver has enough Apache connections available. 461 462 Apache server by default has maximum of 150 concurrent connections. If 463 a devserver has too many live connections, it likely indicates the 464 server is busy handling many long running download requests, e.g., 465 downloading stateful partitions. It is better not to add more requests 466 to it. 467 468 @param load: A dict of the load of the devserver. 469 470 @return: True if the devserver has enough Apache connections available, 471 or disk check is skipped in global config. 472 473 """ 474 if SKIP_DEVSERVER_HEALTH_CHECK: 475 logging.debug('devserver health check is skipped.') 476 elif cls.APACHE_CLIENT_COUNT not in load: 477 logging.debug('Apache client count is not collected from devserver.') 478 elif (load[cls.APACHE_CLIENT_COUNT] > 479 cls._MAX_APACHE_CLIENT_COUNT): 480 return False 481 482 return True 483 484 485 @classmethod 486 def devserver_healthy(cls, devserver, 487 timeout_min=DEVSERVER_SSH_TIMEOUT_MINS): 488 """Returns True if the |devserver| is healthy to stage build. 489 490 @param devserver: url of the devserver. 491 @param timeout_min: How long to wait in minutes before deciding the 492 the devserver is not up (float). 493 494 @return: True if devserver is healthy. Return False otherwise. 495 496 """ 497 c = metrics.Counter('chromeos/autotest/devserver/devserver_healthy') 498 reason = '' 499 healthy = False 500 load = cls.get_devserver_load(devserver, timeout_min=timeout_min) 501 try: 502 if not load: 503 # Failed to get the load of devserver. 504 reason = '(1) Failed to get load.' 
505 return False 506 507 apache_ok = cls.is_apache_client_count_ok(load) 508 if not apache_ok: 509 reason = '(2) Apache client count too high.' 510 logging.error('Devserver check_health failed. Live Apache client ' 511 'count is too high: %d.', 512 load[cls.APACHE_CLIENT_COUNT]) 513 return False 514 515 disk_ok = cls.is_free_disk_ok(load) 516 if not disk_ok: 517 reason = '(3) Disk space too low.' 518 logging.error('Devserver check_health failed. Free disk space is ' 519 'low. Only %dGB is available.', 520 load[cls.FREE_DISK]) 521 healthy = bool(disk_ok) 522 return disk_ok 523 finally: 524 c.increment(fields={'dev_server': cls(devserver).resolved_hostname, 525 'healthy': healthy, 526 'reason': reason}) 527 # Monitor how many AU processes the devserver is currently running. 528 if load is not None and load.get(DevServer.AU_PROCESS): 529 c_au = metrics.Gauge( 530 'chromeos/autotest/devserver/devserver_au_count') 531 c_au.set( 532 load.get(DevServer.AU_PROCESS), 533 fields={'dev_server': cls(devserver).resolved_hostname}) 534 535 536 @staticmethod 537 def _build_call(host, method, **kwargs): 538 """Build a URL to |host| that calls |method|, passing |kwargs|. 539 540 Builds a URL that calls |method| on the dev server defined by |host|, 541 passing a set of key/value pairs built from the dict |kwargs|. 542 543 @param host: a string that is the host basename e.g. http://server:90. 544 @param method: the dev server method to call. 545 @param kwargs: a dict mapping arg names to arg values. 546 @return the URL string. 547 """ 548 argstr = '&'.join(map(lambda x: "%s=%s" % x, kwargs.iteritems())) 549 return "%(host)s/%(method)s?%(argstr)s" % dict( 550 host=host, method=method, argstr=argstr) 551 552 553 def build_call(self, method, **kwargs): 554 """Builds a devserver RPC string that is used by 'run_call()'. 555 556 @param method: remote devserver method to call. 
557 """ 558 return self._build_call(self._devserver, method, **kwargs) 559 560 561 @classmethod 562 def build_all_calls(cls, method, **kwargs): 563 """Builds a list of URLs that makes RPC calls on all devservers. 564 565 Build a URL that calls |method| on the dev server, passing a set 566 of key/value pairs built from the dict |kwargs|. 567 568 @param method: the dev server method to call. 569 @param kwargs: a dict mapping arg names to arg values 570 571 @return the URL string 572 """ 573 calls = [] 574 # Note we use cls.servers as servers is class specific. 575 for server in cls.servers(): 576 if cls.devserver_healthy(server): 577 calls.append(cls._build_call(server, method, **kwargs)) 578 579 return calls 580 581 582 @classmethod 583 def run_call(cls, call, readline=False, timeout=None): 584 """Invoke a given devserver call using urllib.open. 585 586 Open the URL with HTTP, and return the text of the response. Exceptions 587 may be raised as for urllib2.urlopen(). 588 589 @param call: a url string that calls a method to a devserver. 590 @param readline: whether read http response line by line. 591 @param timeout: The timeout seconds for this urlopen call. 592 593 @return the results of this call. 594 """ 595 if timeout is not None: 596 return utils.urlopen_socket_timeout( 597 call, timeout=timeout).read() 598 elif readline: 599 response = urllib2.urlopen(call) 600 return [line.rstrip() for line in response] 601 else: 602 return urllib2.urlopen(call).read() 603 604 605 @staticmethod 606 def servers(): 607 """Returns a list of servers that can serve as this type of server.""" 608 raise NotImplementedError() 609 610 611 @classmethod 612 def get_devservers_in_same_subnet(cls, ip, mask_bits=DEFAULT_SUBNET_MASKBIT, 613 unrestricted_only=False): 614 """Get the devservers in the same subnet of the given ip. 615 616 @param ip: The IP address of a dut to look for devserver. 617 @param mask_bits: Number of mask bits. Default is 19. 
618 @param unrestricted_only: Set to True to select from devserver in 619 unrestricted subnet only. Default is False. 620 621 @return: A list of devservers in the same subnet of the given ip. 622 623 """ 624 # server from cls.servers() is a URL, e.g., http://10.1.1.10:8082, so 625 # we need a dict to return the full devserver path once the IPs are 626 # filtered in get_servers_in_same_subnet. 627 server_names = {} 628 all_devservers = [] 629 devservers = (cls.get_unrestricted_devservers() if unrestricted_only 630 else cls.servers()) 631 for server in devservers: 632 server_name = get_hostname(server) 633 server_names[server_name] = server 634 all_devservers.append(server_name) 635 if not all_devservers: 636 devserver_type = 'unrestricted only' if unrestricted_only else 'all' 637 raise DevServerFailToLocateException( 638 'Fail to locate a devserver for dut %s in %s devservers' 639 % (ip, devserver_type)) 640 641 devservers = utils.get_servers_in_same_subnet(ip, mask_bits, 642 all_devservers) 643 return [server_names[s] for s in devservers] 644 645 646 @classmethod 647 def get_unrestricted_devservers( 648 cls, restricted_subnets=utils.RESTRICTED_SUBNETS): 649 """Get the devservers not in any restricted subnet specified in 650 restricted_subnets. 651 652 @param restricted_subnets: A list of restriected subnets. 653 654 @return: A list of devservers not in any restricted subnet. 655 656 """ 657 if not restricted_subnets: 658 return cls.servers() 659 660 devservers = [] 661 for server in cls.servers(): 662 server_name = get_hostname(server) 663 if not utils.get_restricted_subnet(server_name, restricted_subnets): 664 devservers.append(server) 665 return devservers 666 667 668 @classmethod 669 def get_healthy_devserver(cls, build, devservers, ban_list=None): 670 """"Get a healthy devserver instance from the list of devservers. 671 672 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514). 
673 @param devservers: The devserver list to be chosen out a healthy one. 674 @param ban_list: The blacklist of devservers we don't want to choose. 675 Default is None. 676 677 @return: A DevServer object of a healthy devserver. Return None if no 678 healthy devserver is found. 679 680 """ 681 logging.debug('Pick one healthy devserver from %r', devservers) 682 while devservers: 683 hash_index = hash(build) % len(devservers) 684 devserver = devservers.pop(hash_index) 685 logging.debug('Check health for %s', devserver) 686 if ban_list and devserver in ban_list: 687 continue 688 689 if cls.devserver_healthy(devserver): 690 logging.debug('Pick %s', devserver) 691 return cls(devserver) 692 693 694 @classmethod 695 def get_available_devservers(cls, hostname=None, 696 prefer_local_devserver=PREFER_LOCAL_DEVSERVER, 697 restricted_subnets=utils.RESTRICTED_SUBNETS): 698 """Get devservers in the same subnet of the given hostname. 699 700 @param hostname: Hostname of a DUT to choose devserver for. 701 702 @return: A tuple of (devservers, can_retry), devservers is a list of 703 devservers that's available for the given hostname. can_retry 704 is a flag that indicate if caller can retry the selection of 705 devserver if no devserver in the returned devservers can be 706 used. For example, if hostname is in a restricted subnet, 707 can_retry will be False. 708 """ 709 logging.info('Getting devservers for host: %s', hostname) 710 host_ip = None 711 if hostname: 712 host_ip = bin_utils.get_ip_address(hostname) 713 if not host_ip: 714 logging.error('Failed to get IP address of %s. Will pick a ' 715 'devserver without subnet constraint.', hostname) 716 717 if not host_ip: 718 return cls.get_unrestricted_devservers(restricted_subnets), False 719 720 # Go through all restricted subnet settings and check if the DUT is 721 # inside a restricted subnet. If so, only return the devservers in the 722 # restricted subnet and doesn't allow retry. 
723 if host_ip and restricted_subnets: 724 subnet_ip, mask_bits = _get_subnet_for_host_ip( 725 host_ip, restricted_subnets=restricted_subnets) 726 if subnet_ip: 727 logging.debug('The host %s (%s) is in a restricted subnet. ' 728 'Try to locate a devserver inside subnet ' 729 '%s:%d.', hostname, host_ip, subnet_ip, 730 mask_bits) 731 devservers = cls.get_devservers_in_same_subnet( 732 subnet_ip, mask_bits) 733 return devservers, False 734 735 # If prefer_local_devserver is set to True and the host is not in 736 # restricted subnet, pick a devserver in the same subnet if possible. 737 # Set can_retry to True so it can pick a different devserver if all 738 # devservers in the same subnet are down. 739 if prefer_local_devserver: 740 return (cls.get_devservers_in_same_subnet( 741 host_ip, DEFAULT_SUBNET_MASKBIT, True), True) 742 743 return cls.get_unrestricted_devservers(restricted_subnets), False 744 745 746 @classmethod 747 def resolve(cls, build, hostname=None, ban_list=None): 748 """"Resolves a build to a devserver instance. 749 750 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514). 751 @param hostname: The hostname of dut that requests a devserver. It's 752 used to make sure a devserver in the same subnet is 753 preferred. 754 @param ban_list: The blacklist of devservers shouldn't be chosen. 755 756 @raise DevServerException: If no devserver is available. 757 """ 758 tried_devservers = set() 759 devservers, can_retry = cls.get_available_devservers(hostname) 760 if devservers: 761 tried_devservers |= set(devservers) 762 763 devserver = cls.get_healthy_devserver(build, devservers, 764 ban_list=ban_list) 765 766 if not devserver and can_retry: 767 # Find available devservers without dut location constrain. 
768 devservers, _ = cls.get_available_devservers() 769 devserver = cls.get_healthy_devserver(build, devservers, 770 ban_list=ban_list) 771 if devservers: 772 tried_devservers |= set(devservers) 773 if devserver: 774 return devserver 775 else: 776 subnet = 'unrestricted subnet' 777 if hostname is not None: 778 host_ip = bin_utils.get_ip_address(hostname) 779 if host_ip: 780 subnet_ip, mask_bits = _get_subnet_for_host_ip(host_ip) 781 subnet = '%s/%s' % (str(subnet_ip), str(mask_bits)) 782 783 error_msg = ('All devservers in subnet: %s are currently down: ' 784 '%s. (dut hostname: %s)' % 785 (subnet, tried_devservers, hostname)) 786 logging.error(error_msg) 787 c = metrics.Counter( 788 'chromeos/autotest/devserver/subnet_without_devservers') 789 c.increment(fields={'subnet': subnet, 'hostname': str(hostname)}) 790 raise DevServerException(error_msg) 791 792 793 @classmethod 794 def random(cls): 795 """Return a random devserver that's available. 796 797 Devserver election in `resolve` method is based on a hash of the 798 build that a caller wants to stage. The purpose is that different 799 callers requesting for the same build can get the same devserver, 800 while the lab is able to distribute different builds across all 801 devservers. That helps to reduce the duplication of builds across 802 all devservers. 803 This function returns a random devserver, by passing a random 804 pseudo build name to `resolve `method. 805 """ 806 return cls.resolve(build=str(time.time())) 807 808 809class CrashServer(DevServer): 810 """Class of DevServer that symbolicates crash dumps.""" 811 812 @staticmethod 813 def servers(): 814 return _get_crash_server_list() 815 816 817 @remote_devserver_call() 818 def symbolicate_dump(self, minidump_path, build): 819 """Ask the devserver to symbolicate the dump at minidump_path. 820 821 Stage the debug symbols for |build| and, if that works, ask the 822 devserver to symbolicate the dump at |minidump_path|. 
823 824 @param minidump_path: the on-disk path of the minidump. 825 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514) 826 whose debug symbols are needed for symbolication. 827 @return The contents of the stack trace 828 @raise DevServerException upon any return code that's not HTTP OK. 829 """ 830 try: 831 import requests 832 except ImportError: 833 logging.warning("Can't 'import requests' to connect to dev server.") 834 return '' 835 f = {'dev_server': self.resolved_hostname} 836 c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump') 837 c.increment(fields=f) 838 # Symbolicate minidump. 839 m = 'chromeos/autotest/crashserver/symbolicate_dump_duration' 840 with metrics.SecondsTimer(m, fields=f): 841 call = self.build_call('symbolicate_dump', 842 archive_url=_get_image_storage_server() + build) 843 request = requests.post( 844 call, files={'minidump': open(minidump_path, 'rb')}) 845 if request.status_code == requests.codes.OK: 846 return request.text 847 848 error_fd = cStringIO.StringIO(request.text) 849 raise urllib2.HTTPError( 850 call, request.status_code, request.text, request.headers, 851 error_fd) 852 853 854 @classmethod 855 def get_available_devservers(cls, hostname): 856 """Get all available crash servers. 857 858 Crash server election doesn't need to count the location of hostname. 859 860 @param hostname: Hostname of a DUT to choose devserver for. 861 862 @return: A tuple of (all crash servers, False). can_retry is set to 863 False, as all crash servers are returned. There is no point to 864 retry. 865 """ 866 return cls.servers(), False 867 868 869class ImageServerBase(DevServer): 870 """Base class for devservers used to stage builds. 871 872 CrOS and Android builds are staged in different ways as they have different 873 sets of artifacts. This base class abstracts the shared functions between 874 the two types of ImageServer. 
875 """ 876 877 @classmethod 878 def servers(cls): 879 """Returns a list of servers that can serve as a desired type of 880 devserver. 881 """ 882 return _get_dev_server_list() 883 884 885 def _get_image_url(self, image): 886 """Returns the url of the directory for this image on the devserver. 887 888 @param image: the image that was fetched. 889 """ 890 image = self.translate(image) 891 url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern', 892 type=str) 893 return (url_pattern % (self.url(), image)).replace('update', 'static') 894 895 896 @staticmethod 897 def create_metadata(server_name, image, artifacts=None, files=None): 898 """Create a metadata dictionary given the staged items. 899 900 The metadata can be send to metadata db along with stats. 901 902 @param server_name: name of the devserver, e.g 172.22.33.44. 903 @param image: The name of the image. 904 @param artifacts: A list of artifacts. 905 @param files: A list of files. 906 907 @return A metadata dictionary. 908 909 """ 910 metadata = {'devserver': server_name, 911 'image': image, 912 '_type': 'devserver'} 913 if artifacts: 914 metadata['artifacts'] = ' '.join(artifacts) 915 if files: 916 metadata['files'] = ' '.join(files) 917 return metadata 918 919 920 @classmethod 921 def run_ssh_call(cls, call, readline=False, timeout=None): 922 """Construct an ssh-based rpc call, and execute it. 923 924 @param call: a url string that calls a method to a devserver. 925 @param readline: whether read http response line by line. 926 @param timeout: The timeout seconds for ssh call. 927 928 @return the results of this call. 
@classmethod
def run_call(cls, call, readline=False, timeout=None):
    """Invoke a given devserver call using urllib.open or ssh.

    Open the URL with HTTP or SSH-based HTTP, and return the text of the
    response. Exceptions may be raised as for urllib2.urlopen() or
    utils.run().

    The call is issued through bin_utils.poll_for_condition (60 second
    timeout, 5 second sleep interval) so that a response containing
    ERR_MSG_FOR_DOWN_DEVSERVER is retried instead of being returned
    straight away.

    @param call: a url string that calls a method to a devserver.
    @param readline: whether read http response line by line.
    @param timeout: The timeout seconds for urlopen call or ssh call.

    @return the results of this call. Any falsy response (an empty
            string, or an empty list when readline is True) is returned
            as ''. ERR_MSG_FOR_DOWN_DEVSERVER is returned if polling ends
            with bin_utils.TimeoutError.
    """
    server_name = get_hostname(call)
    is_in_restricted_subnet = utils.get_restricted_subnet(
            server_name, utils.RESTRICTED_SUBNETS)
    # Unique truthy stand-in for an empty response; see kickoff_call.
    _EMPTY_SENTINEL_VALUE = object()
    def kickoff_call():
        """Issue the devserver call once; used as the polling condition.

        Takes no arguments: call, readline, timeout and
        is_in_restricted_subnet are read from the enclosing run_call
        scope.

        @return False if the response contains
                ERR_MSG_FOR_DOWN_DEVSERVER, _EMPTY_SENTINEL_VALUE if the
                response is falsy, otherwise the response itself.
        """
        # Plain HTTP unless SSH is enabled AND the devserver sits in a
        # restricted subnet.
        if (not ENABLE_SSH_CONNECTION_FOR_DEVSERVER or
            not is_in_restricted_subnet):
            response = super(ImageServerBase, cls).run_call(
                    call, readline=readline, timeout=timeout)
        else:
            response = cls.run_ssh_call(
                    call, readline=readline, timeout=timeout)
        # Retry if devserver service is temporarily down, e.g. in a
        # devserver push.
        if ERR_MSG_FOR_DOWN_DEVSERVER in response:
            return False

        # Don't return response directly since it may be empty string,
        # which causes poll_for_condition to retry.
        return _EMPTY_SENTINEL_VALUE if not response else response

    try:
        response = bin_utils.poll_for_condition(
                kickoff_call,
                exception=bin_utils.TimeoutError(),
                timeout=60,
                sleep_interval=5)
        # Map the sentinel back to the empty string callers expect.
        return '' if response is _EMPTY_SENTINEL_VALUE else response
    except bin_utils.TimeoutError:
        # The devserver stayed down for the whole polling window.
        return ERR_MSG_FOR_DOWN_DEVSERVER
def _call_and_wait(self, call_name, error_message,
                   expected_response=SUCCESS, **kwargs):
    """Helper method to make a urlopen call, and wait for artifacts staged.

    The RPC is issued in async mode, so the devserver answers before
    staging completes; this method then blocks in
    wait_for_artifacts_staged until the artifacts are staged.

    @param call_name: name of devserver rpc call.
    @param error_message: Error message to be thrown if response does not
                          match expected_response.
    @param expected_response: Expected response from rpc, default to
                              |Success|. If it's set to None, do not compare
                              the actual response. Any response is
                              considered to be good.
    @param kwargs: keyword arguments to make is_staged devserver call.

    @return: The response from rpc.
    @raise DevServerException if the response is not expected_response, or
            if the devserver returns a bad status line.
    @raise DevServerOverloadException if the response contains
            ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE (proxy error).

    """
    # `async` is a reserved word from Python 3.7 on, so it cannot be
    # spelled as a literal keyword argument; pass it through a dict, which
    # is valid on both Python 2 and 3. A copy is used so that the kwargs
    # forwarded to wait_for_artifacts_staged below stay free of it.
    call_kwargs = dict(kwargs)
    call_kwargs['async'] = True
    call = self.build_call(call_name, **call_kwargs)
    try:
        response = self.run_call(call)
        logging.debug('response for RPC: %r', response)
        if ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE in response:
            logging.debug('Proxy error happens in RPC call, '
                          'will retry in 30 seconds')
            # Back off before signalling overload.
            time.sleep(30)
            raise DevServerOverloadException()
    except httplib.BadStatusLine as e:
        logging.error(e)
        raise DevServerException('Received Bad Status line, Devserver %s '
                                 'might have gone down while handling '
                                 'the call: %s' % (self.url(), call))

    if expected_response and response != expected_response:
        raise DevServerException(error_message)

    # `os_type` is needed in build a devserver call, but not needed for
    # wait_for_artifacts_staged, since that method is implemented by
    # each ImageServerBase child class.
    kwargs.pop('os_type', None)
    self.wait_for_artifacts_staged(**kwargs)
    return response
1134 @param archive_url: Optional parameter that has the archive_url to stage 1135 this artifact from. Default is specified in autotest config + 1136 image. 1137 @param kwargs: keyword arguments that specify the build information, to 1138 make stage devserver call. 1139 1140 @raise DevServerException upon any return code that's not HTTP OK. 1141 """ 1142 if not archive_url: 1143 archive_url = _get_storage_server_for_artifacts(artifacts) + build 1144 1145 artifacts_arg = ','.join(artifacts) if artifacts else '' 1146 files_arg = ','.join(files) if files else '' 1147 error_message = ("staging %s for %s failed;" 1148 "HTTP OK not accompanied by 'Success'." % 1149 ('artifacts=%s files=%s ' % (artifacts_arg, files_arg), 1150 build)) 1151 1152 staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' % 1153 (build, artifacts, files, archive_url)) 1154 logging.info('Staging artifacts on devserver %s: %s', 1155 self.url(), staging_info) 1156 success = False 1157 try: 1158 arguments = {'archive_url': archive_url, 1159 'artifacts': artifacts_arg, 1160 'files': files_arg} 1161 if kwargs: 1162 arguments.update(kwargs) 1163 # TODO(akeshet): canonicalize artifacts_arg before using it as a 1164 # metric field (as it stands it is a not-very-well-controlled 1165 # string). 
def _trigger_download(self, build, artifacts, files, synchronous=True,
                      **kwargs_build_info):
    """Ask the devserver to download and stage the requested image.

    When no build information is supplied through kwargs_build_info, the
    image is fetched from the storage server named by
    _get_image_storage_server(); otherwise no archive_url is sent and the
    build information is passed along instead.

    If |synchronous| is True and the stage call answered |Success|,
    _finish_download is called as well before returning. Otherwise a
    caller can call finish_download later to guarantee the rest of the
    artifacts have finished staging.

    @param build: Name of the build; appended to the storage server URL
                  and used in log/error messages.
    @param artifacts: Artifacts to stage, forwarded to call_and_wait.
    @param files: Files to stage, forwarded to call_and_wait.
    @param synchronous: if True, waits until all components of the image are
            staged before returning.
    @param kwargs_build_info: Dictionary of build information.
            For CrOS, it is None as build is the CrOS image name.
            For Android, it is {'target': target,
                                'build_id': build_id,
                                'branch': branch}

    @raise DevServerException upon any return code that's not HTTP OK, or
            when the stage call times out.

    """
    archive_url = (None if kwargs_build_info
                   else _get_image_storage_server() + build)
    failure_text = ("trigger_download for %s failed;"
                    "HTTP OK not accompanied by 'Success'." % build)
    stage_args = dict(archive_url=archive_url,
                      artifacts=artifacts,
                      files=files,
                      error_message=failure_text)
    # Updating with an empty dict is a no-op, so no emptiness check needed.
    stage_args.update(kwargs_build_info)

    logging.info('trigger_download starts for %s', build)
    try:
        response = self.call_and_wait(call_name='stage', **stage_args)
    except (bin_utils.TimeoutError, error.TimeoutException):
        logging.error('trigger_download timed out for %s.', build)
        raise DevServerException(
                'trigger_download timed out for %s.' % build)
    else:
        logging.info('trigger_download finishes for %s', build)

    if response == SUCCESS and synchronous:
        self._finish_download(build, artifacts, files, **kwargs_build_info)
@remote_devserver_call()
def locate_file(self, file_name, artifacts, build, build_info):
    """Locate a file with the given file_name on devserver.

    This method calls devserver RPC `locate_file` to look up a file with
    the given file name inside specified build artifacts.

    @param file_name: Name of the file to look for a file.
    @param artifacts: A list of artifact names to search for the file.
    @param build: Name of the build. For Android, it's None as build_info
            should be used.
    @param build_info: Dictionary of build information.
            For CrOS, it is None as build is the CrOS image name.
            For Android, it is {'target': target,
                                'build_id': build_id,
                                'branch': branch}

    @return: A devserver url to the file, built as
            <devserver url>/static/<build path>/<path from devserver>.
    @raise DevServerException if neither build nor build_info is given,
            if the devserver returns a bad status line, or upon any return
            code that's not HTTP OK.
    """
    if not build and not build_info:
        raise DevServerException('You must specify build information to '
                                 'look for file %s in artifacts %s.' %
                                 (file_name, artifacts))
    kwargs = {'file_name': file_name,
              'artifacts': artifacts}
    if build_info:
        build_path = '%(branch)s/%(target)s/%(build_id)s' % build_info
        kwargs.update(build_info)
        # Devserver treats Android and Brillo build in the same way as they
        # are both retrieved from Launch Control and have similar build
        # artifacts. Therefore, os_type for devserver calls is `android` for
        # both Android and Brillo builds.
        kwargs['os_type'] = 'android'
    else:
        build_path = build
        kwargs['build'] = build
    # `async` is a reserved word from Python 3.7 on, so it cannot be
    # written as a literal keyword argument; set it as a dict entry, which
    # is valid on both Python 2 and 3.
    kwargs['async'] = False
    call = self.build_call('locate_file', **kwargs)
    try:
        # Only the RPC itself can produce a BadStatusLine.
        file_path = self.run_call(call)
    except httplib.BadStatusLine as e:
        logging.error(e)
        raise DevServerException('Received Bad Status line, Devserver %s '
                                 'might have gone down while handling '
                                 'the call: %s' % (self.url(), call))
    return os.path.join(self.url(), 'static', build_path, file_path)
@remote_devserver_call()
def list_suite_controls(self, build, suite_name=''):
    """Fetch the contents of every control file for |build| from devserver.

    @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                  whose control files' contents the caller wants returned.
                  It is passed through self.translate() first.
    @param suite_name: The name of the suite for which we require control
                       files.
    @return The JSON-decoded response of the `list_suite_controls` RPC,
            expected to be a dict of contents of all control files
            (e.g. {'path1': "#Copyright controls ***", ...,
                   'pathX': "#Copyright controls ***"})
    @raise DevServerException upon any return code that's not HTTP OK.
    """
    translated_build = self.translate(build)
    rpc_url = self.build_call('list_suite_controls',
                              build=translated_build,
                              suite_name=suite_name)
    raw_response = self.run_call(rpc_url)
    response_stream = cStringIO.StringIO(raw_response)
    return json.load(response_stream)
def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''):
    """Block until devserver.is_staged reports everything as staged.

    Thin wrapper that forwards its arguments as keywords to
    _poll_is_staged.

    @param archive_url: Google Storage URL for the build.
    @param artifacts: Comma separated list of artifacts to download.
    @param files: Comma separated list of files to download.
    @return: Whatever _poll_is_staged returns; True if all artifacts are
             staged in devserver.
    """
    return self._poll_is_staged(archive_url=archive_url,
                                artifacts=artifacts,
                                files=files)
@remote_devserver_call()
def stage_artifacts(self, image=None, artifacts=None, files='',
                    archive_url=None):
    """Ask the devserver to download and stage |artifacts| from |image|.

    This is the main call point for staging any specific artifacts for a
    given build. To see the list of artifacts one can stage see:

    ~src/platform/dev/artifact_info.py.

    This is maintained along with the actual devserver code.

    @param image: the image to fetch and stage; translated with
                  self.translate() before staging.
    @param artifacts: A list of artifacts.
    @param files: A list of files to stage.
    @param archive_url: Optional parameter that has the archive_url to stage
            this artifact from. Default is specified in autotest config +
            image.

    @raise DevServerException if neither artifacts nor files is given, or
            upon any return code that's not HTTP OK.
    """
    # Guard clause: there must be at least one thing to stage.
    if not (artifacts or files):
        raise DevServerException('Must specify something to stage.')
    self._stage_artifacts(self.translate(image), artifacts, files,
                          archive_url)
def trigger_download(self, image, synchronous=True):
    """Ask the devserver to download and stage |image|.

    Translates |image| and delegates to _trigger_download with the
    artifact list _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE and no extra files.

    If |synchronous| is True, waits for the entire download to finish
    staging before returning. Otherwise only the artifacts necessary
    to start installing images onto DUT's will be staged before returning.
    A caller can then call finish_download to guarantee the rest of the
    artifacts have finished staging.

    @param image: the image to fetch and stage.
    @param synchronous: if True, waits until all components of the image are
            staged before returning.

    @raise DevServerException upon any return code that's not HTTP OK.

    """
    self._trigger_download(self.translate(image),
                           _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE,
                           files='',
                           synchronous=synchronous)
def get_staged_file_url(self, filename, image):
    """Returns the url of a staged file for this image on the devserver.

    @param filename: Name of the staged file, appended after a '/'.
    @param image: the image that was fetched.

    @return The image directory url from _get_image_url() joined with
            |filename| by a single '/'.
    """
    image_dir_url = self._get_image_url(image)
    return '/'.join((image_dir_url, filename))
def translate(self, build_name):
    """Resolve a [builder]/LATEST style build name to a concrete build.

    A name that matches <prefix>-<suffix>/LATEST at its start (case
    insensitive) is replaced by whatever get_latest_build_in_gs() returns
    for <prefix>; any other name is returned untouched.

    @param build_name: build_name to check.

    @return The actual build name to use.
    """
    latest_match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I)
    if latest_match is None:
        return build_name
    # The first group is everything before the last '-' of the builder.
    board = latest_match.group(1)
    resolved_build = self.get_latest_build_in_gs(board)
    logging.debug('Translated relative build %s to %s', build_name,
                  resolved_build)
    return resolved_build
def kill_au_process_for_host(self, host_name, pid):
    """Kill the triggered auto_update process if error happens.

    Usually this function is used to clear all potential left au processes
    of the given host name.

    If pid is specified, the devserver will further check the given pid to
    make sure the process is killed. This is used for the case that the au
    process has started in background, but then provision fails due to
    some unknown issues very fast. In this case, when 'kill_au_proc' is
    called, there's no corresponding background track log created for this
    ongoing au process, which prevents this RPC call from killing this au
    process.

    @param host_name: The DUT's hostname.
    @param pid: The ongoing au process's pid.

    @return: True if successfully kill the auto-update process for host,
             False if the underlying call raised DevServerException.
    """
    try:
        self._kill_au_process_for_host(host_name=host_name, pid=pid)
    except DevServerException:
        killed = False
    else:
        killed = True
    return killed
@remote_devserver_call()
def _collect_au_log(self, log_dir, **kwargs):
    """Collect logs from devserver after cros-update process is finished.

    Collect the logs that recording the whole cros-update process, and
    write it to sysinfo path of a job.

    The example log file name that is stored is like:
        '1220-repair/sysinfo/CrOS_update_host_name_pid.log'

    Each entry of the response's 'host_logs' is additionally written to
    its own file named <key>_<host_name>_<pid> inside log_dir.

    @param log_dir: The directory to save the cros-update process log
                    retrieved from devserver. Created if it does not
                    exist.
    @param kwargs: Arguments for the collect_cros_au_log devserver call.
                   Must contain 'host_name' (the DUT's hostname) and 'pid'
                   (the auto-update process id on devserver), which are
                   also used to name the log files.

    @raise DevServerException if the response is not valid JSON or the
            logs cannot be written.
    """
    call = self.build_call('collect_cros_au_log', **kwargs)
    response = self.run_call(call)
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    write_file = self._get_au_log_filename(
            log_dir, kwargs['host_name'], kwargs['pid'])
    logging.debug('Saving auto-update logs into %s', write_file)

    au_logs = self._read_json_response_from_devserver(response)

    try:
        for k, v in au_logs['host_logs'].items():
            log_name = '%s_%s_%s' % (k, kwargs['host_name'], kwargs['pid'])
            log_path = os.path.join(log_dir, log_name)
            with open(log_path, 'w') as out_log:
                out_log.write(v)
    except IOError as e:
        raise DevServerException('Failed to write auto-update hostlogs: '
                                 '%s' % e)

    try:
        with open(write_file, 'w') as out_log:
            out_log.write(au_logs['cros_au_log'])
    except Exception as e:
        # Was a bare `except:`, which also trapped KeyboardInterrupt and
        # SystemExit. Ordinary errors (I/O failure, missing 'cros_au_log'
        # key, ...) are still converted so callers keep seeing
        # DevServerException; the cause is logged so it is not lost.
        logging.debug('Writing auto-update log %s failed: %r',
                      write_file, e)
        raise DevServerException('Failed to write auto-update logs into '
                                 '%s' % write_file)
1848 1849 @return: a tuple indicates whether the RPC call cros_au succeeds and 1850 the auto-update process id running on devserver. 1851 """ 1852 host_name = kwargs['host_name'] 1853 call = self.build_call('cros_au', async=True, **kwargs) 1854 try: 1855 response = self.run_call(call) 1856 logging.info( 1857 'Received response from devserver for cros_au call: %r', 1858 response) 1859 except httplib.BadStatusLine as e: 1860 logging.error(e) 1861 raise DevServerException('Received Bad Status line, Devserver %s ' 1862 'might have gone down while handling ' 1863 'the call: %s' % (self.url(), call)) 1864 1865 return response 1866 1867 1868 def _check_for_auto_update_finished(self, pid, wait=True, **kwargs): 1869 """Polling devserver.get_au_status to get current auto-update status. 1870 1871 The current auto-update status is used to identify whether the update 1872 process is finished. 1873 1874 @param pid: The background process id for auto-update in devserver. 1875 @param kwargs: keyword arguments to make get_au_status devserver call. 1876 @param wait: Should the check wait for completion. 1877 1878 @return: True if auto-update is finished for a given dut. 1879 """ 1880 logging.debug('Check the progress for auto-update process %r', pid) 1881 kwargs['pid'] = pid 1882 call = self.build_call('get_au_status', **kwargs) 1883 1884 def all_finished(): 1885 """Call devserver.get_au_status rpc to check if auto-update 1886 is finished. 1887 1888 @return: True if auto-update is finished for a given dut. False 1889 otherwise. 1890 @rasies DevServerException, the exception is a wrapper of all 1891 exceptions that were raised when devserver tried to 1892 download the artifacts. devserver raises an HTTPError or 1893 a CmdError when an exception was raised in the code. Such 1894 exception should be re-raised here to stop the caller from 1895 waiting. 
If the call to devserver failed for connection 1896 issue, a URLError exception is raised, and caller should 1897 retry the call to avoid such network flakiness. 1898 1899 """ 1900 try: 1901 au_status = self.run_call(call) 1902 response = json.loads(au_status) 1903 # This is a temp fix to fit both dict and tuple returning 1904 # values. The dict check will be removed after a corresponding 1905 # devserver CL is deployed. 1906 if isinstance(response, dict): 1907 if response.get('detailed_error_msg'): 1908 raise DevServerException( 1909 response.get('detailed_error_msg')) 1910 1911 if response.get('finished'): 1912 logging.debug('CrOS auto-update is finished') 1913 return True 1914 else: 1915 logging.debug('Current CrOS auto-update status: %s', 1916 response.get('status')) 1917 return False 1918 1919 if not response[0]: 1920 logging.debug('Current CrOS auto-update status: %s', 1921 response[1]) 1922 return False 1923 else: 1924 logging.debug('CrOS auto-update is finished') 1925 return True 1926 except urllib2.HTTPError as e: 1927 error_markup = e.read() 1928 raise DevServerException(_strip_http_message(error_markup)) 1929 except urllib2.URLError as e: 1930 # Could be connection issue, retry it. 1931 # For example: <urlopen error [Errno 111] Connection refused> 1932 logging.warning('URLError (%r): Retrying connection to ' 1933 'devserver to check auto-update status.', e) 1934 return False 1935 except error.CmdError: 1936 # Retry if SSH failed to connect to the devserver. 1937 logging.warning('CmdError: Retrying SSH connection to check ' 1938 'auto-update status.') 1939 return False 1940 except socket.error as e: 1941 # Could be some temporary devserver connection issues. 
1942 logging.warning('Socket Error (%r): Retrying connection to ' 1943 'devserver to check auto-update status.', e) 1944 return False 1945 except ValueError as e: 1946 raise DevServerException( 1947 '%s (Got AU status: %r)' % (str(e), au_status)) 1948 1949 if wait: 1950 bin_utils.poll_for_condition( 1951 all_finished, 1952 exception=bin_utils.TimeoutError(), 1953 timeout=DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN * 60, 1954 sleep_interval=CROS_AU_POLLING_INTERVAL) 1955 1956 return True 1957 else: 1958 return all_finished() 1959 1960 1961 def check_for_auto_update_finished(self, response, wait=True, **kwargs): 1962 """Processing response of 'cros_au' and polling for auto-update status. 1963 1964 Will wait for the whole auto-update process is finished. 1965 1966 @param response: The response from RPC 'cros_au' 1967 @param kwargs: keyword arguments to make get_au_status devserver call. 1968 1969 @return: a tuple includes two elements. 1970 finished: True if the operation has completed. 1971 raised_error: None if everything works well or the raised error. 1972 pid: the auto-update process id on devserver. 1973 """ 1974 1975 pid = 0 1976 raised_error = None 1977 finished = False 1978 try: 1979 response = json.loads(response) 1980 if response[0]: 1981 pid = response[1] 1982 # If provision is kicked off asynchronously, pid will be -1. 1983 # If provision is not successfully kicked off , pid continues 1984 # to be 0. 
1985 if pid > 0: 1986 logging.debug('start process %r for auto_update in ' 1987 'devserver', pid) 1988 finished = self._check_for_auto_update_finished( 1989 pid, wait=wait, **kwargs) 1990 except Exception as e: 1991 logging.debug('Failed to trigger auto-update process on devserver') 1992 finished = True 1993 raised_error = e 1994 finally: 1995 return finished, raised_error, pid 1996 1997 1998 def _parse_AU_error(self, response): 1999 """Parse auto_update error returned from devserver.""" 2000 return re.split('\n', response)[-1] 2001 2002 2003 def _classify_exceptions(self, target_error): 2004 """Parse the error that was raised from auto_update. 2005 2006 @param target_error: A single string representing one time provision 2007 error happened in auto_update(). 2008 2009 @return: If target_error is empty, return None. Otherwise, return a 2010 classified exception type (string) from _EXCEPTION_PATTERNS 2011 or 'Unknown exception'. Current patterns in _EXCEPTION_PATTERNS are 2012 very specific so that errors cannot match more than one pattern. 2013 """ 2014 if not target_error: 2015 return None 2016 2017 for err_pattern, classification in _EXCEPTION_PATTERNS: 2018 match = re.match(err_pattern, target_error) 2019 if match: 2020 return classification 2021 2022 return '(0) Unknown exception' 2023 2024 2025 def _check_error_message(self, error_patterns_to_check, error_msg): 2026 """Detect whether specific error pattern exist in error message. 2027 2028 @param error_patterns_to_check: the error patterns to check 2029 @param error_msg: the error message which may include any error 2030 pattern. 2031 2032 @return A boolean variable, True if error_msg contains any error 2033 pattern in error_patterns_to_check, False otherwise. 2034 """ 2035 for err in error_patterns_to_check: 2036 if err in error_msg: 2037 return True 2038 2039 return False 2040 2041 2042 def _is_retryable(self, error_msg): 2043 """Detect whether we will retry auto-update based on error_msg. 
2044 2045 @param error_msg: The given error message. 2046 2047 @return A boolean variable which indicates whether we will retry 2048 auto_update with another devserver based on the given error_msg. 2049 """ 2050 # For now we just hard-code the error message we think it's suspicious. 2051 # When we get more date about what's the json response when devserver 2052 # is overloaded, we can update this part. 2053 retryable_error_patterns = [ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE, 2054 'is not pingable'] 2055 return self._check_error_message(retryable_error_patterns, error_msg) 2056 2057 2058 def _should_use_original_payload(self, error_msg): 2059 devserver_error_patterns = ['DevserverCannotStartError'] 2060 return self._check_error_message(devserver_error_patterns, error_msg) 2061 2062 2063 def _parse_buildname_safely(self, build_name): 2064 """Parse a given buildname safely. 2065 2066 @param build_name: the build name to be parsed. 2067 2068 @return: a tuple (board, build_type, milestone) 2069 """ 2070 try: 2071 board, build_type, milestone, _ = server_utils.ParseBuildName( 2072 build_name) 2073 except server_utils.ParseBuildNameException: 2074 logging.warning('Unable to parse build name %s for metrics. ' 2075 'Continuing anyway.', build_name) 2076 board, build_type, milestone = ('', '', '') 2077 2078 return board, build_type, milestone 2079 2080 2081 def _emit_auto_update_metrics(self, error_list, duration_list, 2082 is_au_success, board, build_type, milestone, 2083 dut_host_name, is_aue2etest): 2084 """Send metrics for auto_update. 2085 2086 Please note: to avoid reaching or exceeding the monarch field 2087 cardinality limit, we avoid a metric that includes both dut hostname 2088 and other high cardinality fields. 2089 2090 @param error_list: a list of errors happened in provision. Usually it 2091 contains 1 ~ AU_RETRY_LIMIT errors since we only retry provision 2092 for several times. 
2093 @param duration_list: a list of provision duration time, counted by 2094 seconds. 2095 @param is_au_success: a field in metrics, representing whether this 2096 auto_update succeeds or not. 2097 @param board: a field in metrics representing which board this 2098 auto_update tries to update. 2099 @param build_type: a field in metrics representing which build type this 2100 auto_update tries to update. 2101 @param milestone: a field in metrics representing which milestone this 2102 auto_update tries to update. 2103 @param dut_host_name: a field in metrics representing which DUT this 2104 auto_update tries to update. 2105 @param is_aue2etest: a field in metrics representing if provision was 2106 done as part of the autoupdate_EndToEndTest. 2107 """ 2108 # Per-devserver cros_update metric. 2109 c1 = metrics.Counter( 2110 'chromeos/autotest/provision/cros_update_per_devserver') 2111 f1 = {'dev_server': self.resolved_hostname, 2112 'success': is_au_success, 2113 'board': board, 2114 'build_type': build_type, 2115 'milestone': milestone, 2116 'error': '', 2117 'is_aue2etest': is_aue2etest} 2118 # Per-DUT cros_update metric. 2119 c2 = metrics.Counter('chromeos/autotest/provision/cros_update_by_dut') 2120 f2 = {'success': is_au_success, 2121 'board': board, 2122 'dut_host_name': dut_host_name, 2123 'error': '', 2124 'is_aue2etest': is_aue2etest} 2125 # Per auto_update metric. 2126 c3 = metrics.Counter( 2127 'chromeos/autotest/provision/cros_update_failure_by_devserver') 2128 f3 = {'dev_server': self.resolved_hostname, 2129 'board': board, 2130 'build_type': build_type, 2131 'milestone': milestone, 2132 'error': '', 2133 'is_aue2etest': is_aue2etest} 2134 # Per auto_update duration metric. 
2135 c4 = metrics.Counter( 2136 'chromeos/autotest/provision/auto_update_duration_by_devserver') 2137 c5 = metrics.Counter( 2138 'chromeos/autotest/provision/provision_duration_by_devserver') 2139 f_duration = {'dev_server': self.resolved_hostname, 2140 'success': is_au_success, 2141 'board': board, 2142 'build_type': build_type, 2143 'milestone': milestone, 2144 'duration_seconds': 0, 2145 'is_aue2etest': is_aue2etest} 2146 2147 # Add a field |error| here. Current error's pattern is manually 2148 # specified in _EXCEPTION_PATTERNS. 2149 if not error_list: 2150 c1.increment(fields=f1) 2151 c2.increment(fields=f2) 2152 else: 2153 # In metrics, use the first error as the real provision errors. 2154 raised_error = str(self._classify_exceptions(error_list[0])) 2155 f1['error'] = raised_error 2156 c1.increment(fields=f1) 2157 f2['error'] = raised_error 2158 c2.increment(fields=f2) 2159 2160 # Record all errors in metrics cros_update_failure_by_devserver, 2161 # to make it truly reflect whether there're some particular errors 2162 # hit lab frequently. Previously, all errors raised in the second 2163 # try of auto_update will be eaten. 2164 for err in error_list: 2165 f3['error'] = str(self._classify_exceptions(err)) 2166 c3.increment(fields=f3) 2167 2168 total_provision_duration = 0 2169 for per_au_duration in duration_list: 2170 total_provision_duration += per_au_duration 2171 f_duration['duration_seconds'] = per_au_duration 2172 c4.increment(fields=f_duration) 2173 2174 f_duration['duration_seconds'] = total_provision_duration 2175 c5.increment(fields=f_duration) 2176 2177 def _parse_buildname_from_gs_uri(self, uri): 2178 """Get parameters needed for AU metrics when build_name is not known. 2179 2180 autoupdate_EndToEndTest is run with two Google Storage URIs from the 2181 gs://chromeos-releases bucket. URIs in this bucket do not have the 2182 build_name in the format samus-release/R60-0000.0.0. 
2183 2184 We can get the milestone and board by checking the instructions.json 2185 file contained in the bucket with the payloads. 2186 2187 @param uri: The partial uri we received from autoupdate_EndToEndTest. 2188 """ 2189 try: 2190 # Get the instructions file that contains info about the build. 2191 gs_file = 'gs://chromeos-releases/' + uri + '/*instructions.json' 2192 files = bin_utils.gs_ls(gs_file) 2193 for f in files: 2194 gs_folder, _, instruction_file = f.rpartition('/') 2195 self.stage_artifacts(image=uri, 2196 files=[instruction_file], 2197 archive_url=gs_folder) 2198 json_file = self.get_staged_file_url(instruction_file, uri) 2199 response = urllib2.urlopen(json_file) 2200 data = json.load(response) 2201 return data['board'], 'release', data['version']['milestone'] 2202 except (ValueError, error.CmdError, urllib2.URLError) as e: 2203 logging.debug('Problem getting values for metrics: %s', e) 2204 logging.warning('Unable to parse build name %s from AU test for ' 2205 'metrics. Continuing anyway.', uri) 2206 2207 return '', '', '' 2208 2209 2210 def auto_update(self, host_name, build_name, original_board=None, 2211 original_release_version=None, log_dir=None, 2212 force_update=False, full_update=False, 2213 payload_filename=None, force_original=False, 2214 clobber_stateful=True, quick_provision=False): 2215 """Auto-update a CrOS host. 2216 2217 @param host_name: The hostname of the DUT to auto-update. 2218 @param build_name: The build name to be auto-updated on the DUT. 2219 @param original_board: The original board of the DUT to auto-update. 2220 @param original_release_version: The release version of the DUT's 2221 current build. 2222 @param log_dir: The log directory to store auto-update logs from 2223 devserver. 2224 @param force_update: Force an update even if the version installed 2225 is the same. Default: False. 2226 @param full_update: If True, do not run stateful update, directly 2227 force a full reimage. 
If False, try stateful 2228 update first if the dut is already installed 2229 with the same version. 2230 @param payload_filename: Used to specify the exact file to 2231 use for autoupdating. If None, the payload 2232 will be determined by build_name. You 2233 must have already staged this file before 2234 passing it in here. 2235 @param force_original: Whether to force stateful update with the 2236 original payload. 2237 @param clobber_stateful: If True do a clean install of stateful. 2238 @param quick_provision: Attempt to use quick provision path first. 2239 2240 @return A set (is_success, pid) in which: 2241 1. is_success indicates whether this auto_update succeeds. 2242 2. pid is the process id of the successful autoupdate run. 2243 2244 @raise DevServerException if auto_update fails and is not retryable. 2245 @raise RetryableProvisionException if it fails and is retryable. 2246 """ 2247 kwargs = {'host_name': host_name, 2248 'build_name': build_name, 2249 'force_update': force_update, 2250 'full_update': full_update, 2251 'clobber_stateful': clobber_stateful, 2252 'quick_provision': quick_provision} 2253 2254 is_aue2etest = payload_filename is not None 2255 2256 if is_aue2etest: 2257 kwargs['payload_filename'] = payload_filename 2258 2259 error_msg = 'CrOS auto-update failed for host %s: %s' 2260 error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s' 2261 is_au_success = False 2262 au_log_dir = os.path.join(log_dir, 2263 AUTO_UPDATE_LOG_DIR) if log_dir else None 2264 error_list = [] 2265 retry_with_another_devserver = False 2266 duration_list = [] 2267 2268 if is_aue2etest: 2269 board, build_type, milestone = self._parse_buildname_from_gs_uri( 2270 build_name) 2271 else: 2272 board, build_type, milestone = self._parse_buildname_safely( 2273 build_name) 2274 2275 for au_attempt in range(AU_RETRY_LIMIT): 2276 logging.debug('Start CrOS auto-update for host %s at %d time(s).', 2277 host_name, au_attempt + 1) 2278 start_time = time.time() 2279 # No 
matter _trigger_auto_update succeeds or fails, the auto-update 2280 # track_status_file should be cleaned, and the auto-update execute 2281 # log should be collected to directory sysinfo. Also, the error 2282 # raised by _trigger_auto_update should be displayed. 2283 try: 2284 # Try update with stateful.tgz of old release version in the 2285 # last try of auto-update. 2286 if force_original and original_release_version: 2287 # Monitor this case in monarch 2288 original_build = '%s/%s' % (original_board, 2289 original_release_version) 2290 c = metrics.Counter( 2291 'chromeos/autotest/provision/' 2292 'cros_update_with_original_build') 2293 f = {'dev_server': self.resolved_hostname, 2294 'board': board, 2295 'build_type': build_type, 2296 'milestone': milestone, 2297 'original_build': original_build} 2298 c.increment(fields=f) 2299 2300 logging.debug('Try updating stateful partition of the ' 2301 'host with the same version of its current ' 2302 'rootfs partition: %s', original_build) 2303 response = self._trigger_auto_update( 2304 original_build=original_build, **kwargs) 2305 else: 2306 response = self._trigger_auto_update(**kwargs) 2307 except DevServerException as e: 2308 logging.debug(error_msg_attempt, au_attempt+1, str(e)) 2309 error_list.append(str(e)) 2310 else: 2311 _, raised_error, pid = self.check_for_auto_update_finished( 2312 response, **kwargs) 2313 2314 # Error happens in _collect_au_log won't be raised. 2315 if au_log_dir: 2316 is_collect_success = self.collect_au_log( 2317 kwargs['host_name'], pid, au_log_dir) 2318 else: 2319 is_collect_success = True 2320 2321 # Error happens in _clean_track_log won't be raised. 2322 if pid >= 0: 2323 is_clean_success = self.clean_track_log( 2324 kwargs['host_name'], pid) 2325 else: 2326 is_clean_success = True 2327 2328 # If any error is raised previously, log it and retry 2329 # auto-update. Otherwise, claim a successful CrOS auto-update. 
2330 if (not raised_error and is_clean_success and 2331 is_collect_success): 2332 logging.debug('CrOS auto-update succeed for host %s', 2333 host_name) 2334 is_au_success = True 2335 break 2336 else: 2337 if not self.kill_au_process_for_host(kwargs['host_name'], 2338 pid): 2339 logging.debug('Failed to kill auto_update process %d', 2340 pid) 2341 if raised_error: 2342 logging.debug(error_msg_attempt, au_attempt+1, 2343 str(raised_error)) 2344 if au_log_dir: 2345 logging.debug('Please see error details in log %s', 2346 self._get_au_log_filename( 2347 au_log_dir, 2348 kwargs['host_name'], 2349 pid)) 2350 error_list.append(self._parse_AU_error(str(raised_error))) 2351 if self._is_retryable(str(raised_error)): 2352 retry_with_another_devserver = True 2353 2354 if self._should_use_original_payload(str(raised_error)): 2355 force_original = True 2356 2357 finally: 2358 duration_list.append(int(time.time() - start_time)) 2359 if retry_with_another_devserver: 2360 break 2361 2362 if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1: 2363 time.sleep(CROS_AU_RETRY_INTERVAL) 2364 # Use the IP of DUT if the hostname failed. 2365 host_name_ip = socket.gethostbyname(host_name) 2366 kwargs['host_name'] = host_name_ip 2367 logging.debug( 2368 'AU failed, trying IP instead of hostname: %s', 2369 host_name_ip) 2370 2371 self._emit_auto_update_metrics(error_list, duration_list, is_au_success, 2372 board, build_type, milestone, host_name, 2373 is_aue2etest) 2374 2375 if is_au_success: 2376 return (is_au_success, pid) 2377 2378 # If errors happen in the CrOS AU process, report the concatenation 2379 # of the errors happening in first & second provision. 2380 # If error happens in RPCs of cleaning track log, collecting 2381 # auto-update logs, or killing auto-update processes, just report a 2382 # common error here. 
2383 if error_list: 2384 real_error = '' 2385 for i in range(len(error_list)): 2386 real_error += '%d) %s, ' % ( 2387 i, ' '.join(error_list[i].splitlines())) 2388 if retry_with_another_devserver: 2389 raise RetryableProvisionException( 2390 error_msg % (host_name, real_error)) 2391 else: 2392 raise DevServerException( 2393 error_msg % (host_name, real_error)) 2394 else: 2395 raise DevServerException(error_msg % ( 2396 host_name, ('RPC calls after the whole auto-update ' 2397 'process failed.'))) 2398 2399 2400class AndroidBuildServer(ImageServerBase): 2401 """Class for DevServer that handles RPCs related to Android builds. 2402 2403 The calls to devserver to stage artifacts, including stage and download, are 2404 made in async mode. That is, when caller makes an RPC |stage| to request 2405 devserver to stage certain artifacts, devserver handles the call and starts 2406 staging artifacts in a new thread, and return |Success| without waiting for 2407 staging being completed. When caller receives message |Success|, it polls 2408 devserver's is_staged call until all artifacts are staged. 2409 Such mechanism is designed to prevent cherrypy threads in devserver being 2410 running out, as staging artifacts might take long time, and cherrypy starts 2411 with a fixed number of threads that handle devserver rpc. 2412 """ 2413 2414 def wait_for_artifacts_staged(self, target, build_id, branch, 2415 archive_url=None, artifacts='', files=''): 2416 """Polling devserver.is_staged until all artifacts are staged. 2417 2418 @param target: Target of the android build to stage, e.g., 2419 shamu-userdebug. 2420 @param build_id: Build id of the android build to stage. 2421 @param branch: Branch of the android build to stage. 2422 @param archive_url: Google Storage URL for the build. 2423 @param artifacts: Comma separated list of artifacts to download. 2424 @param files: Comma separated list of files to download. 2425 2426 @return: True if all artifacts are staged in devserver. 
2427 """ 2428 kwargs = {'target': target, 2429 'build_id': build_id, 2430 'branch': branch, 2431 'artifacts': artifacts, 2432 'files': files, 2433 'os_type': 'android'} 2434 if archive_url: 2435 kwargs['archive_url'] = archive_url 2436 return self._poll_is_staged(**kwargs) 2437 2438 2439 @remote_devserver_call() 2440 def call_and_wait(self, call_name, target, build_id, branch, archive_url, 2441 artifacts, files, error_message, 2442 expected_response=SUCCESS): 2443 """Helper method to make a urlopen call, and wait for artifacts staged. 2444 2445 @param call_name: name of devserver rpc call. 2446 @param target: Target of the android build to stage, e.g., 2447 shamu-userdebug. 2448 @param build_id: Build id of the android build to stage. 2449 @param branch: Branch of the android build to stage. 2450 @param archive_url: Google Storage URL for the CrOS build. 2451 @param artifacts: Comma separated list of artifacts to download. 2452 @param files: Comma separated list of files to download. 2453 @param expected_response: Expected response from rpc, default to 2454 |Success|. If it's set to None, do not compare 2455 the actual response. Any response is consider 2456 to be good. 2457 @param error_message: Error message to be thrown if response does not 2458 match expected_response. 2459 2460 @return: The response from rpc. 2461 @raise DevServerException upon any return code that's expected_response. 2462 2463 """ 2464 kwargs = {'target': target, 2465 'build_id': build_id, 2466 'branch': branch, 2467 'artifacts': artifacts, 2468 'files': files, 2469 'os_type': 'android'} 2470 if archive_url: 2471 kwargs['archive_url'] = archive_url 2472 return self._call_and_wait(call_name, error_message, expected_response, 2473 **kwargs) 2474 2475 2476 @remote_devserver_call() 2477 def stage_artifacts(self, target=None, build_id=None, branch=None, 2478 image=None, artifacts=None, files='', archive_url=None): 2479 """Tell the devserver to download and stage |artifacts| from |image|. 
2480 2481 This is the main call point for staging any specific artifacts for a 2482 given build. To see the list of artifacts one can stage see: 2483 2484 ~src/platfrom/dev/artifact_info.py. 2485 2486 This is maintained along with the actual devserver code. 2487 2488 @param target: Target of the android build to stage, e.g., 2489 shamu-userdebug. 2490 @param build_id: Build id of the android build to stage. 2491 @param branch: Branch of the android build to stage. 2492 @param image: Name of a build to test, in the format of 2493 branch/target/build_id 2494 @param artifacts: A list of artifacts. 2495 @param files: A list of files to stage. 2496 @param archive_url: Optional parameter that has the archive_url to stage 2497 this artifact from. Default is specified in autotest config + 2498 image. 2499 2500 @raise DevServerException upon any return code that's not HTTP OK. 2501 """ 2502 if image and not target and not build_id and not branch: 2503 branch, target, build_id = utils.parse_launch_control_build(image) 2504 if not target or not build_id or not branch: 2505 raise DevServerException('Must specify all build info (target, ' 2506 'build_id and branch) to stage.') 2507 2508 android_build_info = {'target': target, 2509 'build_id': build_id, 2510 'branch': branch} 2511 if not artifacts and not files: 2512 raise DevServerException('Must specify something to stage.') 2513 if not all(android_build_info.values()): 2514 raise DevServerException( 2515 'To stage an Android build, must specify target, build id ' 2516 'and branch.') 2517 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2518 self._stage_artifacts(build, artifacts, files, archive_url, 2519 **android_build_info) 2520 2521 def get_pull_url(self, target, build_id, branch): 2522 """Get the url to pull files from the devserver. 2523 2524 @param target: Target of the android build, e.g., shamu_userdebug 2525 @param build_id: Build id of the android build. 2526 @param branch: Branch of the android build. 
2527 2528 @return A url to pull files from the dev server given a specific 2529 android build. 2530 """ 2531 return os.path.join(self.url(), 'static', branch, target, build_id) 2532 2533 2534 def trigger_download(self, target, build_id, branch, artifacts=None, 2535 files='', os='android', synchronous=True): 2536 """Tell the devserver to download and stage an Android build. 2537 2538 Tells the devserver to fetch an Android build from the image storage 2539 server named by _get_image_storage_server(). 2540 2541 If |synchronous| is True, waits for the entire download to finish 2542 staging before returning. Otherwise only the artifacts necessary 2543 to start installing images onto DUT's will be staged before returning. 2544 A caller can then call finish_download to guarantee the rest of the 2545 artifacts have finished staging. 2546 2547 @param target: Target of the android build to stage, e.g., 2548 shamu-userdebug. 2549 @param build_id: Build id of the android build to stage. 2550 @param branch: Branch of the android build to stage. 2551 @param artifacts: A string of artifacts separated by comma. If None, 2552 use the default artifacts for Android or Brillo build. 2553 @param files: String of file seperated by commas. 2554 @param os: OS artifacts to download (android/brillo). 2555 @param synchronous: if True, waits until all components of the image are 2556 staged before returning. 2557 2558 @raise DevServerException upon any return code that's not HTTP OK. 
2559 2560 """ 2561 android_build_info = {'target': target, 2562 'build_id': build_id, 2563 'branch': branch} 2564 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2565 if not artifacts: 2566 board = target.split('-')[0] 2567 artifacts = ( 2568 android_utils.AndroidArtifacts.get_artifacts_for_reimage( 2569 board, os)) 2570 self._trigger_download(build, artifacts, files=files, 2571 synchronous=synchronous, **android_build_info) 2572 2573 2574 def finish_download(self, target, build_id, branch, os='android'): 2575 """Tell the devserver to finish staging an Android build. 2576 2577 If trigger_download is called with synchronous=False, it will return 2578 before all artifacts have been staged. This method contacts the 2579 devserver and blocks until all staging is completed and should be 2580 called after a call to trigger_download. 2581 2582 @param target: Target of the android build to stage, e.g., 2583 shamu-userdebug. 2584 @param build_id: Build id of the android build to stage. 2585 @param branch: Branch of the android build to stage. 2586 @param os: OS artifacts to download (android/brillo). 2587 2588 @raise DevServerException upon any return code that's not HTTP OK. 2589 """ 2590 android_build_info = {'target': target, 2591 'build_id': build_id, 2592 'branch': branch} 2593 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2594 board = target.split('-')[0] 2595 artifacts = ( 2596 android_utils.AndroidArtifacts.get_artifacts_for_reimage( 2597 board)) 2598 self._finish_download(build, artifacts, files='', **android_build_info) 2599 2600 2601 def get_staged_file_url(self, filename, target, build_id, branch): 2602 """Returns the url of a staged file for this image on the devserver. 2603 2604 @param filename: Name of the file. 2605 @param target: Target of the android build to stage, e.g., 2606 shamu-userdebug. 2607 @param build_id: Build id of the android build to stage. 2608 @param branch: Branch of the android build to stage. 
2609 2610 @return: The url of a staged file for this image on the devserver. 2611 """ 2612 android_build_info = {'target': target, 2613 'build_id': build_id, 2614 'branch': branch, 2615 'os_type': 'android'} 2616 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2617 return '/'.join([self._get_image_url(build), filename]) 2618 2619 2620 @remote_devserver_call() 2621 def translate(self, build_name): 2622 """Translate the build name if it's in LATEST format. 2623 2624 If the build name is in the format [branch]/[target]/LATEST, return the 2625 latest build in Launch Control otherwise return the build name as is. 2626 2627 @param build_name: build_name to check. 2628 2629 @return The actual build name to use. 2630 """ 2631 branch, target, build_id = utils.parse_launch_control_build(build_name) 2632 if build_id.upper() != 'LATEST': 2633 return build_name 2634 call = self.build_call('latestbuild', branch=branch, target=target, 2635 os_type='android') 2636 translated_build_id = self.run_call(call) 2637 translated_build = (ANDROID_BUILD_NAME_PATTERN % 2638 {'branch': branch, 2639 'target': target, 2640 'build_id': translated_build_id}) 2641 logging.debug('Translated relative build %s to %s', build_name, 2642 translated_build) 2643 return translated_build 2644 2645 2646def _is_load_healthy(load): 2647 """Check if devserver's load meets the minimum threshold. 2648 2649 @param load: The devserver's load stats to check. 2650 2651 @return: True if the load meets the minimum threshold. Return False 2652 otherwise. 2653 2654 """ 2655 # Threshold checks, including CPU load. 
2656 if load[DevServer.CPU_LOAD] > DevServer.MAX_CPU_LOAD: 2657 logging.debug('CPU load of devserver %s is at %s%%, which is higher ' 2658 'than the threshold of %s%%', load['devserver'], 2659 load[DevServer.CPU_LOAD], DevServer.MAX_CPU_LOAD) 2660 return False 2661 if load[DevServer.NETWORK_IO] > DevServer.MAX_NETWORK_IO: 2662 logging.debug('Network IO of devserver %s is at %i Bps, which is ' 2663 'higher than the threshold of %i bytes per second.', 2664 load['devserver'], load[DevServer.NETWORK_IO], 2665 DevServer.MAX_NETWORK_IO) 2666 return False 2667 return True 2668 2669 2670def _compare_load(devserver1, devserver2): 2671 """Comparator function to compare load between two devservers. 2672 2673 @param devserver1: A dictionary of devserver load stats to be compared. 2674 @param devserver2: A dictionary of devserver load stats to be compared. 2675 2676 @return: Negative value if the load of `devserver1` is less than the load 2677 of `devserver2`. Return positive value otherwise. 2678 2679 """ 2680 return int(devserver1[DevServer.DISK_IO] - devserver2[DevServer.DISK_IO]) 2681 2682 2683def _get_subnet_for_host_ip(host_ip, 2684 restricted_subnets=utils.RESTRICTED_SUBNETS): 2685 """Get the subnet for a given host IP. 2686 2687 @param host_ip: the IP of a DUT. 2688 @param restricted_subnets: A list of restriected subnets. 2689 2690 @return: a (subnet_ip, mask_bits) tuple. If no matched subnet for the 2691 host_ip, return (None, None). 2692 """ 2693 for subnet_ip, mask_bits in restricted_subnets: 2694 if utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits): 2695 return subnet_ip, mask_bits 2696 2697 return None, None 2698 2699 2700def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None): 2701 """Get the devserver with the least load. 2702 2703 Iterate through all devservers and get the one with least load. 2704 2705 TODO(crbug.com/486278): Devserver with required build already staged should 2706 take higher priority. 
This will need check_health call to be able to verify 2707 existence of a given build/artifact. Also, in case all devservers are 2708 overloaded, the logic here should fall back to the old behavior that randomly 2709 selects a devserver based on the hash of the image name/url. 2710 2711 @param devserver_type: Type of devserver to select from. Default is set to 2712 ImageServer. 2713 @param hostname: Hostname of the dut that the devserver is used for. The 2714 picked devserver needs to respect the location of the host if 2715 `prefer_local_devserver` is set to True or `restricted_subnets` is 2716 set. 2717 2718 @return: Name of the devserver with the least load. 2719 2720 """ 2721 logging.debug('Get the least loaded %r', devserver_type) 2722 devservers, can_retry = devserver_type.get_available_devservers( 2723 hostname) 2724 # If no healthy devservers available and can_retry is False, return None. 2725 # Otherwise, relax the constrain on hostname, allow all devservers to be 2726 # available. 2727 if not devserver_type.get_healthy_devserver('', devservers): 2728 if not can_retry: 2729 return None 2730 else: 2731 devservers, _ = devserver_type.get_available_devservers() 2732 2733 # get_devserver_load call needs to be made in a new process to allow force 2734 # timeout using signal. 2735 output = multiprocessing.Queue() 2736 processes = [] 2737 for devserver in devservers: 2738 processes.append(multiprocessing.Process( 2739 target=devserver_type.get_devserver_load_wrapper, 2740 args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output))) 2741 2742 for p in processes: 2743 p.start() 2744 for p in processes: 2745 p.join() 2746 loads = [output.get() for p in processes] 2747 # Filter out any load failed to be retrieved or does not support load check. 
2748 loads = [load for load in loads if load and DevServer.CPU_LOAD in load and 2749 DevServer.is_free_disk_ok(load) and 2750 DevServer.is_apache_client_count_ok(load)] 2751 if not loads: 2752 logging.debug('Failed to retrieve load stats from any devserver. No ' 2753 'load balancing can be applied.') 2754 return None 2755 loads = [load for load in loads if _is_load_healthy(load)] 2756 if not loads: 2757 logging.error('No devserver has the capacity to be selected.') 2758 return None 2759 loads = sorted(loads, cmp=_compare_load) 2760 return loads[0]['devserver'] 2761 2762 2763def resolve(build, hostname=None, ban_list=None): 2764 """Resolve a devserver can be used for given build and hostname. 2765 2766 @param build: Name of a build to stage on devserver, e.g., 2767 ChromeOS build: daisy-release/R50-1234.0.0 2768 Launch Control build: git_mnc_release/shamu-eng 2769 @param hostname: Hostname of a devserver for, default is None, which means 2770 devserver is not restricted by the network location of the host. 2771 @param ban_list: The blacklist of devservers shouldn't be chosen. 2772 2773 @return: A DevServer instance that can be used to stage given build for the 2774 given host. 2775 """ 2776 if utils.is_launch_control_build(build): 2777 return AndroidBuildServer.resolve(build, hostname) 2778 else: 2779 return ImageServer.resolve(build, hostname, ban_list=ban_list) 2780