dev_server.py revision 2c32d6b593c4987a525ef162d6704fa6d6d7c0b0
1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5from distutils import version 6import cStringIO 7import HTMLParser 8import httplib 9import json 10import logging 11import multiprocessing 12import os 13import re 14import socket 15import time 16import urllib2 17import urlparse 18 19from autotest_lib.client.bin import utils as bin_utils 20from autotest_lib.client.common_lib import android_utils 21from autotest_lib.client.common_lib import error 22from autotest_lib.client.common_lib import global_config 23from autotest_lib.client.common_lib import utils 24from autotest_lib.client.common_lib.cros import retry 25from autotest_lib.server import utils as server_utils 26# TODO(cmasone): redo this class using requests module; http://crosbug.com/30107 27 28try: 29 from chromite.lib import metrics 30except ImportError: 31 metrics = utils.metrics_mock 32 33 34CONFIG = global_config.global_config 35# This file is generated at build time and specifies, per suite and per test, 36# the DEPENDENCIES list specified in each control file. It's a dict of dicts: 37# {'bvt': {'/path/to/autotest/control/site_tests/test1/control': ['dep1']} 38# 'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']} 39# 'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'], 40# '/path/to/autotest/control/site_tests/test3/control': ['dep3']} 41# } 42DEPENDENCIES_FILE = 'test_suites/dependency_info' 43# Number of seconds for caller to poll devserver's is_staged call to check if 44# artifacts are staged. 45_ARTIFACT_STAGE_POLLING_INTERVAL = 5 46# Artifacts that should be staged when client calls devserver RPC to stage an 47# image. 48_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = 'full_payload,test_suites,stateful' 49# Artifacts that should be staged when client calls devserver RPC to stage an 50# image with autotest artifact. 51_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,' 52 'control_files,stateful,' 53 'autotest_packages') 54# Artifacts that should be staged when client calls devserver RPC to stage an 55# Android build. 56_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions') 57SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value( 58 'CROS', 'skip_devserver_health_check', type=bool) 59# Number of seconds for the call to get devserver load to time out. 60TIMEOUT_GET_DEVSERVER_LOAD = 2.0 61 62# Android artifact path in devserver 63ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value( 64 'CROS', 'android_build_name_pattern', type=str).replace('\\', '') 65 66# Return value from a devserver RPC indicating the call succeeded. 67SUCCESS = 'Success' 68 69# The timeout minutes for a given devserver ssh call. 70DEVSERVER_SSH_TIMEOUT_MINS = 1 71 72# Error message for invalid devserver response. 73ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error' 74 75# Error message for devserver call timedout. 76ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout' 77 78# The timeout minutes for waiting a devserver staging. 79DEVSERVER_IS_STAGING_RETRY_MIN = 100 80 81# The timeout minutes for waiting a DUT auto-update finished. 82DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100 83 84# The total times of devserver triggering CrOS auto-update. 85AU_RETRY_LIMIT = 3 86 87# Number of seconds for caller to poll devserver's get_au_status call to 88# check if cros auto-update is finished. 89CROS_AU_POLLING_INTERVAL = 10 90 91# Number of seconds for intervals between retrying auto-update calls. 92CROS_AU_RETRY_INTERVAL = 20 93 94# The file name for auto-update logs. 95CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log' 96 97# Provision error patterns. 98# People who see this should know that they shouldn't change these 99# classification strings. These strings are used for monitoring provision 100# failures. Any changes may mess up the stats. 101_EXCEPTION_PATTERNS = [ 102 # Raised when devserver portfile does not exist on host. 103 (r".*Devserver portfile does not exist!.*$", 104 '(1) Devserver portfile does not exist on host'), 105 # Raised when devserver cannot copy packages to host. 106 (r".*Could not copy .* to device.*$", 107 '(2) Cannot copy packages to host'), 108 # Raised when devserver fails to run specific commands on host. 109 (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$", 110 '(3) Fail to run specific command on host'), 111 # Raised when new build fails to boot on the host. 112 (r'.*RootfsUpdateError: Build .* failed to boot on.*$', 113 '(4) Build failed to boot on host'), 114 # Raised when the auto-update process is timed out. 115 (r'.*The CrOS auto-update process is timed out, ' 116 'thus will be terminated.*$', 117 '(5) Auto-update is timed out'), 118 # Raised when the host is not pingable. 119 (r".*DeviceNotPingableError.*$", 120 '(6) Host is not pingable during auto-update'), 121 # Raised when hosts have unexpected status after rootfs update. 122 (r'.*Update failed with unexpected update status: ' 123 'UPDATE_STATUS_IDLE.*$', 124 '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs ' 125 'update'), 126 # Raised when devserver returns non-json response to shard/drone. 127 (r'.*No JSON object could be decoded.*$', 128 '(8) Devserver returned non-json object'), 129 # Raised when devserver loses host's ssh connection 130 (r'.*SSHConnectionError\: .* port 22\: Connection timed out.*$', 131 "(9) Devserver lost host's ssh connection"), 132 # Raised when error happens in writing files to host 133 (r'.*Write failed\: Broken pipe.*$', 134 "(10) Broken pipe while writing or connecting to host")] 135 136PREFER_LOCAL_DEVSERVER = CONFIG.get_config_value( 137 'CROS', 'prefer_local_devserver', type=bool, default=False) 138 139ENABLE_SSH_CONNECTION_FOR_DEVSERVER = CONFIG.get_config_value( 140 'CROS', 'enable_ssh_connection_for_devserver', type=bool, 141 default=False) 142 143# Directory to save auto-update logs 144AUTO_UPDATE_LOG_DIR = 'autoupdate_logs' 145 146DEFAULT_SUBNET_MASKBIT = 19 147 148 149class DevServerException(Exception): 150 """Raised when the dev server returns a non-200 HTTP response.""" 151 pass 152 153 154class DevServerOverloadException(Exception): 155 """Raised when the dev server returns a 502 HTTP response.""" 156 pass 157 158 159class MarkupStripper(HTMLParser.HTMLParser): 160 """HTML parser that strips HTML tags, coded characters like & 161 162 Works by, basically, not doing anything for any tags, and only recording 163 the content of text nodes in an internal data structure. 164 """ 165 def __init__(self): 166 self.reset() 167 self.fed = [] 168 169 170 def handle_data(self, d): 171 """Consume content of text nodes, store it away.""" 172 self.fed.append(d) 173 174 175 def get_data(self): 176 """Concatenate and return all stored data.""" 177 return ''.join(self.fed) 178 179 180def _strip_http_message(message): 181 """Strip the HTTP marker from the an HTTP message. 182 183 @param message: A string returned by an HTTP call. 184 185 @return: A string with HTTP marker being stripped. 186 """ 187 strip = MarkupStripper() 188 try: 189 strip.feed(message.decode('utf_32')) 190 except UnicodeDecodeError: 191 strip.feed(message) 192 return strip.get_data() 193 194 195def _get_image_storage_server(): 196 return CONFIG.get_config_value('CROS', 'image_storage_server', type=str) 197 198 199def _get_canary_channel_server(): 200 """ 201 Get the url of the canary-channel server, 202 eg: gsutil://chromeos-releases/canary-channel/<board>/<release> 203 204 @return: The url to the canary channel server. 205 """ 206 return CONFIG.get_config_value('CROS', 'canary_channel_server', type=str) 207 208 209def _get_storage_server_for_artifacts(artifacts=None): 210 """Gets the appropriate storage server for the given artifacts. 211 212 @param artifacts: A list of artifacts we need to stage. 213 @return: The address of the storage server that has these artifacts. 214 The default image storage server if no artifacts are specified. 215 """ 216 factory_artifact = global_config.global_config.get_config_value( 217 'CROS', 'factory_artifact', type=str, default='') 218 if artifacts and factory_artifact and factory_artifact in artifacts: 219 return _get_canary_channel_server() 220 return _get_image_storage_server() 221 222 223def _reverse_lookup_from_config(address): 224 """Look up hostname for the given IP address. 225 226 This uses the hostname-address map from the config file. 227 228 If multiple hostnames map to the same IP address, the first one 229 defined in the configuration file takes precedence. 230 231 @param address: IP address string 232 @returns: hostname string, or original input if not found 233 """ 234 for hostname, addr in _get_hostname_addr_map().iteritems(): 235 if addr == address: 236 return hostname 237 return address 238 239 240def _get_hostname_addr_map(): 241 """Get hostname address mapping from config. 242 243 @return: dict mapping server hostnames to addresses 244 """ 245 return CONFIG.get_section_as_dict('HOSTNAME_ADDR_MAP') 246 247 248def _get_dev_server_list(): 249 return CONFIG.get_config_value('CROS', 'dev_server', type=list, default=[]) 250 251 252def _get_crash_server_list(): 253 return CONFIG.get_config_value('CROS', 'crash_server', type=list, 254 default=[]) 255 256 257def remote_devserver_call(timeout_min=DEVSERVER_IS_STAGING_RETRY_MIN, 258 exception_to_raise=DevServerException): 259 """A decorator to use with remote devserver calls. 260 261 This decorator converts urllib2.HTTPErrors into DevServerExceptions 262 with any embedded error info converted into plain text. The method 263 retries on urllib2.URLError or error.CmdError to avoid devserver flakiness. 264 """ 265 #pylint: disable=C0111 266 267 def inner_decorator(method): 268 label = method.__name__ if hasattr(method, '__name__') else None 269 def metrics_wrapper(*args, **kwargs): 270 @retry.retry((urllib2.URLError, error.CmdError, 271 DevServerOverloadException), 272 timeout_min=timeout_min, 273 exception_to_raise=exception_to_raise, 274 label=label) 275 def wrapper(): 276 """This wrapper actually catches the HTTPError.""" 277 try: 278 return method(*args, **kwargs) 279 except urllib2.HTTPError as e: 280 error_markup = e.read() 281 raise DevServerException(_strip_http_message(error_markup)) 282 283 try: 284 return wrapper() 285 except Exception as e: 286 if ERR_MSG_FOR_TIMED_OUT_CALL in str(e): 287 dev_server = None 288 if args and isinstance(args[0], DevServer): 289 dev_server = args[0].hostname 290 elif 'devserver' in kwargs: 291 dev_server = get_hostname(kwargs['devserver']) 292 293 logging.debug('RPC call %s has timed out on devserver %s.', 294 label, dev_server) 295 c = metrics.Counter( 296 'chromeos/autotest/devserver/call_timeout') 297 c.increment(fields={'dev_server': dev_server, 298 'healthy': label}) 299 300 raise 301 302 return metrics_wrapper 303 304 return inner_decorator 305 306 307def get_hostname(url): 308 """Get the hostname portion of a URL 309 310 schema://hostname:port/path 311 312 @param url: a Url string 313 @return: a hostname string 314 """ 315 return urlparse.urlparse(url).hostname 316 317 318class DevServer(object): 319 """Base class for all DevServer-like server stubs. 320 321 This is the base class for interacting with all Dev Server-like servers. 322 A caller should instantiate a sub-class of DevServer with: 323 324 host = SubClassServer.resolve(build) 325 server = SubClassServer(host) 326 """ 327 _MIN_FREE_DISK_SPACE_GB = 20 328 _MAX_APACHE_CLIENT_COUNT = 75 329 # Threshold for the CPU load percentage for a devserver to be selected. 330 MAX_CPU_LOAD = 80.0 331 # Threshold for the network IO, set to 80MB/s 332 MAX_NETWORK_IO = 1024 * 1024 * 80 333 DISK_IO = 'disk_total_bytes_per_second' 334 NETWORK_IO = 'network_total_bytes_per_second' 335 CPU_LOAD = 'cpu_percent' 336 FREE_DISK = 'free_disk' 337 AU_PROCESS = 'au_process_count' 338 STAGING_THREAD_COUNT = 'staging_thread_count' 339 APACHE_CLIENT_COUNT = 'apache_client_count' 340 341 342 def __init__(self, devserver): 343 self._devserver = devserver 344 345 346 def url(self): 347 """Returns the url for this devserver.""" 348 return self._devserver 349 350 351 @property 352 def hostname(self): 353 """Return devserver hostname parsed from the devserver URL. 354 355 Note that this is likely parsed from the devserver URL from 356 shadow_config.ini, meaning that the "hostname" part of the 357 devserver URL is actually an IP address. 358 359 @return hostname string 360 """ 361 return get_hostname(self.url()) 362 363 364 @property 365 def resolved_hostname(self): 366 """Return devserver hostname, resolved from its IP address. 367 368 Unlike the hostname property, this property attempts to look up 369 the proper hostname from the devserver IP address. If lookup 370 fails, then fall back to whatever the hostname property would 371 have returned. 372 373 @return hostname string 374 """ 375 return _reverse_lookup_from_config(self.hostname) 376 377 378 @staticmethod 379 def get_server_url(url): 380 """Get the devserver url from a repo url, which includes build info. 381 382 @param url: A job repo url. 383 384 @return A devserver url, e.g., http://127.0.0.10:8080 385 """ 386 res = urlparse.urlparse(url) 387 if res.netloc: 388 return res.scheme + '://' + res.netloc 389 390 391 @classmethod 392 def get_devserver_load_wrapper(cls, devserver, timeout_sec, output): 393 """A wrapper function to call get_devserver_load in parallel. 394 395 @param devserver: url of the devserver. 396 @param timeout_sec: Number of seconds before time out the devserver 397 call. 398 @param output: An output queue to save results to. 399 """ 400 load = cls.get_devserver_load(devserver, timeout_min=timeout_sec/60.0) 401 if load: 402 load['devserver'] = devserver 403 output.put(load) 404 405 406 @classmethod 407 def get_devserver_load(cls, devserver, 408 timeout_min=DEVSERVER_SSH_TIMEOUT_MINS): 409 """Returns True if the |devserver| is healthy to stage build. 410 411 @param devserver: url of the devserver. 412 @param timeout_min: How long to wait in minutes before deciding the 413 the devserver is not up (float). 414 415 @return: A dictionary of the devserver's load. 416 417 """ 418 call = cls._build_call(devserver, 'check_health') 419 @remote_devserver_call(timeout_min=timeout_min) 420 def get_load(devserver=devserver): 421 """Inner method that makes the call.""" 422 return cls.run_call(call, timeout=timeout_min*60) 423 424 try: 425 return json.load(cStringIO.StringIO(get_load(devserver=devserver))) 426 except Exception as e: 427 logging.error('Devserver call failed: "%s", timeout: %s seconds,' 428 ' Error: %s', call, timeout_min * 60, e) 429 430 431 @classmethod 432 def is_free_disk_ok(cls, load): 433 """Check if a devserver has enough free disk. 434 435 @param load: A dict of the load of the devserver. 436 437 @return: True if the devserver has enough free disk or disk check is 438 skipped in global config. 439 440 """ 441 if SKIP_DEVSERVER_HEALTH_CHECK: 442 logging.debug('devserver health check is skipped.') 443 elif load[cls.FREE_DISK] < cls._MIN_FREE_DISK_SPACE_GB: 444 return False 445 446 return True 447 448 449 @classmethod 450 def is_apache_client_count_ok(cls, load): 451 """Check if a devserver has enough Apache connections available. 452 453 Apache server by default has maximum of 150 concurrent connections. If 454 a devserver has too many live connections, it likely indicates the 455 server is busy handling many long running download requests, e.g., 456 downloading stateful partitions. It is better not to add more requests 457 to it. 458 459 @param load: A dict of the load of the devserver. 460 461 @return: True if the devserver has enough Apache connections available, 462 or disk check is skipped in global config. 463 464 """ 465 if SKIP_DEVSERVER_HEALTH_CHECK: 466 logging.debug('devserver health check is skipped.') 467 elif cls.APACHE_CLIENT_COUNT not in load: 468 logging.debug('Apache client count is not collected from devserver.') 469 elif (load[cls.APACHE_CLIENT_COUNT] > 470 cls._MAX_APACHE_CLIENT_COUNT): 471 return False 472 473 return True 474 475 476 @classmethod 477 def devserver_healthy(cls, devserver, 478 timeout_min=DEVSERVER_SSH_TIMEOUT_MINS): 479 """Returns True if the |devserver| is healthy to stage build. 480 481 @param devserver: url of the devserver. 482 @param timeout_min: How long to wait in minutes before deciding the 483 the devserver is not up (float). 484 485 @return: True if devserver is healthy. Return False otherwise. 486 487 """ 488 c = metrics.Counter('chromeos/autotest/devserver/devserver_healthy') 489 reason = '' 490 healthy = False 491 load = cls.get_devserver_load(devserver, timeout_min=timeout_min) 492 try: 493 if not load: 494 # Failed to get the load of devserver. 495 reason = '(1) Failed to get load.' 496 return False 497 498 apache_ok = cls.is_apache_client_count_ok(load) 499 if not apache_ok: 500 reason = '(2) Apache client count too high.' 501 logging.error('Devserver check_health failed. Live Apache client ' 502 'count is too high: %d.', 503 load[cls.APACHE_CLIENT_COUNT]) 504 return False 505 506 disk_ok = cls.is_free_disk_ok(load) 507 if not disk_ok: 508 reason = '(3) Disk space too low.' 509 logging.error('Devserver check_health failed. Free disk space is ' 510 'low. Only %dGB is available.', 511 load[cls.FREE_DISK]) 512 healthy = bool(disk_ok) 513 return disk_ok 514 finally: 515 c.increment(fields={'dev_server': cls(devserver).resolved_hostname, 516 'healthy': healthy, 517 'reason': reason}) 518 # Monitor how many AU processes the devserver is currently running. 519 if load is not None and load.get(DevServer.AU_PROCESS): 520 c_au = metrics.Gauge( 521 'chromeos/autotest/devserver/devserver_au_count') 522 c_au.set( 523 load.get(DevServer.AU_PROCESS), 524 fields={'dev_server': cls(devserver).resolved_hostname}) 525 526 527 @staticmethod 528 def _build_call(host, method, **kwargs): 529 """Build a URL to |host| that calls |method|, passing |kwargs|. 530 531 Builds a URL that calls |method| on the dev server defined by |host|, 532 passing a set of key/value pairs built from the dict |kwargs|. 533 534 @param host: a string that is the host basename e.g. http://server:90. 535 @param method: the dev server method to call. 536 @param kwargs: a dict mapping arg names to arg values. 537 @return the URL string. 538 """ 539 argstr = '&'.join(map(lambda x: "%s=%s" % x, kwargs.iteritems())) 540 return "%(host)s/%(method)s?%(argstr)s" % dict( 541 host=host, method=method, argstr=argstr) 542 543 544 def build_call(self, method, **kwargs): 545 """Builds a devserver RPC string that is used by 'run_call()'. 546 547 @param method: remote devserver method to call. 548 """ 549 return self._build_call(self._devserver, method, **kwargs) 550 551 552 @classmethod 553 def build_all_calls(cls, method, **kwargs): 554 """Builds a list of URLs that makes RPC calls on all devservers. 555 556 Build a URL that calls |method| on the dev server, passing a set 557 of key/value pairs built from the dict |kwargs|. 558 559 @param method: the dev server method to call. 560 @param kwargs: a dict mapping arg names to arg values 561 562 @return the URL string 563 """ 564 calls = [] 565 # Note we use cls.servers as servers is class specific. 566 for server in cls.servers(): 567 if cls.devserver_healthy(server): 568 calls.append(cls._build_call(server, method, **kwargs)) 569 570 return calls 571 572 573 @classmethod 574 def run_call(cls, call, readline=False, timeout=None): 575 """Invoke a given devserver call using urllib.open. 576 577 Open the URL with HTTP, and return the text of the response. Exceptions 578 may be raised as for urllib2.urlopen(). 579 580 @param call: a url string that calls a method to a devserver. 581 @param readline: whether read http response line by line. 582 @param timeout: The timeout seconds for this urlopen call. 583 584 @return the results of this call. 585 """ 586 if timeout is not None: 587 return utils.urlopen_socket_timeout( 588 call, timeout=timeout).read() 589 elif readline: 590 response = urllib2.urlopen(call) 591 return [line.rstrip() for line in response] 592 else: 593 return urllib2.urlopen(call).read() 594 595 596 @staticmethod 597 def servers(): 598 """Returns a list of servers that can serve as this type of server.""" 599 raise NotImplementedError() 600 601 602 @classmethod 603 def get_devservers_in_same_subnet(cls, ip, mask_bits=DEFAULT_SUBNET_MASKBIT, 604 unrestricted_only=False): 605 """Get the devservers in the same subnet of the given ip. 606 607 @param ip: The IP address of a dut to look for devserver. 608 @param mask_bits: Number of mask bits. Default is 19. 609 @param unrestricted_only: Set to True to select from devserver in 610 unrestricted subnet only. Default is False. 611 612 @return: A list of devservers in the same subnet of the given ip. 613 614 """ 615 # server from cls.servers() is a URL, e.g., http://10.1.1.10:8082, so 616 # we need a dict to return the full devserver path once the IPs are 617 # filtered in get_servers_in_same_subnet. 618 server_names = {} 619 all_devservers = [] 620 devservers = (cls.get_unrestricted_devservers() if unrestricted_only 621 else cls.servers()) 622 for server in devservers: 623 server_name = get_hostname(server) 624 server_names[server_name] = server 625 all_devservers.append(server_name) 626 devservers = utils.get_servers_in_same_subnet(ip, mask_bits, 627 all_devservers) 628 return [server_names[s] for s in devservers] 629 630 631 @classmethod 632 def get_unrestricted_devservers( 633 cls, restricted_subnets=utils.RESTRICTED_SUBNETS): 634 """Get the devservers not in any restricted subnet specified in 635 restricted_subnets. 636 637 @param restricted_subnets: A list of restriected subnets. 638 639 @return: A list of devservers not in any restricted subnet. 640 641 """ 642 if not restricted_subnets: 643 return cls.servers() 644 645 devservers = [] 646 for server in cls.servers(): 647 server_name = get_hostname(server) 648 if not utils.get_restricted_subnet(server_name, restricted_subnets): 649 devservers.append(server) 650 return devservers 651 652 653 @classmethod 654 def get_healthy_devserver(cls, build, devservers, ban_list=None): 655 """"Get a healthy devserver instance from the list of devservers. 656 657 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514). 658 @param devservers: The devserver list to be chosen out a healthy one. 659 @param ban_list: The blacklist of devservers we don't want to choose. 660 Default is None. 661 662 @return: A DevServer object of a healthy devserver. Return None if no 663 healthy devserver is found. 664 665 """ 666 while devservers: 667 hash_index = hash(build) % len(devservers) 668 devserver = devservers.pop(hash_index) 669 if ban_list and devserver in ban_list: 670 continue 671 672 if cls.devserver_healthy(devserver): 673 return cls(devserver) 674 675 676 @classmethod 677 def get_available_devservers(cls, hostname=None, 678 prefer_local_devserver=PREFER_LOCAL_DEVSERVER, 679 restricted_subnets=utils.RESTRICTED_SUBNETS): 680 """Get devservers in the same subnet of the given hostname. 681 682 @param hostname: Hostname of a DUT to choose devserver for. 683 684 @return: A tuple of (devservers, can_retry), devservers is a list of 685 devservers that's available for the given hostname. can_retry 686 is a flag that indicate if caller can retry the selection of 687 devserver if no devserver in the returned devservers can be 688 used. For example, if hostname is in a restricted subnet, 689 can_retry will be False. 690 """ 691 host_ip = None 692 if hostname: 693 host_ip = bin_utils.get_ip_address(hostname) 694 if not host_ip: 695 logging.error('Failed to get IP address of %s. Will pick a ' 696 'devserver without subnet constraint.', hostname) 697 698 if not host_ip: 699 return cls.get_unrestricted_devservers(restricted_subnets), False 700 701 # Go through all restricted subnet settings and check if the DUT is 702 # inside a restricted subnet. If so, only return the devservers in the 703 # restricted subnet and doesn't allow retry. 704 if host_ip and restricted_subnets: 705 for subnet_ip, mask_bits in restricted_subnets: 706 if utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits): 707 logging.debug('The host %s (%s) is in a restricted subnet. ' 708 'Try to locate a devserver inside subnet ' 709 '%s:%d.', hostname, host_ip, subnet_ip, 710 mask_bits) 711 devservers = cls.get_devservers_in_same_subnet( 712 subnet_ip, mask_bits) 713 return devservers, False 714 715 # If prefer_local_devserver is set to True and the host is not in 716 # restricted subnet, pick a devserver in the same subnet if possible. 717 # Set can_retry to True so it can pick a different devserver if all 718 # devservers in the same subnet are down. 719 if prefer_local_devserver: 720 return (cls.get_devservers_in_same_subnet( 721 host_ip, DEFAULT_SUBNET_MASKBIT, True), True) 722 723 return cls.get_unrestricted_devservers(restricted_subnets), False 724 725 726 @classmethod 727 def resolve(cls, build, hostname=None, ban_list=None): 728 """"Resolves a build to a devserver instance. 729 730 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514). 731 @param hostname: The hostname of dut that requests a devserver. It's 732 used to make sure a devserver in the same subnet is 733 preferred. 734 @param ban_list: The blacklist of devservers shouldn't be chosen. 735 736 @raise DevServerException: If no devserver is available. 737 """ 738 tried_devservers = set() 739 devservers, can_retry = cls.get_available_devservers(hostname) 740 if devservers: 741 tried_devservers |= set(devservers) 742 743 devserver = cls.get_healthy_devserver(build, devservers, 744 ban_list=ban_list) 745 746 if not devserver and can_retry: 747 # Find available devservers without dut location constrain. 748 devservers, _ = cls.get_available_devservers() 749 devserver = cls.get_healthy_devserver(build, devservers, 750 ban_list=ban_list) 751 if devservers: 752 tried_devservers |= set(devservers) 753 if devserver: 754 return devserver 755 else: 756 error_msg = ('All devservers are currently down: %s. ' 757 'dut hostname: %s' % 758 (tried_devservers, hostname)) 759 logging.error(error_msg) 760 raise DevServerException(error_msg) 761 762 763 @classmethod 764 def random(cls): 765 """Return a random devserver that's available. 766 767 Devserver election in `resolve` method is based on a hash of the 768 build that a caller wants to stage. The purpose is that different 769 callers requesting for the same build can get the same devserver, 770 while the lab is able to distribute different builds across all 771 devservers. That helps to reduce the duplication of builds across 772 all devservers. 773 This function returns a random devserver, by passing a random 774 pseudo build name to `resolve `method. 775 """ 776 return cls.resolve(build=str(time.time())) 777 778 779class CrashServer(DevServer): 780 """Class of DevServer that symbolicates crash dumps.""" 781 782 @staticmethod 783 def servers(): 784 return _get_crash_server_list() 785 786 787 @remote_devserver_call() 788 def symbolicate_dump(self, minidump_path, build): 789 """Ask the devserver to symbolicate the dump at minidump_path. 790 791 Stage the debug symbols for |build| and, if that works, ask the 792 devserver to symbolicate the dump at |minidump_path|. 793 794 @param minidump_path: the on-disk path of the minidump. 795 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514) 796 whose debug symbols are needed for symbolication. 797 @return The contents of the stack trace 798 @raise DevServerException upon any return code that's not HTTP OK. 799 """ 800 try: 801 import requests 802 except ImportError: 803 logging.warning("Can't 'import requests' to connect to dev server.") 804 return '' 805 f = {'dev_server': self.resolved_hostname} 806 c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump') 807 c.increment(fields=f) 808 # Symbolicate minidump. 809 m = 'chromeos/autotest/crashserver/symbolicate_dump_duration' 810 with metrics.SecondsTimer(m, fields=f): 811 call = self.build_call('symbolicate_dump', 812 archive_url=_get_image_storage_server() + build) 813 request = requests.post( 814 call, files={'minidump': open(minidump_path, 'rb')}) 815 if request.status_code == requests.codes.OK: 816 return request.text 817 818 error_fd = cStringIO.StringIO(request.text) 819 raise urllib2.HTTPError( 820 call, request.status_code, request.text, request.headers, 821 error_fd) 822 823 824 @classmethod 825 def get_available_devservers(cls, hostname): 826 """Get all available crash servers. 827 828 Crash server election doesn't need to count the location of hostname. 829 830 @param hostname: Hostname of a DUT to choose devserver for. 831 832 @return: A tuple of (all crash servers, False). can_retry is set to 833 False, as all crash servers are returned. There is no point to 834 retry. 835 """ 836 return cls.servers(), False 837 838 839class ImageServerBase(DevServer): 840 """Base class for devservers used to stage builds. 841 842 CrOS and Android builds are staged in different ways as they have different 843 sets of artifacts. This base class abstracts the shared functions between 844 the two types of ImageServer. 845 """ 846 847 @classmethod 848 def servers(cls): 849 """Returns a list of servers that can serve as a desired type of 850 devserver. 851 """ 852 return _get_dev_server_list() 853 854 855 def _get_image_url(self, image): 856 """Returns the url of the directory for this image on the devserver. 857 858 @param image: the image that was fetched. 859 """ 860 image = self.translate(image) 861 url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern', 862 type=str) 863 return (url_pattern % (self.url(), image)).replace('update', 'static') 864 865 866 @staticmethod 867 def create_metadata(server_name, image, artifacts=None, files=None): 868 """Create a metadata dictionary given the staged items. 869 870 The metadata can be send to metadata db along with stats. 871 872 @param server_name: name of the devserver, e.g 172.22.33.44. 873 @param image: The name of the image. 874 @param artifacts: A list of artifacts. 875 @param files: A list of files. 876 877 @return A metadata dictionary. 878 879 """ 880 metadata = {'devserver': server_name, 881 'image': image, 882 '_type': 'devserver'} 883 if artifacts: 884 metadata['artifacts'] = ' '.join(artifacts) 885 if files: 886 metadata['files'] = ' '.join(files) 887 return metadata 888 889 890 @classmethod 891 def run_ssh_call(cls, call, readline=False, timeout=None): 892 """Construct an ssh-based rpc call, and execute it. 893 894 @param call: a url string that calls a method to a devserver. 895 @param readline: whether read http response line by line. 896 @param timeout: The timeout seconds for ssh call. 897 898 @return the results of this call. 899 """ 900 hostname = get_hostname(call) 901 ssh_call = 'ssh %s \'curl "%s"\'' % (hostname, utils.sh_escape(call)) 902 timeout_seconds = timeout if timeout else DEVSERVER_SSH_TIMEOUT_MINS*60 903 try: 904 result = utils.run(ssh_call, timeout=timeout_seconds) 905 except error.CmdError as e: 906 logging.debug('Error occurred with exit_code %d when executing the ' 907 'ssh call: %s.', e.result_obj.exit_status, 908 e.result_obj.stderr) 909 c = metrics.Counter('chromeos/autotest/devserver/ssh_failure') 910 c.increment(fields={'dev_server': hostname}) 911 raise 912 response = result.stdout 913 914 # If the curl command's returned HTTP response contains certain 915 # exception string, raise the DevServerException of the response. 916 if 'DownloaderException' in response: 917 raise DevServerException(_strip_http_message(response)) 918 919 if readline: 920 # Remove line terminators and trailing whitespace 921 response = response.splitlines() 922 return [line.rstrip() for line in response] 923 924 return response 925 926 927 @classmethod 928 def run_call(cls, call, readline=False, timeout=None): 929 """Invoke a given devserver call using urllib.open or ssh. 930 931 Open the URL with HTTP or SSH-based HTTP, and return the text of the 932 response. Exceptions may be raised as for urllib2.urlopen() or 933 utils.run(). 934 935 @param call: a url string that calls a method to a devserver. 936 @param readline: whether read http response line by line. 937 @param timeout: The timeout seconds for urlopen call or ssh call. 938 939 @return the results of this call. 940 """ 941 if not ENABLE_SSH_CONNECTION_FOR_DEVSERVER: 942 return super(ImageServerBase, cls).run_call( 943 call, readline=readline, timeout=timeout) 944 else: 945 return cls.run_ssh_call( 946 call, readline=readline, timeout=timeout) 947 948 949 @classmethod 950 def download_file(cls, remote_file, local_file, timeout=None): 951 """Download file from devserver. 952 953 The format of remote_file should be: 954 http://devserver_ip:8082/static/board/... 955 956 @param remote_file: The URL of the file on devserver that need to be 957 downloaded. 958 @param local_file: The path of the file saved to local. 959 @param timeout: The timeout seconds for this call. 960 """ 961 response = cls.run_call(remote_file, timeout=timeout) 962 with open(local_file, 'w') as out_log: 963 out_log.write(response) 964 965 966 def _poll_is_staged(self, **kwargs): 967 """Polling devserver.is_staged until all artifacts are staged. 968 969 @param kwargs: keyword arguments to make is_staged devserver call. 970 971 @return: True if all artifacts are staged in devserver. 972 """ 973 call = self.build_call('is_staged', **kwargs) 974 975 def all_staged(): 976 """Call devserver.is_staged rpc to check if all files are staged. 977 978 @return: True if all artifacts are staged in devserver. False 979 otherwise. 980 @rasies DevServerException, the exception is a wrapper of all 981 exceptions that were raised when devserver tried to download 982 the artifacts. devserver raises an HTTPError or a CmdError 983 when an exception was raised in the code. Such exception 984 should be re-raised here to stop the caller from waiting. 985 If the call to devserver failed for connection issue, a 986 URLError exception is raised, and caller should retry the 987 call to avoid such network flakiness. 988 989 """ 990 try: 991 result = self.run_call(call) 992 logging.debug('whether artifact is staged: %r', result) 993 return result == 'True' 994 except urllib2.HTTPError as e: 995 error_markup = e.read() 996 raise DevServerException(_strip_http_message(error_markup)) 997 except urllib2.URLError as e: 998 # Could be connection issue, retry it. 999 # For example: <urlopen error [Errno 111] Connection refused> 1000 logging.error('URLError happens in is_stage: %r', e) 1001 return False 1002 except error.CmdError as e: 1003 # Retry if SSH failed to connect to the devserver. 1004 logging.warning('CmdError happens in is_stage: %r, will retry', e) 1005 return False 1006 1007 bin_utils.poll_for_condition( 1008 all_staged, 1009 exception=bin_utils.TimeoutError(), 1010 timeout=DEVSERVER_IS_STAGING_RETRY_MIN * 60, 1011 sleep_interval=_ARTIFACT_STAGE_POLLING_INTERVAL) 1012 1013 return True 1014 1015 1016 def _call_and_wait(self, call_name, error_message, 1017 expected_response=SUCCESS, **kwargs): 1018 """Helper method to make a urlopen call, and wait for artifacts staged. 1019 1020 @param call_name: name of devserver rpc call. 1021 @param error_message: Error message to be thrown if response does not 1022 match expected_response. 1023 @param expected_response: Expected response from rpc, default to 1024 |Success|. If it's set to None, do not compare 1025 the actual response. Any response is consider 1026 to be good. 1027 @param kwargs: keyword arguments to make is_staged devserver call. 1028 1029 @return: The response from rpc. 1030 @raise DevServerException upon any return code that's expected_response. 1031 1032 """ 1033 call = self.build_call(call_name, async=True, **kwargs) 1034 try: 1035 response = self.run_call(call) 1036 logging.debug('response for RPC: %r', response) 1037 if ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE in response: 1038 logging.debug('Proxy error happens in RPC call, ' 1039 'will retry in 30 seconds') 1040 time.sleep(30) 1041 raise DevServerOverloadException() 1042 except httplib.BadStatusLine as e: 1043 logging.error(e) 1044 raise DevServerException('Received Bad Status line, Devserver %s ' 1045 'might have gone down while handling ' 1046 'the call: %s' % (self.url(), call)) 1047 1048 if expected_response and not response == expected_response: 1049 raise DevServerException(error_message) 1050 1051 # `os_type` is needed in build a devserver call, but not needed for 1052 # wait_for_artifacts_staged, since that method is implemented by 1053 # each ImageServerBase child class. 1054 if 'os_type' in kwargs: 1055 del kwargs['os_type'] 1056 self.wait_for_artifacts_staged(**kwargs) 1057 return response 1058 1059 1060 def _stage_artifacts(self, build, artifacts, files, archive_url, **kwargs): 1061 """Tell the devserver to download and stage |artifacts| from |image| 1062 specified by kwargs. 1063 1064 This is the main call point for staging any specific artifacts for a 1065 given build. To see the list of artifacts one can stage see: 1066 1067 ~src/platfrom/dev/artifact_info.py. 1068 1069 This is maintained along with the actual devserver code. 1070 1071 @param artifacts: A list of artifacts. 1072 @param files: A list of files to stage. 1073 @param archive_url: Optional parameter that has the archive_url to stage 1074 this artifact from. Default is specified in autotest config + 1075 image. 1076 @param kwargs: keyword arguments that specify the build information, to 1077 make stage devserver call. 1078 1079 @raise DevServerException upon any return code that's not HTTP OK. 1080 """ 1081 if not archive_url: 1082 archive_url = _get_storage_server_for_artifacts(artifacts) + build 1083 1084 artifacts_arg = ','.join(artifacts) if artifacts else '' 1085 files_arg = ','.join(files) if files else '' 1086 error_message = ("staging %s for %s failed;" 1087 "HTTP OK not accompanied by 'Success'." % 1088 ('artifacts=%s files=%s ' % (artifacts_arg, files_arg), 1089 build)) 1090 1091 staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' % 1092 (build, artifacts, files, archive_url)) 1093 logging.info('Staging artifacts on devserver %s: %s', 1094 self.url(), staging_info) 1095 success = False 1096 try: 1097 arguments = {'archive_url': archive_url, 1098 'artifacts': artifacts_arg, 1099 'files': files_arg} 1100 if kwargs: 1101 arguments.update(kwargs) 1102 # TODO(akeshet): canonicalize artifacts_arg before using it as a 1103 # metric field (as it stands it is a not-very-well-controlled 1104 # string). 1105 f = {'artifacts': artifacts_arg, 1106 'dev_server': self.resolved_hostname} 1107 with metrics.SecondsTimer( 1108 'chromeos/autotest/devserver/stage_artifact_duration', 1109 fields=f): 1110 self.call_and_wait(call_name='stage', error_message=error_message, 1111 **arguments) 1112 logging.info('Finished staging artifacts: %s', staging_info) 1113 success = True 1114 except (bin_utils.TimeoutError, error.TimeoutException): 1115 logging.error('stage_artifacts timed out: %s', staging_info) 1116 raise DevServerException( 1117 'stage_artifacts timed out: %s' % staging_info) 1118 finally: 1119 f = {'success': success, 1120 'artifacts': artifacts_arg, 1121 'dev_server': self.resolved_hostname} 1122 metrics.Counter('chromeos/autotest/devserver/stage_artifact' 1123 ).increment(fields=f) 1124 1125 1126 def call_and_wait(self, *args, **kwargs): 1127 """Helper method to make a urlopen call, and wait for artifacts staged. 1128 1129 This method needs to be overridden in the subclass to implement the 1130 logic to call _call_and_wait. 1131 """ 1132 raise NotImplementedError 1133 1134 1135 def _trigger_download(self, build, artifacts, files, synchronous=True, 1136 **kwargs_build_info): 1137 """Tell the devserver to download and stage image specified in 1138 kwargs_build_info. 1139 1140 Tells the devserver to fetch |image| from the image storage server 1141 named by _get_image_storage_server(). 1142 1143 If |synchronous| is True, waits for the entire download to finish 1144 staging before returning. Otherwise only the artifacts necessary 1145 to start installing images onto DUT's will be staged before returning. 1146 A caller can then call finish_download to guarantee the rest of the 1147 artifacts have finished staging. 1148 1149 @param synchronous: if True, waits until all components of the image are 1150 staged before returning. 1151 @param kwargs_build_info: Dictionary of build information. 1152 For CrOS, it is None as build is the CrOS image name. 1153 For Android, it is {'target': target, 1154 'build_id': build_id, 1155 'branch': branch} 1156 1157 @raise DevServerException upon any return code that's not HTTP OK. 1158 1159 """ 1160 if kwargs_build_info: 1161 archive_url = None 1162 else: 1163 archive_url = _get_image_storage_server() + build 1164 error_message = ("trigger_download for %s failed;" 1165 "HTTP OK not accompanied by 'Success'." % build) 1166 kwargs = {'archive_url': archive_url, 1167 'artifacts': artifacts, 1168 'files': files, 1169 'error_message': error_message} 1170 if kwargs_build_info: 1171 kwargs.update(kwargs_build_info) 1172 1173 logging.info('trigger_download starts for %s', build) 1174 try: 1175 response = self.call_and_wait(call_name='stage', **kwargs) 1176 logging.info('trigger_download finishes for %s', build) 1177 except (bin_utils.TimeoutError, error.TimeoutException): 1178 logging.error('trigger_download timed out for %s.', build) 1179 raise DevServerException( 1180 'trigger_download timed out for %s.' % build) 1181 was_successful = response == SUCCESS 1182 if was_successful and synchronous: 1183 self._finish_download(build, artifacts, files, **kwargs_build_info) 1184 1185 1186 def _finish_download(self, build, artifacts, files, **kwargs_build_info): 1187 """Tell the devserver to finish staging image specified in 1188 kwargs_build_info. 1189 1190 If trigger_download is called with synchronous=False, it will return 1191 before all artifacts have been staged. This method contacts the 1192 devserver and blocks until all staging is completed and should be 1193 called after a call to trigger_download. 1194 1195 @param kwargs_build_info: Dictionary of build information. 1196 For CrOS, it is None as build is the CrOS image name. 1197 For Android, it is {'target': target, 1198 'build_id': build_id, 1199 'branch': branch} 1200 1201 @raise DevServerException upon any return code that's not HTTP OK. 1202 """ 1203 archive_url = _get_image_storage_server() + build 1204 error_message = ("finish_download for %s failed;" 1205 "HTTP OK not accompanied by 'Success'." % build) 1206 kwargs = {'archive_url': archive_url, 1207 'artifacts': artifacts, 1208 'files': files, 1209 'error_message': error_message} 1210 if kwargs_build_info: 1211 kwargs.update(kwargs_build_info) 1212 try: 1213 self.call_and_wait(call_name='stage', **kwargs) 1214 except (bin_utils.TimeoutError, error.TimeoutException): 1215 logging.error('finish_download timed out for %s', build) 1216 raise DevServerException( 1217 'finish_download timed out for %s.' % build) 1218 1219 1220 @remote_devserver_call() 1221 def locate_file(self, file_name, artifacts, build, build_info): 1222 """Locate a file with the given file_name on devserver. 1223 1224 This method calls devserver RPC `locate_file` to look up a file with 1225 the given file name inside specified build artifacts. 1226 1227 @param file_name: Name of the file to look for a file. 1228 @param artifacts: A list of artifact names to search for the file. 1229 @param build: Name of the build. For Android, it's None as build_info 1230 should be used. 1231 @param build_info: Dictionary of build information. 1232 For CrOS, it is None as build is the CrOS image name. 1233 For Android, it is {'target': target, 1234 'build_id': build_id, 1235 'branch': branch} 1236 1237 @return: A devserver url to the file. 1238 @raise DevServerException upon any return code that's not HTTP OK. 1239 """ 1240 if not build and not build_info: 1241 raise DevServerException('You must specify build information to ' 1242 'look for file %s in artifacts %s.' % 1243 (file_name, artifacts)) 1244 kwargs = {'file_name': file_name, 1245 'artifacts': artifacts} 1246 if build_info: 1247 build_path = '%(branch)s/%(target)s/%(build_id)s' % build_info 1248 kwargs.update(build_info) 1249 # Devserver treats Android and Brillo build in the same way as they 1250 # are both retrieved from Launch Control and have similar build 1251 # artifacts. Therefore, os_type for devserver calls is `android` for 1252 # both Android and Brillo builds. 1253 kwargs['os_type'] = 'android' 1254 else: 1255 build_path = build 1256 kwargs['build'] = build 1257 call = self.build_call('locate_file', async=False, **kwargs) 1258 try: 1259 file_path = self.run_call(call) 1260 return os.path.join(self.url(), 'static', build_path, file_path) 1261 except httplib.BadStatusLine as e: 1262 logging.error(e) 1263 raise DevServerException('Received Bad Status line, Devserver %s ' 1264 'might have gone down while handling ' 1265 'the call: %s' % (self.url(), call)) 1266 1267 1268 @remote_devserver_call() 1269 def list_control_files(self, build, suite_name=''): 1270 """Ask the devserver to list all control files for |build|. 1271 1272 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514) 1273 whose control files the caller wants listed. 1274 @param suite_name: The name of the suite for which we require control 1275 files. 1276 @return None on failure, or a list of control file paths 1277 (e.g. server/site_tests/autoupdate/control) 1278 @raise DevServerException upon any return code that's not HTTP OK. 1279 """ 1280 build = self.translate(build) 1281 call = self.build_call('controlfiles', build=build, 1282 suite_name=suite_name) 1283 return self.run_call(call, readline=True) 1284 1285 1286 @remote_devserver_call() 1287 def get_control_file(self, build, control_path): 1288 """Ask the devserver for the contents of a control file. 1289 1290 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514) 1291 whose control file the caller wants to fetch. 1292 @param control_path: The file to fetch 1293 (e.g. server/site_tests/autoupdate/control) 1294 @return The contents of the desired file. 1295 @raise DevServerException upon any return code that's not HTTP OK. 1296 """ 1297 build = self.translate(build) 1298 call = self.build_call('controlfiles', build=build, 1299 control_path=control_path) 1300 return self.run_call(call) 1301 1302 1303 @remote_devserver_call() 1304 def list_suite_controls(self, build, suite_name=''): 1305 """Ask the devserver to list contents of all control files for |build|. 1306 1307 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514) 1308 whose control files' contents the caller wants returned. 1309 @param suite_name: The name of the suite for which we require control 1310 files. 1311 @return None on failure, or a dict of contents of all control files 1312 (e.g. {'path1': "#Copyright controls ***", ..., 1313 pathX': "#Copyright controls ***"} 1314 @raise DevServerException upon any return code that's not HTTP OK. 1315 """ 1316 build = self.translate(build) 1317 call = self.build_call('list_suite_controls', build=build, 1318 suite_name=suite_name) 1319 return json.load(cStringIO.StringIO(self.run_call(call))) 1320 1321 1322class ImageServer(ImageServerBase): 1323 """Class for DevServer that handles RPCs related to CrOS images. 1324 1325 The calls to devserver to stage artifacts, including stage and download, are 1326 made in async mode. That is, when caller makes an RPC |stage| to request 1327 devserver to stage certain artifacts, devserver handles the call and starts 1328 staging artifacts in a new thread, and return |Success| without waiting for 1329 staging being completed. When caller receives message |Success|, it polls 1330 devserver's is_staged call until all artifacts are staged. 1331 Such mechanism is designed to prevent cherrypy threads in devserver being 1332 running out, as staging artifacts might take long time, and cherrypy starts 1333 with a fixed number of threads that handle devserver rpc. 1334 """ 1335 1336 class ArtifactUrls(object): 1337 """A container for URLs of staged artifacts. 1338 1339 Attributes: 1340 full_payload: URL for downloading a staged full release update 1341 mton_payload: URL for downloading a staged M-to-N release update 1342 nton_payload: URL for downloading a staged N-to-N release update 1343 1344 """ 1345 def __init__(self, full_payload=None, mton_payload=None, 1346 nton_payload=None): 1347 self.full_payload = full_payload 1348 self.mton_payload = mton_payload 1349 self.nton_payload = nton_payload 1350 1351 1352 def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''): 1353 """Polling devserver.is_staged until all artifacts are staged. 1354 1355 @param archive_url: Google Storage URL for the build. 1356 @param artifacts: Comma separated list of artifacts to download. 1357 @param files: Comma separated list of files to download. 1358 @return: True if all artifacts are staged in devserver. 1359 """ 1360 kwargs = {'archive_url': archive_url, 1361 'artifacts': artifacts, 1362 'files': files} 1363 return self._poll_is_staged(**kwargs) 1364 1365 1366 @remote_devserver_call() 1367 def call_and_wait(self, call_name, archive_url, artifacts, files, 1368 error_message, expected_response=SUCCESS): 1369 """Helper method to make a urlopen call, and wait for artifacts staged. 1370 1371 @param call_name: name of devserver rpc call. 1372 @param archive_url: Google Storage URL for the build.. 1373 @param artifacts: Comma separated list of artifacts to download. 1374 @param files: Comma separated list of files to download. 1375 @param expected_response: Expected response from rpc, default to 1376 |Success|. If it's set to None, do not compare 1377 the actual response. Any response is consider 1378 to be good. 1379 @param error_message: Error message to be thrown if response does not 1380 match expected_response. 1381 1382 @return: The response from rpc. 1383 @raise DevServerException upon any return code that's expected_response. 1384 1385 """ 1386 kwargs = {'archive_url': archive_url, 1387 'artifacts': artifacts, 1388 'files': files} 1389 return self._call_and_wait(call_name, error_message, 1390 expected_response, **kwargs) 1391 1392 1393 @remote_devserver_call() 1394 def stage_artifacts(self, image=None, artifacts=None, files='', 1395 archive_url=None): 1396 """Tell the devserver to download and stage |artifacts| from |image|. 1397 1398 This is the main call point for staging any specific artifacts for a 1399 given build. To see the list of artifacts one can stage see: 1400 1401 ~src/platfrom/dev/artifact_info.py. 1402 1403 This is maintained along with the actual devserver code. 1404 1405 @param image: the image to fetch and stage. 1406 @param artifacts: A list of artifacts. 1407 @param files: A list of files to stage. 1408 @param archive_url: Optional parameter that has the archive_url to stage 1409 this artifact from. Default is specified in autotest config + 1410 image. 1411 1412 @raise DevServerException upon any return code that's not HTTP OK. 1413 """ 1414 if not artifacts and not files: 1415 raise DevServerException('Must specify something to stage.') 1416 image = self.translate(image) 1417 self._stage_artifacts(image, artifacts, files, archive_url) 1418 1419 1420 @remote_devserver_call(timeout_min=DEVSERVER_SSH_TIMEOUT_MINS) 1421 def list_image_dir(self, image): 1422 """List the contents of the image stage directory, on the devserver. 1423 1424 @param image: The image name, eg: <board>-<branch>/<Milestone>-<build>. 1425 1426 @raise DevServerException upon any return code that's not HTTP OK. 1427 """ 1428 image = self.translate(image) 1429 logging.info('Requesting contents from devserver %s for image %s', 1430 self.url(), image) 1431 archive_url = _get_storage_server_for_artifacts() + image 1432 call = self.build_call('list_image_dir', archive_url=archive_url) 1433 response = self.run_call(call, readline=True) 1434 for line in response: 1435 logging.info(line) 1436 1437 1438 def trigger_download(self, image, synchronous=True): 1439 """Tell the devserver to download and stage |image|. 1440 1441 Tells the devserver to fetch |image| from the image storage server 1442 named by _get_image_storage_server(). 1443 1444 If |synchronous| is True, waits for the entire download to finish 1445 staging before returning. Otherwise only the artifacts necessary 1446 to start installing images onto DUT's will be staged before returning. 1447 A caller can then call finish_download to guarantee the rest of the 1448 artifacts have finished staging. 1449 1450 @param image: the image to fetch and stage. 1451 @param synchronous: if True, waits until all components of the image are 1452 staged before returning. 1453 1454 @raise DevServerException upon any return code that's not HTTP OK. 1455 1456 """ 1457 image = self.translate(image) 1458 artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE 1459 self._trigger_download(image, artifacts, files='', 1460 synchronous=synchronous) 1461 1462 1463 @remote_devserver_call() 1464 def setup_telemetry(self, build): 1465 """Tell the devserver to setup telemetry for this build. 1466 1467 The devserver will stage autotest and then extract the required files 1468 for telemetry. 1469 1470 @param build: the build to setup telemetry for. 1471 1472 @returns path on the devserver that telemetry is installed to. 1473 """ 1474 build = self.translate(build) 1475 archive_url = _get_image_storage_server() + build 1476 call = self.build_call('setup_telemetry', archive_url=archive_url) 1477 try: 1478 response = self.run_call(call) 1479 except httplib.BadStatusLine as e: 1480 logging.error(e) 1481 raise DevServerException('Received Bad Status line, Devserver %s ' 1482 'might have gone down while handling ' 1483 'the call: %s' % (self.url(), call)) 1484 return response 1485 1486 1487 def finish_download(self, image): 1488 """Tell the devserver to finish staging |image|. 1489 1490 If trigger_download is called with synchronous=False, it will return 1491 before all artifacts have been staged. This method contacts the 1492 devserver and blocks until all staging is completed and should be 1493 called after a call to trigger_download. 1494 1495 @param image: the image to fetch and stage. 1496 @raise DevServerException upon any return code that's not HTTP OK. 1497 """ 1498 image = self.translate(image) 1499 artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST 1500 self._finish_download(image, artifacts, files='') 1501 1502 1503 def get_update_url(self, image): 1504 """Returns the url that should be passed to the updater. 1505 1506 @param image: the image that was fetched. 1507 """ 1508 image = self.translate(image) 1509 url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern', 1510 type=str) 1511 return (url_pattern % (self.url(), image)) 1512 1513 1514 def get_staged_file_url(self, filename, image): 1515 """Returns the url of a staged file for this image on the devserver.""" 1516 return '/'.join([self._get_image_url(image), filename]) 1517 1518 1519 def get_full_payload_url(self, image): 1520 """Returns a URL to a staged full payload. 1521 1522 @param image: the image that was fetched. 1523 1524 @return A fully qualified URL that can be used for downloading the 1525 payload. 1526 1527 """ 1528 return self._get_image_url(image) + '/update.gz' 1529 1530 1531 def get_test_image_url(self, image): 1532 """Returns a URL to a staged test image. 1533 1534 @param image: the image that was fetched. 1535 1536 @return A fully qualified URL that can be used for downloading the 1537 image. 1538 1539 """ 1540 return self._get_image_url(image) + '/chromiumos_test_image.bin' 1541 1542 1543 @remote_devserver_call() 1544 def get_dependencies_file(self, build): 1545 """Ask the dev server for the contents of the suite dependencies file. 1546 1547 Ask the dev server at |self._dev_server| for the contents of the 1548 pre-processed suite dependencies file (at DEPENDENCIES_FILE) 1549 for |build|. 1550 1551 @param build: The build (e.g. x86-mario-release/R21-2333.0.0) 1552 whose dependencies the caller is interested in. 1553 @return The contents of the dependencies file, which should eval to 1554 a dict of dicts, as per bin_utils/suite_preprocessor.py. 1555 @raise DevServerException upon any return code that's not HTTP OK. 1556 """ 1557 build = self.translate(build) 1558 call = self.build_call('controlfiles', 1559 build=build, control_path=DEPENDENCIES_FILE) 1560 return self.run_call(call) 1561 1562 1563 @remote_devserver_call() 1564 def get_latest_build_in_gs(self, board): 1565 """Ask the devservers for the latest offical build in Google Storage. 1566 1567 @param board: The board for who we want the latest official build. 1568 @return A string of the returned build rambi-release/R37-5868.0.0 1569 @raise DevServerException upon any return code that's not HTTP OK. 1570 """ 1571 call = self.build_call( 1572 'xbuddy_translate/remote/%s/latest-official' % board, 1573 image_dir=_get_image_storage_server()) 1574 image_name = self.run_call(call) 1575 return os.path.dirname(image_name) 1576 1577 1578 def translate(self, build_name): 1579 """Translate the build name if it's in LATEST format. 1580 1581 If the build name is in the format [builder]/LATEST, return the latest 1582 build in Google Storage otherwise return the build name as is. 1583 1584 @param build_name: build_name to check. 1585 1586 @return The actual build name to use. 1587 """ 1588 match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I) 1589 if not match: 1590 return build_name 1591 translated_build = self.get_latest_build_in_gs(match.groups()[0]) 1592 logging.debug('Translated relative build %s to %s', build_name, 1593 translated_build) 1594 return translated_build 1595 1596 1597 @classmethod 1598 @remote_devserver_call() 1599 def get_latest_build(cls, target, milestone=''): 1600 """Ask all the devservers for the latest build for a given target. 1601 1602 @param target: The build target, typically a combination of the board 1603 and the type of build e.g. x86-mario-release. 1604 @param milestone: For latest build set to '', for builds only in a 1605 specific milestone set to a str of format Rxx 1606 (e.g. R16). Default: ''. Since we are dealing with a 1607 webserver sending an empty string, '', ensures that 1608 the variable in the URL is ignored as if it was set 1609 to None. 1610 @return A string of the returned build e.g. R20-2226.0.0. 1611 @raise DevServerException upon any return code that's not HTTP OK. 1612 """ 1613 calls = cls.build_all_calls('latestbuild', target=target, 1614 milestone=milestone) 1615 latest_builds = [] 1616 for call in calls: 1617 latest_builds.append(cls.run_call(call)) 1618 1619 return max(latest_builds, key=version.LooseVersion) 1620 1621 1622 @remote_devserver_call() 1623 def _kill_au_process_for_host(self, **kwargs): 1624 """Kill the triggerred auto_update process if error happens in cros_au. 1625 1626 @param kwargs: Arguments to make kill_au_proc devserver call. 1627 """ 1628 call = self.build_call('kill_au_proc', **kwargs) 1629 response = self.run_call(call) 1630 if not response == 'True': 1631 raise DevServerException( 1632 'Failed to kill the triggerred CrOS auto_update process' 1633 'on devserver %s, the response is %s' % ( 1634 self.url(), response)) 1635 1636 1637 def kill_au_process_for_host(self, host_name, pid): 1638 """Kill the triggerred auto_update process if error happens. 1639 1640 Usually this function is used to clear all potential left au processes 1641 of the given host name. 1642 1643 If pid is specified, the devserver will further check the given pid to 1644 make sure the process is killed. This is used for the case that the au 1645 process has started in background, but then provision fails due to 1646 some unknown issues very fast. In this case, when 'kill_au_proc' is 1647 called, there's no corresponding background track log created for this 1648 ongoing au process, which prevents this RPC call from killing this au 1649 process. 1650 1651 @param host_name: The DUT's hostname. 1652 @param pid: The ongoing au process's pid. 1653 1654 @return: True if successfully kill the auto-update process for host. 1655 """ 1656 kwargs = {'host_name': host_name, 'pid': pid} 1657 try: 1658 self._kill_au_process_for_host(**kwargs) 1659 except DevServerException: 1660 return False 1661 1662 return True 1663 1664 1665 @remote_devserver_call() 1666 def _clean_track_log(self, **kwargs): 1667 """Clean track log for the current auto-update process.""" 1668 call = self.build_call('handler_cleanup', **kwargs) 1669 self.run_call(call) 1670 1671 1672 def clean_track_log(self, host_name, pid): 1673 """Clean track log for the current auto-update process. 1674 1675 @param host_name: The host name to be updated. 1676 @param pid: The auto-update process id. 1677 1678 @return: True if track log is successfully cleaned, False otherwise. 1679 """ 1680 if not pid: 1681 return False 1682 1683 kwargs = {'host_name': host_name, 'pid': pid} 1684 try: 1685 self._clean_track_log(**kwargs) 1686 except DevServerException as e: 1687 logging.debug('Failed to clean track_status_file on ' 1688 'devserver for host %s and process id %s: %s', 1689 host_name, pid, str(e)) 1690 return False 1691 1692 return True 1693 1694 1695 def _get_au_log_filename(self, log_dir, host_name, pid): 1696 """Return the auto-update log's filename.""" 1697 return os.path.join(log_dir, CROS_AU_LOG_FILENAME % ( 1698 host_name, pid)) 1699 1700 @remote_devserver_call() 1701 def _collect_au_log(self, log_dir, **kwargs): 1702 """Collect logs from devserver after cros-update process is finished. 1703 1704 Collect the logs that recording the whole cros-update process, and 1705 write it to sysinfo path of a job. 1706 1707 The example log file name that is stored is like: 1708 '1220-repair/sysinfo/CrOS_update_host_name_pid.log' 1709 1710 @param host_name: the DUT's hostname. 1711 @param pid: the auto-update process id on devserver. 1712 @param log_dir: The directory to save the cros-update process log 1713 retrieved from devserver. 1714 """ 1715 call = self.build_call('collect_cros_au_log', **kwargs) 1716 response = self.run_call(call) 1717 if not os.path.exists(log_dir): 1718 os.mkdir(log_dir) 1719 write_file = self._get_au_log_filename( 1720 log_dir, kwargs['host_name'], kwargs['pid']) 1721 logging.debug('Saving auto-update logs into %s', write_file) 1722 try: 1723 with open(write_file, 'w') as out_log: 1724 out_log.write(response) 1725 except: 1726 raise DevServerException('Failed to write auto-update logs into ' 1727 '%s' % write_file) 1728 1729 1730 def collect_au_log(self, host_name, pid, log_dir): 1731 """Collect logs from devserver after cros-update process is finished. 1732 1733 @param host_name: the DUT's hostname. 1734 @param pid: the auto-update process id on devserver. 1735 @param log_dir: The directory to save the cros-update process log 1736 retrieved from devserver. 1737 1738 @return: True if auto-update log is successfully collected, False 1739 otherwise. 1740 """ 1741 if not pid: 1742 return False 1743 1744 kwargs = {'host_name': host_name, 'pid': pid} 1745 try: 1746 self._collect_au_log(log_dir, **kwargs) 1747 except DevServerException as e: 1748 logging.debug('Failed to collect auto-update log on ' 1749 'devserver for host %s and process id %s: %s', 1750 host_name, pid, str(e)) 1751 return False 1752 1753 return True 1754 1755 1756 @remote_devserver_call() 1757 def _trigger_auto_update(self, **kwargs): 1758 """Trigger auto-update by calling devserver.cros_au. 1759 1760 @param kwargs: Arguments to make cros_au devserver call. 1761 1762 @return: a tuple indicates whether the RPC call cros_au succeeds and 1763 the auto-update process id running on devserver. 1764 """ 1765 host_name = kwargs['host_name'] 1766 call = self.build_call('cros_au', async=True, **kwargs) 1767 try: 1768 response = self.run_call(call) 1769 logging.info( 1770 'Received response from devserver for cros_au call: %r', 1771 response) 1772 except httplib.BadStatusLine as e: 1773 logging.error(e) 1774 raise DevServerException('Received Bad Status line, Devserver %s ' 1775 'might have gone down while handling ' 1776 'the call: %s' % (self.url(), call)) 1777 1778 return response 1779 1780 1781 def _wait_for_auto_update_finished(self, pid, **kwargs): 1782 """Polling devserver.get_au_status to get current auto-update status. 1783 1784 The current auto-update status is used to identify whether the update 1785 process is finished. 1786 1787 @param pid: The background process id for auto-update in devserver. 1788 @param kwargs: keyword arguments to make get_au_status devserver call. 1789 1790 @return: True if auto-update is finished for a given dut. 1791 """ 1792 logging.debug('Check the progress for auto-update process %r', pid) 1793 kwargs['pid'] = pid 1794 call = self.build_call('get_au_status', **kwargs) 1795 1796 def all_finished(): 1797 """Call devserver.get_au_status rpc to check if auto-update 1798 is finished. 1799 1800 @return: True if auto-update is finished for a given dut. False 1801 otherwise. 1802 @rasies DevServerException, the exception is a wrapper of all 1803 exceptions that were raised when devserver tried to 1804 download the artifacts. devserver raises an HTTPError or 1805 a CmdError when an exception was raised in the code. Such 1806 exception should be re-raised here to stop the caller from 1807 waiting. If the call to devserver failed for connection 1808 issue, a URLError exception is raised, and caller should 1809 retry the call to avoid such network flakiness. 1810 1811 """ 1812 try: 1813 au_status = self.run_call(call) 1814 response = json.loads(au_status) 1815 # This is a temp fix to fit both dict and tuple returning 1816 # values. The dict check will be removed after a corresponding 1817 # devserver CL is deployed. 1818 if isinstance(response, dict): 1819 if response.get('detailed_error_msg'): 1820 raise DevServerException( 1821 response.get('detailed_error_msg')) 1822 1823 if response.get('finished'): 1824 logging.debug('CrOS auto-update is finished') 1825 return True 1826 else: 1827 logging.debug('Current CrOS auto-update status: %s', 1828 response.get('status')) 1829 return False 1830 1831 if not response[0]: 1832 logging.debug('Current CrOS auto-update status: %s', 1833 response[1]) 1834 return False 1835 else: 1836 logging.debug('CrOS auto-update is finished') 1837 return True 1838 except urllib2.HTTPError as e: 1839 error_markup = e.read() 1840 raise DevServerException(_strip_http_message(error_markup)) 1841 except urllib2.URLError as e: 1842 # Could be connection issue, retry it. 1843 # For example: <urlopen error [Errno 111] Connection refused> 1844 logging.warning('URLError (%r): Retrying connection to ' 1845 'devserver to check auto-update status.', e) 1846 return False 1847 except error.CmdError: 1848 # Retry if SSH failed to connect to the devserver. 1849 logging.warning('CmdError: Retrying SSH connection to check ' 1850 'auto-update status.') 1851 return False 1852 except socket.error as e: 1853 # Could be some temporary devserver connection issues. 1854 logging.warning('Socket Error (%r): Retrying connection to ' 1855 'devserver to check auto-update status.', e) 1856 return False 1857 except ValueError as e: 1858 raise DevServerException( 1859 '%s (Got AU status: %r)' % (str(e), au_status)) 1860 1861 bin_utils.poll_for_condition( 1862 all_finished, 1863 exception=bin_utils.TimeoutError(), 1864 timeout=DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN * 60, 1865 sleep_interval=CROS_AU_POLLING_INTERVAL) 1866 1867 return True 1868 1869 1870 def wait_for_auto_update_finished(self, response, **kwargs): 1871 """Processing response of 'cros_au' and polling for auto-update status. 1872 1873 Will wait for the whole auto-update process is finished. 1874 1875 @param response: The response from RPC 'cros_au' 1876 @param kwargs: keyword arguments to make get_au_status devserver call. 1877 1878 @return: a tuple includes two elements. 1879 raised_error: None if everything works well or the raised error. 1880 pid: the auto-update process id on devserver. 1881 """ 1882 1883 pid = 0 1884 raised_error = None 1885 try: 1886 response = json.loads(response) 1887 if response[0]: 1888 pid = response[1] 1889 logging.debug('start process %r for auto_update in devserver', 1890 pid) 1891 self._wait_for_auto_update_finished(pid, **kwargs) 1892 except Exception as e: 1893 logging.debug('Failed to trigger auto-update process on devserver') 1894 raised_error = e 1895 finally: 1896 return raised_error, pid 1897 1898 1899 def _parse_AU_error(self, response): 1900 """Parse auto_update error returned from devserver.""" 1901 return re.split('\n', response)[-1] 1902 1903 1904 def _classify_exceptions(self, error_list): 1905 """Parse the error that was raised from auto_update. 1906 1907 @param error_list: The list of errors (string) happened in auto-update 1908 1909 @return: A classified exception type (string) from _EXCEPTION_PATTERNS 1910 or 'Unknown exception'. Current patterns in _EXCEPTION_PATTERNS are 1911 very specific so that errors cannot match more than one pattern. 1912 """ 1913 raised_error = '' 1914 if not error_list: 1915 return raised_error 1916 else: 1917 target_error = error_list[0] 1918 1919 for err_pattern, classification in _EXCEPTION_PATTERNS: 1920 match = re.match(err_pattern, target_error) 1921 if match: 1922 return classification 1923 1924 return '(0) Unknown exception' 1925 1926 def _is_retryable(self, error_msg): 1927 """Detect whether we will retry auto-update based on error_msg. 1928 1929 @param error_msg: The given error message. 1930 1931 @return A boolean variable which indicates whether we will retry 1932 auto_update with another devserver based on the given error_msg. 1933 """ 1934 # For now we just hard-code the error message we think it's suspicious. 1935 # When we get more date about what's the json response when devserver 1936 # is overloaded, we can update this part. 1937 retryable_errors = ['No JSON object could be decoded', 1938 'is not pingable'] 1939 for err in retryable_errors: 1940 if err in error_msg: 1941 return True 1942 1943 return False 1944 1945 1946 def _parse_buildname_safely(self, build_name): 1947 """Parse a given buildname safely. 1948 1949 @param build_name: the build name to be parsed. 1950 1951 @return: a tuple (board, build_type, milestone) 1952 """ 1953 try: 1954 board, build_type, milestone, _ = server_utils.ParseBuildName( 1955 build_name) 1956 except server_utils.ParseBuildNameException: 1957 logging.warning('Unable to parse build name %s for metrics. ' 1958 'Continuing anyway.', build_name) 1959 board, build_type, milestone = ('', '', '') 1960 1961 return board, build_type, milestone 1962 1963 1964 def auto_update(self, host_name, build_name, original_board=None, 1965 original_release_version=None, log_dir=None, 1966 force_update=False, full_update=False): 1967 """Auto-update a CrOS host. 1968 1969 @param host_name: The hostname of the DUT to auto-update. 1970 @param build_name: The build name to be auto-updated on the DUT. 1971 @param original_board: The original board of the DUT to auto-update. 1972 @param original_release_version: The release version of the DUT's 1973 current build. 1974 @param log_dir: The log directory to store auto-update logs from 1975 devserver. 1976 @param force_update: Force an update even if the version installed 1977 is the same. Default: False. 1978 @param full_update: If True, do not run stateful update, directly 1979 force a full reimage. If False, try stateful 1980 update first if the dut is already installed 1981 with the same version. 1982 1983 @return A set (is_success, is_retryable) in which: 1984 1. is_success indicates whether this auto_update succeeds. 1985 2. is_retryable indicates whether we should retry auto_update if 1986 if it fails. 1987 1988 @raise DevServerException if auto_update fails and is not retryable. 1989 """ 1990 kwargs = {'host_name': host_name, 1991 'build_name': build_name, 1992 'force_update': force_update, 1993 'full_update': full_update} 1994 1995 error_msg = 'CrOS auto-update failed for host %s: %s' 1996 error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s' 1997 is_au_success = False 1998 au_log_dir = os.path.join(log_dir, 1999 AUTO_UPDATE_LOG_DIR) if log_dir else None 2000 error_list = [] 2001 retry_with_another_devserver = False 2002 board, build_type, milestone = self._parse_buildname_safely(build_name) 2003 2004 for au_attempt in range(AU_RETRY_LIMIT): 2005 logging.debug('Start CrOS auto-update for host %s at %d time(s).', 2006 host_name, au_attempt + 1) 2007 # No matter _trigger_auto_update succeeds or fails, the auto-update 2008 # track_status_file should be cleaned, and the auto-update execute 2009 # log should be collected to directory sysinfo. Also, the error 2010 # raised by _trigger_auto_update should be displayed. 2011 try: 2012 # Try update with stateful.tgz of old release version in the 2013 # last try of auto-update. 2014 if (au_attempt > 0 and au_attempt == AU_RETRY_LIMIT - 1 and 2015 original_release_version): 2016 # Monitor this case in monarch 2017 original_build = '%s/%s' % (original_board, 2018 original_release_version) 2019 c = metrics.Counter( 2020 'chromeos/autotest/provision/' 2021 'cros_update_with_original_build') 2022 f = {'dev_server': self.resolved_hostname, 2023 'board': board, 2024 'build_type': build_type, 2025 'milestone': milestone, 2026 'original_build': original_build} 2027 c.increment(fields=f) 2028 2029 logging.debug('Try updating stateful partition of the ' 2030 'host with the same version of its current ' 2031 'rootfs partition: %s', original_build) 2032 response = self._trigger_auto_update( 2033 original_build=original_build, **kwargs) 2034 else: 2035 response = self._trigger_auto_update(**kwargs) 2036 except DevServerException as e: 2037 logging.debug(error_msg_attempt, au_attempt+1, str(e)) 2038 error_list.append(str(e)) 2039 else: 2040 raised_error, pid = self.wait_for_auto_update_finished(response, 2041 **kwargs) 2042 # Error happens in _clean_track_log won't be raised. Auto-update 2043 # process will be retried. 2044 # TODO(xixuan): Change kwargs['host_name'] back to host_name 2045 # if crbug.com/651974 is fixed: host_name represents the host 2046 # name of the host, and kwargs['host_name'] could be host_name 2047 # or the IP of this host. 2048 is_clean_success = self.clean_track_log(kwargs['host_name'], pid) 2049 # Error happens in _collect_au_log won't be raised. Auto-update 2050 # process will be retried. 2051 if au_log_dir: 2052 is_collect_success = self.collect_au_log( 2053 kwargs['host_name'], pid, au_log_dir) 2054 else: 2055 is_collect_success = True 2056 # If any error is raised previously, log it and retry 2057 # auto-update. Otherwise, claim a success CrOS auto-update. 2058 if not raised_error and is_clean_success and is_collect_success: 2059 logging.debug('CrOS auto-update succeed for host %s', 2060 host_name) 2061 is_au_success = True 2062 break 2063 else: 2064 if not self.kill_au_process_for_host(kwargs['host_name'], 2065 pid): 2066 logging.debug('Failed to kill auto_update process %d', 2067 pid) 2068 if raised_error: 2069 logging.debug(error_msg_attempt, au_attempt+1, 2070 str(raised_error)) 2071 if au_log_dir: 2072 logging.debug('Please see error details in log %s', 2073 self._get_au_log_filename( 2074 au_log_dir, 2075 kwargs['host_name'], 2076 pid)) 2077 error_list.append(self._parse_AU_error(str(raised_error))) 2078 if self._is_retryable(str(raised_error)): 2079 retry_with_another_devserver = True 2080 2081 finally: 2082 if retry_with_another_devserver: 2083 break 2084 2085 if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1: 2086 time.sleep(CROS_AU_RETRY_INTERVAL) 2087 # TODO(kevcheng): Remove this once crbug.com/651974 is 2088 # fixed. 2089 # DNS is broken in the cassandra lab, so use the IP of the 2090 # hostname instead if it fails. Not rename host_name here 2091 # for error msg reporting. 2092 host_name_ip = socket.gethostbyname(host_name) 2093 kwargs['host_name'] = host_name_ip 2094 logging.debug( 2095 'AU failed, trying IP instead of hostname: %s', 2096 host_name_ip) 2097 2098 # Note: To avoid reaching or exceeding the monarch field cardinality 2099 # limit, we avoid a metric that includes both dut hostname and other 2100 # high cardinality fields. 2101 # Per-devserver cros_update metric. 2102 c = metrics.Counter( 2103 'chromeos/autotest/provision/cros_update_by_devserver') 2104 # Add a field |error| here. Current error's pattern is manually 2105 # specified in _EXCEPTION_PATTERNS. 2106 raised_error = self._classify_exceptions(error_list) 2107 f = {'dev_server': self.resolved_hostname, 2108 'success': is_au_success, 2109 'board': board, 2110 'build_type': build_type, 2111 'milestone': milestone, 2112 'error': raised_error} 2113 c.increment(fields=f) 2114 2115 # Per-DUT cros_update metric. 2116 c = metrics.Counter('chromeos/autotest/provision/cros_update_per_dut') 2117 f = {'success': is_au_success, 2118 'board': board, 2119 'error': raised_error, 2120 'dut_host_name': host_name} 2121 c.increment(fields=f) 2122 2123 if is_au_success or retry_with_another_devserver: 2124 return (is_au_success, retry_with_another_devserver) 2125 2126 # If errors happen in the CrOS AU process, report the first error 2127 # since the following errors might be caused by the first error. 2128 # If error happens in RPCs of cleaning track log, collecting 2129 # auto-update logs, or killing auto-update processes, just report 2130 # them together. 2131 if error_list: 2132 raise DevServerException(error_msg % (host_name, error_list[0])) 2133 else: 2134 raise DevServerException(error_msg % ( 2135 host_name, ('RPC calls after the whole auto-update ' 2136 'process failed.'))) 2137 2138 2139class AndroidBuildServer(ImageServerBase): 2140 """Class for DevServer that handles RPCs related to Android builds. 2141 2142 The calls to devserver to stage artifacts, including stage and download, are 2143 made in async mode. That is, when caller makes an RPC |stage| to request 2144 devserver to stage certain artifacts, devserver handles the call and starts 2145 staging artifacts in a new thread, and return |Success| without waiting for 2146 staging being completed. When caller receives message |Success|, it polls 2147 devserver's is_staged call until all artifacts are staged. 2148 Such mechanism is designed to prevent cherrypy threads in devserver being 2149 running out, as staging artifacts might take long time, and cherrypy starts 2150 with a fixed number of threads that handle devserver rpc. 2151 """ 2152 2153 def wait_for_artifacts_staged(self, target, build_id, branch, 2154 archive_url=None, artifacts='', files=''): 2155 """Polling devserver.is_staged until all artifacts are staged. 2156 2157 @param target: Target of the android build to stage, e.g., 2158 shamu-userdebug. 2159 @param build_id: Build id of the android build to stage. 2160 @param branch: Branch of the android build to stage. 2161 @param archive_url: Google Storage URL for the build. 2162 @param artifacts: Comma separated list of artifacts to download. 2163 @param files: Comma separated list of files to download. 2164 2165 @return: True if all artifacts are staged in devserver. 2166 """ 2167 kwargs = {'target': target, 2168 'build_id': build_id, 2169 'branch': branch, 2170 'artifacts': artifacts, 2171 'files': files, 2172 'os_type': 'android'} 2173 if archive_url: 2174 kwargs['archive_url'] = archive_url 2175 return self._poll_is_staged(**kwargs) 2176 2177 2178 @remote_devserver_call() 2179 def call_and_wait(self, call_name, target, build_id, branch, archive_url, 2180 artifacts, files, error_message, 2181 expected_response=SUCCESS): 2182 """Helper method to make a urlopen call, and wait for artifacts staged. 2183 2184 @param call_name: name of devserver rpc call. 2185 @param target: Target of the android build to stage, e.g., 2186 shamu-userdebug. 2187 @param build_id: Build id of the android build to stage. 2188 @param branch: Branch of the android build to stage. 2189 @param archive_url: Google Storage URL for the CrOS build. 2190 @param artifacts: Comma separated list of artifacts to download. 2191 @param files: Comma separated list of files to download. 2192 @param expected_response: Expected response from rpc, default to 2193 |Success|. If it's set to None, do not compare 2194 the actual response. Any response is consider 2195 to be good. 2196 @param error_message: Error message to be thrown if response does not 2197 match expected_response. 2198 2199 @return: The response from rpc. 2200 @raise DevServerException upon any return code that's expected_response. 2201 2202 """ 2203 kwargs = {'target': target, 2204 'build_id': build_id, 2205 'branch': branch, 2206 'artifacts': artifacts, 2207 'files': files, 2208 'os_type': 'android'} 2209 if archive_url: 2210 kwargs['archive_url'] = archive_url 2211 return self._call_and_wait(call_name, error_message, expected_response, 2212 **kwargs) 2213 2214 2215 @remote_devserver_call() 2216 def stage_artifacts(self, target=None, build_id=None, branch=None, 2217 image=None, artifacts=None, files='', archive_url=None): 2218 """Tell the devserver to download and stage |artifacts| from |image|. 2219 2220 This is the main call point for staging any specific artifacts for a 2221 given build. To see the list of artifacts one can stage see: 2222 2223 ~src/platfrom/dev/artifact_info.py. 2224 2225 This is maintained along with the actual devserver code. 2226 2227 @param target: Target of the android build to stage, e.g., 2228 shamu-userdebug. 2229 @param build_id: Build id of the android build to stage. 2230 @param branch: Branch of the android build to stage. 2231 @param image: Name of a build to test, in the format of 2232 branch/target/build_id 2233 @param artifacts: A list of artifacts. 2234 @param files: A list of files to stage. 2235 @param archive_url: Optional parameter that has the archive_url to stage 2236 this artifact from. Default is specified in autotest config + 2237 image. 2238 2239 @raise DevServerException upon any return code that's not HTTP OK. 2240 """ 2241 if image and not target and not build_id and not branch: 2242 branch, target, build_id = utils.parse_launch_control_build(image) 2243 if not target or not build_id or not branch: 2244 raise DevServerException('Must specify all build info (target, ' 2245 'build_id and branch) to stage.') 2246 2247 android_build_info = {'target': target, 2248 'build_id': build_id, 2249 'branch': branch} 2250 if not artifacts and not files: 2251 raise DevServerException('Must specify something to stage.') 2252 if not all(android_build_info.values()): 2253 raise DevServerException( 2254 'To stage an Android build, must specify target, build id ' 2255 'and branch.') 2256 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2257 self._stage_artifacts(build, artifacts, files, archive_url, 2258 **android_build_info) 2259 2260 def get_pull_url(self, target, build_id, branch): 2261 """Get the url to pull files from the devserver. 2262 2263 @param target: Target of the android build, e.g., shamu_userdebug 2264 @param build_id: Build id of the android build. 2265 @param branch: Branch of the android build. 2266 2267 @return A url to pull files from the dev server given a specific 2268 android build. 2269 """ 2270 return os.path.join(self.url(), 'static', branch, target, build_id) 2271 2272 2273 def trigger_download(self, target, build_id, branch, artifacts=None, 2274 files='', os='android', synchronous=True): 2275 """Tell the devserver to download and stage an Android build. 2276 2277 Tells the devserver to fetch an Android build from the image storage 2278 server named by _get_image_storage_server(). 2279 2280 If |synchronous| is True, waits for the entire download to finish 2281 staging before returning. Otherwise only the artifacts necessary 2282 to start installing images onto DUT's will be staged before returning. 2283 A caller can then call finish_download to guarantee the rest of the 2284 artifacts have finished staging. 2285 2286 @param target: Target of the android build to stage, e.g., 2287 shamu-userdebug. 2288 @param build_id: Build id of the android build to stage. 2289 @param branch: Branch of the android build to stage. 2290 @param artifacts: A string of artifacts separated by comma. If None, 2291 use the default artifacts for Android or Brillo build. 2292 @param files: String of file seperated by commas. 2293 @param os: OS artifacts to download (android/brillo). 2294 @param synchronous: if True, waits until all components of the image are 2295 staged before returning. 2296 2297 @raise DevServerException upon any return code that's not HTTP OK. 2298 2299 """ 2300 android_build_info = {'target': target, 2301 'build_id': build_id, 2302 'branch': branch} 2303 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2304 if not artifacts: 2305 board = target.split('-')[0] 2306 artifacts = ( 2307 android_utils.AndroidArtifacts.get_artifacts_for_reimage( 2308 board, os)) 2309 self._trigger_download(build, artifacts, files=files, 2310 synchronous=synchronous, **android_build_info) 2311 2312 2313 def finish_download(self, target, build_id, branch, os='android'): 2314 """Tell the devserver to finish staging an Android build. 2315 2316 If trigger_download is called with synchronous=False, it will return 2317 before all artifacts have been staged. This method contacts the 2318 devserver and blocks until all staging is completed and should be 2319 called after a call to trigger_download. 2320 2321 @param target: Target of the android build to stage, e.g., 2322 shamu-userdebug. 2323 @param build_id: Build id of the android build to stage. 2324 @param branch: Branch of the android build to stage. 2325 @param os: OS artifacts to download (android/brillo). 2326 2327 @raise DevServerException upon any return code that's not HTTP OK. 2328 """ 2329 android_build_info = {'target': target, 2330 'build_id': build_id, 2331 'branch': branch} 2332 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2333 board = target.split('-')[0] 2334 artifacts = ( 2335 android_utils.AndroidArtifacts.get_artifacts_for_reimage( 2336 board)) 2337 self._finish_download(build, artifacts, files='', **android_build_info) 2338 2339 2340 def get_staged_file_url(self, filename, target, build_id, branch): 2341 """Returns the url of a staged file for this image on the devserver. 2342 2343 @param filename: Name of the file. 2344 @param target: Target of the android build to stage, e.g., 2345 shamu-userdebug. 2346 @param build_id: Build id of the android build to stage. 2347 @param branch: Branch of the android build to stage. 2348 2349 @return: The url of a staged file for this image on the devserver. 2350 """ 2351 android_build_info = {'target': target, 2352 'build_id': build_id, 2353 'branch': branch, 2354 'os_type': 'android'} 2355 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2356 return '/'.join([self._get_image_url(build), filename]) 2357 2358 2359 @remote_devserver_call() 2360 def translate(self, build_name): 2361 """Translate the build name if it's in LATEST format. 2362 2363 If the build name is in the format [branch]/[target]/LATEST, return the 2364 latest build in Launch Control otherwise return the build name as is. 2365 2366 @param build_name: build_name to check. 2367 2368 @return The actual build name to use. 2369 """ 2370 branch, target, build_id = utils.parse_launch_control_build(build_name) 2371 if build_id.upper() != 'LATEST': 2372 return build_name 2373 call = self.build_call('latestbuild', branch=branch, target=target, 2374 os_type='android') 2375 translated_build_id = self.run_call(call) 2376 translated_build = (ANDROID_BUILD_NAME_PATTERN % 2377 {'branch': branch, 2378 'target': target, 2379 'build_id': translated_build_id}) 2380 logging.debug('Translated relative build %s to %s', build_name, 2381 translated_build) 2382 return translated_build 2383 2384 2385def _is_load_healthy(load): 2386 """Check if devserver's load meets the minimum threshold. 2387 2388 @param load: The devserver's load stats to check. 2389 2390 @return: True if the load meets the minimum threshold. Return False 2391 otherwise. 2392 2393 """ 2394 # Threshold checks, including CPU load. 2395 if load[DevServer.CPU_LOAD] > DevServer.MAX_CPU_LOAD: 2396 logging.debug('CPU load of devserver %s is at %s%%, which is higher ' 2397 'than the threshold of %s%%', load['devserver'], 2398 load[DevServer.CPU_LOAD], DevServer.MAX_CPU_LOAD) 2399 return False 2400 if load[DevServer.NETWORK_IO] > DevServer.MAX_NETWORK_IO: 2401 logging.debug('Network IO of devserver %s is at %i Bps, which is ' 2402 'higher than the threshold of %i bytes per second.', 2403 load['devserver'], load[DevServer.NETWORK_IO], 2404 DevServer.MAX_NETWORK_IO) 2405 return False 2406 return True 2407 2408 2409def _compare_load(devserver1, devserver2): 2410 """Comparator function to compare load between two devservers. 2411 2412 @param devserver1: A dictionary of devserver load stats to be compared. 2413 @param devserver2: A dictionary of devserver load stats to be compared. 2414 2415 @return: Negative value if the load of `devserver1` is less than the load 2416 of `devserver2`. Return positive value otherwise. 2417 2418 """ 2419 return int(devserver1[DevServer.DISK_IO] - devserver2[DevServer.DISK_IO]) 2420 2421 2422def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None): 2423 """Get the devserver with the least load. 2424 2425 Iterate through all devservers and get the one with least load. 2426 2427 TODO(crbug.com/486278): Devserver with required build already staged should 2428 take higher priority. This will need check_health call to be able to verify 2429 existence of a given build/artifact. Also, in case all devservers are 2430 overloaded, the logic here should fall back to the old behavior that randomly 2431 selects a devserver based on the hash of the image name/url. 2432 2433 @param devserver_type: Type of devserver to select from. Default is set to 2434 ImageServer. 2435 @param hostname: Hostname of the dut that the devserver is used for. The 2436 picked devserver needs to respect the location of the host if 2437 `prefer_local_devserver` is set to True or `restricted_subnets` is 2438 set. 2439 2440 @return: Name of the devserver with the least load. 2441 2442 """ 2443 devservers, can_retry = devserver_type.get_available_devservers( 2444 hostname) 2445 # If no healthy devservers available and can_retry is False, return None. 2446 # Otherwise, relax the constrain on hostname, allow all devservers to be 2447 # available. 2448 if not devserver_type.get_healthy_devserver('', devservers): 2449 if not can_retry: 2450 return None 2451 else: 2452 devservers, _ = devserver_type.get_available_devservers() 2453 2454 # get_devserver_load call needs to be made in a new process to allow force 2455 # timeout using signal. 2456 output = multiprocessing.Queue() 2457 processes = [] 2458 for devserver in devservers: 2459 processes.append(multiprocessing.Process( 2460 target=devserver_type.get_devserver_load_wrapper, 2461 args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output))) 2462 2463 for p in processes: 2464 p.start() 2465 for p in processes: 2466 p.join() 2467 loads = [output.get() for p in processes] 2468 # Filter out any load failed to be retrieved or does not support load check. 2469 loads = [load for load in loads if load and DevServer.CPU_LOAD in load and 2470 DevServer.is_free_disk_ok(load) and 2471 DevServer.is_apache_client_count_ok(load)] 2472 if not loads: 2473 logging.debug('Failed to retrieve load stats from any devserver. No ' 2474 'load balancing can be applied.') 2475 return None 2476 loads = [load for load in loads if _is_load_healthy(load)] 2477 if not loads: 2478 logging.error('No devserver has the capacity to be selected.') 2479 return None 2480 loads = sorted(loads, cmp=_compare_load) 2481 return loads[0]['devserver'] 2482 2483 2484def resolve(build, hostname=None, ban_list=None): 2485 """Resolve a devserver can be used for given build and hostname. 2486 2487 @param build: Name of a build to stage on devserver, e.g., 2488 ChromeOS build: daisy-release/R50-1234.0.0 2489 Launch Control build: git_mnc_release/shamu-eng 2490 @param hostname: Hostname of a devserver for, default is None, which means 2491 devserver is not restricted by the network location of the host. 2492 @param ban_list: The blacklist of devservers shouldn't be chosen. 2493 2494 @return: A DevServer instance that can be used to stage given build for the 2495 given host. 2496 """ 2497 if utils.is_launch_control_build(build): 2498 return AndroidBuildServer.resolve(build, hostname) 2499 else: 2500 return ImageServer.resolve(build, hostname, ban_list=ban_list) 2501