# afe_lock_machine.py revision d97422aef5709f0a3b16b4efebc397c891940c95
#!/usr/bin/python2
#
# Copyright 2015 Google INc. All Rights Reserved.
"""This module controls locking and unlocking of test machines."""

from __future__ import print_function

import argparse
import getpass
import os
import sys
import traceback

from utils import logger
from utils import machines


class AFELockException(Exception):
  """Base class for exceptions in this module."""


class MachineNotPingable(AFELockException):
  """Raised when machine does not respond to ping."""


class MissingHostInfo(AFELockException):
  """Raised when cannot find info about machine on machine servers."""


class UpdateNonLocalMachine(AFELockException):
  """Raised when user requests to add/remove a ChromeOS HW Lab machine."""


class DuplicateAdd(AFELockException):
  """Raised when user requests to add a machine that's already on the server."""


class UpdateServerError(AFELockException):
  """Raised when attempt to add/remove a machine from local server fails."""


class LockingError(AFELockException):
  """Raised when server fails to lock/unlock machine as requested."""


class DontOwnLock(AFELockException):
  """Raised when user attempts to unlock machine locked by someone else."""
  # This should not be raised if the user specified '--force'


class NoAFEServer(AFELockException):
  """Raised when cannot find/access the autotest server."""


class AFEAccessError(AFELockException):
  """Raised when cannot get information about lab machine from lab server."""


class AFELockManager(object):
  """Class for locking/unlocking machines via Autotest Front End servers.

  This class contains methods for checking the locked status of machines
  on both the ChromeOS HW Lab AFE server and a local AFE server.  It also
  has methods for adding/removing machines from the local server, and for
  changing the lock status of machines on either server.  For the ChromeOS
  HW Lab, it only allows access to the toolchain team lab machines, as
  defined in toolchain-utils/crosperf/default_remotes.  By default it will
  look for a local server on chrotomation2.mtv.corp.google.com, but an
  alternative local AFE server can be supplied, if desired.

  !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
  thread/process of a program.  If you launch threads and try to call it
  from a thread, you will get an error.  This has to do with restrictions
  in the Python virtual machine (and signal handling) and cannot be changed.
  """

  LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com'

  def __init__(self,
               remotes,
               force_option,
               chromeos_root,
               local_server,
               local=True,
               log=None):
    """Initializes an AFELockManager object.

    Args:
      remotes: A list of machine names or ip addresses to be managed.  Names
        and ip addresses should be represented as strings.  If the list is
        empty (or None), the lock manager will get all known machines.
      force_option: A Boolean indicating whether or not to force an unlock of
        a machine that was locked by someone else.
      chromeos_root: The ChromeOS chroot to use for the autotest scripts.
      local_server: A string containing the name or ip address of the machine
        that is running an AFE server, which is to be used for managing
        machines that are not in the ChromeOS HW lab.
      local: A Boolean indicating whether or not to use/allow a local AFE
        server to be used (see local_server argument).
      log: If not None, this is the logger object to be used for writing out
        informational output messages.  It is expected to be an instance of
        Logger class from utils/logger.py.

    Raises:
      MachineNotPingable: If a local server is requested but it does not
        respond to ping.
    """
    self.chromeos_root = chromeos_root
    self.user = getpass.getuser()
    self.logger = log or logger.GetLogger()
    autotest_path = os.path.join(chromeos_root,
                                 'src/third_party/autotest/files')

    sys.path.append(chromeos_root)
    sys.path.append(autotest_path)
    sys.path.append(os.path.join(autotest_path, 'server', 'cros'))

    # We have to wait to do these imports until the paths above have
    # been fixed.
    # pylint: disable=import-error
    from client import setup_modules
    setup_modules.setup(base_path=autotest_path,
                        root_module_name='autotest_lib')

    from dynamic_suite import frontend_wrappers

    self.afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                             delay_sec=10,
                                             debug=False,
                                             server='cautotest')
    if local:
      server = local_server or AFELockManager.LOCAL_SERVER
      # Make sure local server is pingable before trying to talk to it.
      error_msg = ('Local autotest server machine %s not responding to ping.' %
                   server)
      self.CheckMachine(server, error_msg)
      self.local_afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                                     delay_sec=10,
                                                     debug=False,
                                                     server=server)
    else:
      self.local_afe = None
    self.local = local
    # De-duplicate the requested machines; tolerate remotes=None.
    self.machines = list(set(remotes or []))
    self.force = force_option
    self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
    if not self.machines:
      # No explicit request: operate on every machine we know about.
      self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines()

  def CheckMachine(self, machine, error_msg):
    """Verifies that machine is responding to ping.

    The machine is tried under the given name first and then, if that fails,
    under the name with '.cros' appended.

    Args:
      machine: String containing the name or ip address of machine to check.
      error_msg: Message to print if ping fails.

    Raises:
      MachineNotPingable:  If machine is not responding to 'ping'
    """
    if machines.MachineIsPingable(machine, logging_level='none'):
      return
    cros_machine = machine + '.cros'
    if not machines.MachineIsPingable(cros_machine, logging_level='none'):
      raise MachineNotPingable(error_msg)

  def MachineIsKnown(self, machine):
    """Checks to see if either AFE server knows the given machine.

    Args:
      machine: String containing name or ip address of machine to check.

    Returns:
      Boolean indicating if the machine is in the list of known machines for
        either AFE server.
    """
    if machine in self.toolchain_lab_machines:
      return True
    if self.local_afe and machine in self.GetAllNonlabMachines():
      return True
    return False

  def GetAllToolchainLabMachines(self):
    """Gets a list of all the toolchain machines in the ChromeOS HW lab.

    Reads crosperf/default_remotes (relative to this file).  Each non-blank
    line is expected to have the form '<label>: <machine> <machine> ...';
    only the machine names after the first ':' are collected.

    Returns:
      A list of names of the toolchain machines in the ChromeOS HW lab.

    Raises:
      ValueError: If a non-blank line contains no ':' separator.
    """
    machines_file = os.path.join(
        os.path.dirname(__file__), 'crosperf', 'default_remotes')
    machine_list = []
    with open(machines_file, 'r') as input_file:
      for line in input_file:
        line = line.strip()
        if not line:
          # Blank lines (e.g. a trailing newline) carry no machines.
          continue
        # Split on the first ':' only, so a ':' inside a machine entry does
        # not break the unpacking.
        _, remotes = line.split(':', 1)
        machine_list.extend(remotes.split())
    return machine_list

  def GetAllNonlabMachines(self):
    """Gets a list of all known machines on the local AFE server.

    Returns:
      A list of the names of the machines on the local AFE server (empty if
      there is no local AFE server).
    """
    if not self.local_afe:
      return []
    return self.local_afe.get_hostnames()

  def PrintStatusHeader(self, is_lab_machine):
    """Prints the status header lines for machines.

    Args:
      is_lab_machine: Boolean indicating whether to print HW Lab header or
        local machine header (different spacing).
    """
    if is_lab_machine:
      print('\nMachine (Board)\t\t\t\t\tStatus')
      print('---------------\t\t\t\t\t------\n')
    else:
      print('\nMachine (Board)\t\tStatus')
      print('---------------\t\t------\n')

  def RemoveLocalMachine(self, m):
    """Removes a machine from the local AFE server.

    Does nothing if there is no local AFE server.

    Args:
      m: The machine to remove.

    Raises:
      MissingHostInfo:  Can't find machine to be removed.
    """
    if not self.local_afe:
      return
    host_info = self.local_afe.get_hosts(hostname=m)
    if not host_info:
      raise MissingHostInfo('Cannot find/delete machine %s.' % m)
    host_info[0].delete()

  def AddLocalMachine(self, m):
    """Adds a machine to the local AFE server.

    Does nothing if there is no local AFE server.

    Args:
      m: The machine to be added.

    Raises:
      MachineNotPingable: If the machine does not respond to ping.
    """
    if not self.local_afe:
      return
    error_msg = 'Machine %s is not responding to ping.' % m
    self.CheckMachine(m, error_msg)
    self.local_afe.create_host(m)

  def AddMachinesToLocalServer(self):
    """Adds one or more machines to the local AFE server.

    Verify that the requested machines are legal to add to the local server,
    i.e. that they are not ChromeOS HW lab machines, and they are not already
    on the local server.  Call AddLocalMachine for each valid machine.

    Raises:
      NoAFEServer: There is no local AFE server to add machines to.
      DuplicateAdd: Attempt to add a machine that is already on the server.
      UpdateNonLocalMachine:  Attempt to add a ChromeOS HW lab machine.
      UpdateServerError:  Something went wrong while attempting to add a
        machine.
    """
    if not self.HasAFEServer(True):
      # Without this guard the get_hosts call below would fail with an
      # AttributeError on None.
      raise NoAFEServer('Error: Cannot connect to local AFE server.')

    for m in self.machines:
      for cros_name in [m, m + '.cros']:
        if cros_name in self.toolchain_lab_machines:
          raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS '
                                      'HW Lab.  Cannot add it to local '
                                      'server.' % cros_name)
      host_info = self.local_afe.get_hosts(hostname=m)
      if host_info:
        raise DuplicateAdd('Machine %s is already on the local server.' % m)
      try:
        self.AddLocalMachine(m)
        self.logger.LogOutput('Successfully added %s to local server.' % m)
      except Exception as e:
        # Print the original traceback before wrapping, since the wrapped
        # exception only carries the message.
        traceback.print_exc()
        raise UpdateServerError(
            'Error occurred while attempting to add %s. %s' % (m, str(e)))

  def RemoveMachinesFromLocalServer(self):
    """Removes one or more machines from the local AFE server.

    Verify that the requested machines are legal to remove from the local
    server, i.e. that they are not ChromeOS HW lab machines.  Call
    RemoveLocalMachine for each valid machine.

    Raises:
      NoAFEServer: There is no local AFE server to remove machines from.
      UpdateNonLocalMachine:  Attempt to remove a ChromeOS HW lab machine.
      UpdateServerError:  Something went wrong while attempting to remove a
        machine.
    """
    if not self.HasAFEServer(True):
      # RemoveLocalMachine is a no-op without a local server; fail loudly
      # rather than logging a removal that never happened.
      raise NoAFEServer('Error: Cannot connect to local AFE server.')

    for m in self.machines:
      for cros_name in [m, m + '.cros']:
        if cros_name in self.toolchain_lab_machines:
          raise UpdateNonLocalMachine(
              'Machine %s is in the ChromeOS HW Lab. '
              'This script cannot remove lab machines.' % cros_name)
      try:
        self.RemoveLocalMachine(m)
        self.logger.LogOutput('Successfully removed %s from local server.' % m)
      except Exception as e:
        traceback.print_exc()
        raise UpdateServerError('Error occurred while attempting to remove %s '
                                '(%s).' % (m, str(e)))

  def _PrintMachineState(self, name, state):
    """Prints one status line for a machine.

    Args:
      name: The machine name to display.
      state: The machine's state dictionary, with keys 'board', 'locked',
        'locked_by' and 'lock_time' (as built by GetMachineStates).
    """
    if state['locked']:
      print('%s (%s)\tlocked by %s since %s' %
            (name, state['board'], state['locked_by'], state['lock_time']))
    else:
      print('%s (%s)\tunlocked' % (name, state['board']))

  def ListMachineStates(self, machine_states):
    """Gets and prints the current status for a list of machines.

    Prints out the current status for all of the machines in the current
    AFELockManager's list of machines (set when the object is initialized).
    HW Lab machines are printed first, as they are encountered; all other
    machines are printed afterwards under a separate header.

    Args:
      machine_states: A dictionary of the current state of every machine in
        the current AFELockManager's list of machines.  Normally obtained by
        calling AFELockManager::GetMachineStates.
    """
    local_machines = []
    printed_hdr = False
    for m in machine_states:
      cros_name = m + '.cros'
      if m in self.toolchain_lab_machines:
        name = m
      elif cros_name in self.toolchain_lab_machines:
        name = cros_name
      else:
        # Not a lab machine; defer until the lab section is complete.
        local_machines.append(m)
        continue

      if not printed_hdr:
        self.PrintStatusHeader(True)
        printed_hdr = True
      self._PrintMachineState(name, machine_states[m])

    if local_machines:
      self.PrintStatusHeader(False)
      for m in local_machines:
        self._PrintMachineState(m, machine_states[m])

  def UpdateLockInAFE(self, should_lock_machine, machine):
    """Calls an AFE server to lock/unlock a machine.

    Args:
      should_lock_machine: Boolean indicating whether to lock the machine (True)
        or unlock the machine (False).
      machine: The machine to update.

    Raises:
      LockingError:  An error occurred while attempting to update the machine
        state.
    """
    action = 'lock' if should_lock_machine else 'unlock'
    kwargs = {
        'locked': should_lock_machine,
        'lock_reason': 'toolchain user request (%s)' % self.user,
    }

    # Lab machines are listed with a '.cros' suffix; accept the short name.
    cros_name = machine + '.cros'
    if cros_name in self.toolchain_lab_machines:
      machine = cros_name
    if machine in self.toolchain_lab_machines:
      # The main AFE server is queried by the name up to the first '.'.
      m = machine.split('.')[0]
      afe_server = self.afe
    else:
      m = machine
      afe_server = self.local_afe

    try:
      afe_server.run('modify_hosts',
                     host_filter_data={'hostname__in': [m]},
                     update_data=kwargs)
    except Exception as e:
      # Also covers afe_server being None (no local server configured).
      traceback.print_exc()
      raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e)))

  def UpdateMachines(self, lock_machines):
    """Sets the locked state of the machines to the requested value.

    The machines updated are the ones in self.machines (specified when the
    class object was initialized).

    Args:
      lock_machines: Boolean indicating whether to lock the machines (True) or
        unlock the machines (False).

    Returns:
      A list of the machines whose state was successfully updated.

    Raises:
      LockingError: Propagated from UpdateLockInAFE if any update fails.
    """
    updated_machines = []
    for m in self.machines:
      self.UpdateLockInAFE(lock_machines, m)
      # Since we returned from self.UpdateLockInAFE we assume the request
      # succeeded.
      if lock_machines:
        self.logger.LogOutput('Locked machine(s) %s.' % m)
      else:
        self.logger.LogOutput('Unlocked machine(s) %s.' % m)
      updated_machines.append(m)

    return updated_machines

  def _InternalRemoveMachine(self, machine):
    """Remove machine from internal list of machines.

    Args:
      machine: Name of machine to be removed from internal list.
    """
    # Check to see if machine is lab machine and if so, make sure it has
    # ".cros" on the end.  A name counts as a lab machine here when it
    # contains both 'rack' and 'row' somewhere after its first character.
    cros_machine = machine
    if machine.find('rack') > 0 and machine.find('row') > 0:
      if '.cros' not in machine:
        cros_machine = cros_machine + '.cros'

    # Drop both spellings, since self.machines may hold either one.
    self.machines = [m for m in self.machines
                     if m not in (cros_machine, machine)]

  def CheckMachineLocks(self, machine_states, cmd):
    """Check that every machine in requested list is in the proper state.

    If the cmd is 'unlock' verify that every machine is locked by requestor.
    If the cmd is 'lock' verify that every machine is currently unlocked.
    Machines that are already in the requested state are dropped from
    self.machines (with a warning) so that they are not updated again.

    Args:
      machine_states: A dictionary of the current state of every machine in
        the current AFELockManager's list of machines.  Normally obtained by
        calling AFELockManager::GetMachineStates.
      cmd:  The user-requested action for the machines: 'lock' or 'unlock'.

    Raises:
      DontOwnLock: The lock on a requested machine is owned by someone else.
    """
    # .items() rather than .iteritems() so this runs on Python 2 and 3.
    for k, state in machine_states.items():
      if cmd == 'unlock':
        if not state['locked']:
          self.logger.LogWarning('Attempt to unlock already unlocked machine '
                                 '(%s).' % k)
          self._InternalRemoveMachine(k)
        elif state['locked_by'] != self.user:
          raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
                            'else (%s).' % (k, state['locked_by']))
      elif cmd == 'lock':
        if state['locked']:
          self.logger.LogWarning('Attempt to lock already locked machine (%s)' %
                                 k)
          self._InternalRemoveMachine(k)

  def HasAFEServer(self, local):
    """Verifies that the AFELockManager has appropriate AFE server.

    Args:
      local: Boolean indicating whether we are checking for the local server
        (True) or for the global server (False).

    Returns:
      A boolean indicating if the AFELockManager has the requested AFE server.
    """
    server = self.local_afe if local else self.afe
    return server is not None

  def GetMachineStates(self, cmd=''):
    """Gets the current state of all the requested machines.

    Gets the current state of all the requested machines, both from the HW lab
    server and from the local server.  Stores the data in a dictionary keyed
    by machine name.

    Args:
      cmd: The command for which we are getting the machine states. This is
        important because if one of the requested machines is missing we raise
        an exception, unless the requested command is 'add'.

    Returns:
      A dictionary of machine states for all the machines in the AFELockManager
      object.  Each state is a dictionary with keys 'board', 'locked',
      'locked_by' and 'lock_time'; a machine that was not found (only possible
      when cmd is 'add') maps to an empty dictionary.

    Raises:
      NoAFEServer:  Cannot find the HW Lab or local AFE server.
      AFEAccessError:  An error occurred when querying the server about a
        machine.
    """
    if not self.HasAFEServer(False):
      raise NoAFEServer('Error: Cannot connect to main AFE server.')

    if self.local and not self.HasAFEServer(True):
      raise NoAFEServer('Error: Cannot connect to local AFE server.')

    machine_list = {}
    for m in self.machines:
      host_info = None
      cros_name = m + '.cros'
      if (m in self.toolchain_lab_machines or
          cros_name in self.toolchain_lab_machines):
        # The main AFE server is queried by the name up to the first '.'.
        mod_host = m.split('.')[0]
        host_info = self.afe.get_hosts(hostname=mod_host)
        if not host_info:
          raise AFEAccessError('Unable to get information about %s from main'
                               ' autotest server.' % m)
      else:
        if not self.HasAFEServer(True):
          # Non-lab machine requested while the local server is disabled.
          raise NoAFEServer('Error: No local AFE server available to look up '
                            '%s.' % m)
        host_info = self.local_afe.get_hosts(hostname=m)
        if not host_info and cmd != 'add':
          raise AFEAccessError('Unable to get information about %s from '
                               'local autotest server.' % m)
      if host_info:
        host_info = host_info[0]
        name = host_info.hostname
        values = {}
        values['board'] = host_info.platform if host_info.platform else '??'
        values['locked'] = host_info.locked
        if host_info.locked:
          values['locked_by'] = host_info.locked_by
          values['lock_time'] = host_info.lock_time
        else:
          values['locked_by'] = ''
          values['lock_time'] = ''
        machine_list[name] = values
      else:
        machine_list[m] = {}
    return machine_list


def Main(argv):
  """Parse the options, initialize lock manager and dispatch proper method.

  Args:
    argv: The options with which this script was invoked.

  Returns:
    0 unless an exception is raised.
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--list',
                      dest='cmd',
                      action='store_const',
                      const='status',
                      help='List current status of all known machines.')
  parser.add_argument('--lock',
                      dest='cmd',
                      action='store_const',
                      const='lock',
                      help='Lock given machine(s).')
  parser.add_argument('--unlock',
                      dest='cmd',
                      action='store_const',
                      const='unlock',
                      help='Unlock given machine(s).')
  parser.add_argument('--status',
                      dest='cmd',
                      action='store_const',
                      const='status',
                      help='List current status of given machine(s).')
  parser.add_argument('--add_machine',
                      dest='cmd',
                      action='store_const',
                      const='add',
                      help='Add machine to local machine server.')
  parser.add_argument('--remove_machine',
                      dest='cmd',
                      action='store_const',
                      const='remove',
                      help='Remove machine from the local machine server.')
  parser.add_argument('--nolocal',
                      dest='local',
                      action='store_false',
                      default=True,
                      help='Do not try to use local machine server.')
  parser.add_argument('--remote',
                      dest='remote',
                      help='machines on which to operate')
  parser.add_argument('--chromeos_root',
                      dest='chromeos_root',
                      required=True,
                      help='ChromeOS root to use for autotest scripts.')
  parser.add_argument('--local_server',
                      dest='local_server',
                      default=None,
                      help='Alternate local autotest server to use.')
  parser.add_argument('--force',
                      dest='force',
                      action='store_true',
                      default=False,
                      help='Force lock/unlock of machines, even if not'
                      ' current lock owner.')

  options = parser.parse_args(argv)

  # Check for a missing operation first, so that running with no operation
  # reports that rather than the less helpful "No machines specified".
  if not options.cmd:
    parser.error('No operation selected (--list, --status, --lock, --unlock,'
                 ' --add_machine, --remove_machine).')

  if not options.remote and options.cmd != 'status':
    parser.error('No machines specified for operation.')

  if not os.path.isdir(options.chromeos_root):
    parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)

  machine_list = []
  if options.remote:
    machine_list = options.remote.split()

  lock_manager = AFELockManager(machine_list, options.force,
                                options.chromeos_root, options.local_server,
                                options.local)

  cmd = options.cmd
  machine_states = lock_manager.GetMachineStates(cmd=cmd)

  if cmd == 'status':
    lock_manager.ListMachineStates(machine_states)

  elif cmd == 'lock':
    if not lock_manager.force:
      lock_manager.CheckMachineLocks(machine_states, cmd)
    lock_manager.UpdateMachines(True)

  elif cmd == 'unlock':
    if not lock_manager.force:
      lock_manager.CheckMachineLocks(machine_states, cmd)
    lock_manager.UpdateMachines(False)

  elif cmd == 'add':
    lock_manager.AddMachinesToLocalServer()

  elif cmd == 'remove':
    lock_manager.RemoveMachinesFromLocalServer()

  return 0


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))