afe_lock_machine.py revision f2a3ef46f75d2196a93d3ed27f4d1fcf22b54fbe
#!/usr/bin/python2
#
# Copyright 2015 Google Inc. All Rights Reserved.
"""This module controls locking and unlocking of test machines."""

from __future__ import print_function

import argparse
import getpass
import os
import sys
import traceback

from utils import logger
from utils import machines


class AFELockException(Exception):
  """Base class for exceptions in this module."""


class MachineNotPingable(AFELockException):
  """Raised when machine does not respond to ping."""


class MissingHostInfo(AFELockException):
  """Raised when cannot find info about machine on machine servers."""


class UpdateNonLocalMachine(AFELockException):
  """Raised when user requests to add/remove a ChromeOS HW Lab machine."""


class DuplicateAdd(AFELockException):
  """Raised when user requests to add a machine that's already on the server."""


class UpdateServerError(AFELockException):
  """Raised when attempt to add/remove a machine from local server fails."""


class LockingError(AFELockException):
  """Raised when server fails to lock/unlock machine as requested."""


class DontOwnLock(AFELockException):
  """Raised when user attempts to unlock machine locked by someone else."""
  # This should not be raised if the user specified '--force'


class NoAFEServer(AFELockException):
  """Raised when cannot find/access the autotest server."""


class AFEAccessError(AFELockException):
  """Raised when cannot get information about lab machine from lab server."""


class AFELockManager(object):
  """Class for locking/unlocking machines via Autotest Front End servers.

  This class contains methods for checking the locked status of machines
  on both the ChromeOS HW Lab AFE server and a local AFE server.  It also
  has methods for adding/removing machines from the local server, and for
  changing the lock status of machines on either server.  For the ChromeOS
  HW Lab, it only allows access to the toolchain team lab machines, as
  defined in toolchain-utils/crosperf/default_remotes.  By default it will
  look for a local server on chrotomation2.mtv.corp.google.com, but an
  alternative local AFE server can be supplied, if desired.

  !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
  thread/process of a program.  If you launch threads and try to call it
  from a thread, you will get an error.  This has to do with restrictions
  in the Python virtual machine (and signal handling) and cannot be changed.
  """

  LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com'

  def __init__(self,
               remotes,
               force_option,
               chromeos_root,
               local_server,
               local=True,
               log=None):
    """Initializes an AFELockManager object.

    Args:
      remotes: A list of machine names or ip addresses to be managed.  Names
        and ip addresses should be represented as strings.  If the list is
        empty (or None), the lock manager will get all known machines.
      force_option: A Boolean indicating whether or not to force an unlock of
        a machine that was locked by someone else.
      chromeos_root: The ChromeOS chroot to use for the autotest scripts.
      local_server: A string containing the name or ip address of the machine
        that is running an AFE server, which is to be used for managing
        machines that are not in the ChromeOS HW lab.
      local: A Boolean indicating whether or not to use/allow a local AFE
        server to be used (see local_server argument).
      log: If not None, this is the logger object to be used for writing out
        informational output messages.  It is expected to be an instance of
        Logger class from utils/logger.py.

    Raises:
      MachineNotPingable: If a local server is requested but it does not
        respond to ping.
    """
    self.chromeos_root = chromeos_root
    self.user = getpass.getuser()
    self.logger = log or logger.GetLogger()
    autotest_path = os.path.join(chromeos_root,
                                 'src/third_party/autotest/files')

    # Extend the import path, but do not pile up duplicate entries when more
    # than one lock manager is created in the same process.
    for path in (chromeos_root, autotest_path,
                 os.path.join(autotest_path, 'server', 'cros')):
      if path not in sys.path:
        sys.path.append(path)

    # We have to wait to do these imports until the paths above have
    # been fixed.
    from client import setup_modules
    setup_modules.setup(base_path=autotest_path,
                        root_module_name='autotest_lib')

    from dynamic_suite import frontend_wrappers

    self.afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                             delay_sec=10,
                                             debug=False,
                                             server='cautotest')
    if not local:
      self.local_afe = None
    else:
      server_name = local_server or AFELockManager.LOCAL_SERVER
      # Make sure local server is pingable.
      error_msg = ('Local autotest server machine %s not responding to ping.' %
                   server_name)
      self.CheckMachine(server_name, error_msg)
      self.local_afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                                     delay_sec=10,
                                                     debug=False,
                                                     server=server_name)
    self.local = local
    # De-duplicate the requested machines; 'remotes or []' tolerates None.
    self.machines = list(set(remotes or []))
    self.force = force_option
    self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
    if not self.machines:
      self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines()

  def CheckMachine(self, machine, error_msg):
    """Verifies that machine is responding to ping.

    The machine is tried under the given name first and then with a '.cros'
    suffix appended.

    Args:
      machine: String containing the name or ip address of machine to check.
      error_msg: Message to print if ping fails.

    Raises:
      MachineNotPingable:  If machine is not responding to 'ping'
    """
    if machines.MachineIsPingable(machine, logging_level='none'):
      return
    cros_machine = machine + '.cros'
    if not machines.MachineIsPingable(cros_machine, logging_level='none'):
      raise MachineNotPingable(error_msg)

  def MachineIsKnown(self, machine):
    """Checks to see if either AFE server knows the given machine.

    Args:
      machine: String containing name or ip address of machine to check.

    Returns:
      Boolean indicating if the machine is in the list of known machines for
        either AFE server.
    """
    if machine in self.toolchain_lab_machines:
      return True
    if self.local_afe and machine in self.GetAllNonlabMachines():
      return True
    return False

  def GetAllToolchainLabMachines(self):
    """Gets a list of all the toolchain machines in the ChromeOS HW lab.

    The names are read from the 'crosperf/default_remotes' file located next
    to this script.  Each entry is parsed as '<label>: <machine> <machine>
    ...'; blank lines, lines starting with '#' and lines without a ':' are
    skipped.

    Returns:
      A list of names of the toolchain machines in the ChromeOS HW lab.
    """
    machines_file = os.path.join(
        os.path.dirname(__file__), 'crosperf', 'default_remotes')
    machine_list = []
    with open(machines_file, 'r') as input_file:
      for line in input_file:
        line = line.strip()
        # Skip anything that is not a '<label>: <machines>' entry instead of
        # failing on the tuple unpacking below.
        if not line or line.startswith('#') or ':' not in line:
          continue
        _, remotes = line.split(':', 1)
        machine_list.extend(remotes.split())
    return machine_list

  def GetAllNonlabMachines(self):
    """Gets a list of all known machines on the local AFE server.

    Returns:
      A list of the names of the machines on the local AFE server; an empty
      list if no local server is in use.
    """
    if not self.local_afe:
      return []
    return self.local_afe.get_hostnames()

  def PrintStatusHeader(self, is_lab_machine):
    """Prints the status header lines for machines.

    Args:
      is_lab_machine: Boolean indicating whether to print HW Lab header or
        local machine header (different spacing).
    """
    if is_lab_machine:
      print('\nMachine (Board)\t\t\t\t\tStatus')
      print('---------------\t\t\t\t\t------\n')
    else:
      print('\nMachine (Board)\t\tStatus')
      print('---------------\t\t------\n')

  def RemoveLocalMachine(self, m):
    """Removes a machine from the local AFE server.

    Does nothing if no local AFE server is in use.

    Args:
      m: The machine to remove.

    Raises:
      MissingHostInfo:  Can't find machine to be removed.
    """
    if not self.local_afe:
      return
    host_info = self.local_afe.get_hosts(hostname=m)
    if not host_info:
      raise MissingHostInfo('Cannot find/delete machine %s.' % m)
    host_info[0].delete()

  def AddLocalMachine(self, m):
    """Adds a machine to the local AFE server.

    Does nothing if no local AFE server is in use.

    Args:
      m: The machine to be added.

    Raises:
      MachineNotPingable: The machine does not respond to ping.
    """
    if not self.local_afe:
      return
    error_msg = 'Machine %s is not responding to ping.' % m
    self.CheckMachine(m, error_msg)
    self.local_afe.create_host(m)

  def AddMachinesToLocalServer(self):
    """Adds one or more machines to the local AFE server.

    Verify that the requested machines are legal to add to the local server,
    i.e. that they are not ChromeOS HW lab machines, and they are not already
    on the local server.  Call AddLocalMachine for each valid machine.

    Raises:
      NoAFEServer: No local AFE server is in use.
      DuplicateAdd: Attempt to add a machine that is already on the server.
      UpdateNonLocalMachine:  Attempt to add a ChromeOS HW lab machine.
      UpdateServerError:  Something went wrong while attempting to add a
        machine.
    """
    if not self.local_afe:
      raise NoAFEServer('Error: Cannot add machines; no local AFE server is '
                        'in use.')

    for m in self.machines:
      for cros_name in [m, m + '.cros']:
        if cros_name in self.toolchain_lab_machines:
          raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS '
                                      'HW Lab.  Cannot add it to local '
                                      'server.' % cros_name)
      host_info = self.local_afe.get_hosts(hostname=m)
      if host_info:
        raise DuplicateAdd('Machine %s is already on the local server.' % m)
      try:
        self.AddLocalMachine(m)
        self.logger.LogOutput('Successfully added %s to local server.' % m)
      except Exception as e:
        traceback.print_exc()
        raise UpdateServerError(
            'Error occurred while attempting to add %s. %s' % (m, str(e)))

  def RemoveMachinesFromLocalServer(self):
    """Removes one or more machines from the local AFE server.

    Verify that the requested machines are legal to remove from the local
    server, i.e. that they are not ChromeOS HW lab machines.  Call
    RemoveLocalMachine for each valid machine.

    Raises:
      NoAFEServer: No local AFE server is in use.
      UpdateNonLocalMachine: Attempt to remove a ChromeOS HW lab machine.
      UpdateServerError:  Something went wrong while attempting to remove a
        machine.
    """
    # Without this check RemoveLocalMachine would silently do nothing and a
    # misleading success message would be logged.
    if not self.local_afe:
      raise NoAFEServer('Error: Cannot remove machines; no local AFE server '
                        'is in use.')

    for m in self.machines:
      for cros_name in [m, m + '.cros']:
        if cros_name in self.toolchain_lab_machines:
          raise UpdateNonLocalMachine(
              'Machine %s is in the ChromeOS HW Lab. '
              'This script cannot remove lab machines.' % cros_name)
      try:
        self.RemoveLocalMachine(m)
        self.logger.LogOutput('Successfully removed %s from local server.' % m)
      except Exception as e:
        traceback.print_exc()
        raise UpdateServerError('Error occurred while attempting to remove %s '
                                '(%s).' % (m, str(e)))

  def _PrintMachineState(self, name, state):
    """Prints one status line for a machine.

    Args:
      name: The machine name to display.
      state: The state dictionary for the machine, with the keys 'locked',
        'board', 'locked_by' and 'lock_time'.
    """
    if state['locked']:
      print('%s (%s)\tlocked by %s since %s' %
            (name, state['board'], state['locked_by'], state['lock_time']))
    else:
      print('%s (%s)\tunlocked' % (name, state['board']))

  def ListMachineStates(self, machine_states):
    """Gets and prints the current status for a list of machines.

    Prints out the current status for all of the machines in the current
    AFELockManager's list of machines (set when the object is initialized).
    HW lab machines are printed first, under their own header, followed by
    the machines from the local server.

    Args:
      machine_states: A dictionary of the current state of every machine in
        the current AFELockManager's list of machines.  Normally obtained by
        calling AFELockManager::GetMachineStates.
    """
    local_machines = []
    printed_hdr = False
    for m in machine_states:
      cros_name = m + '.cros'
      if m in self.toolchain_lab_machines:
        name = m
      elif cros_name in self.toolchain_lab_machines:
        name = cros_name
      else:
        # Not a lab machine; print it later in the local-machine section.
        local_machines.append(m)
        continue

      if not printed_hdr:
        self.PrintStatusHeader(True)
        printed_hdr = True
      self._PrintMachineState(name, machine_states[m])

    if local_machines:
      self.PrintStatusHeader(False)
      for m in local_machines:
        self._PrintMachineState(m, machine_states[m])

  def UpdateLockInAFE(self, should_lock_machine, machine):
    """Calls an AFE server to lock/unlock a machine.

    Lab machines are updated on the HW lab server (using the name without its
    '.cros' suffix); all other machines are updated on the local server.

    Args:
      should_lock_machine: Boolean indicating whether to lock the machine (True)
        or unlock the machine (False).
      machine: The machine to update.

    Raises:
      LockingError:  An error occurred while attempting to update the machine
        state.
    """
    action = 'lock' if should_lock_machine else 'unlock'
    kwargs = {'locked': should_lock_machine}
    kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user

    cros_name = machine + '.cros'
    if cros_name in self.toolchain_lab_machines:
      machine = cros_name
    if machine in self.toolchain_lab_machines:
      m = machine.split('.')[0]
      afe_server = self.afe
    else:
      m = machine
      afe_server = self.local_afe

    # A non-lab machine cannot be updated when no local server is in use.
    if afe_server is None:
      raise LockingError('Unable to %s machine %s. %s' %
                         (action, m, 'No local AFE server is in use.'))

    try:
      afe_server.run('modify_hosts',
                     host_filter_data={'hostname__in': [m]},
                     update_data=kwargs)
    except Exception as e:
      traceback.print_exc()
      raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e)))

  def UpdateMachines(self, lock_machines):
    """Sets the locked state of the machines to the requested value.

    The machines updated are the ones in self.machines (specified when the
    class object was initialized).

    Args:
      lock_machines: Boolean indicating whether to lock the machines (True) or
        unlock the machines (False).

    Returns:
      A list of the machines whose state was successfully updated.

    Raises:
      LockingError: Propagated from UpdateLockInAFE when an update fails.
    """
    updated_machines = []
    for m in self.machines:
      self.UpdateLockInAFE(lock_machines, m)
      # Since we returned from self.UpdateLockInAFE we assume the request
      # succeeded.
      if lock_machines:
        self.logger.LogOutput('Locked machine(s) %s.' % m)
      else:
        self.logger.LogOutput('Unlocked machine(s) %s.' % m)
      updated_machines.append(m)

    return updated_machines

  def _InternalRemoveMachine(self, machine):
    """Remove machine from internal list of machines.

    Args:
      machine: Name of machine to be removed from internal list.
    """
    # Check to see if machine is lab machine and if so, make sure it has
    # ".cros" on the end.  Substring tests are used so that a match at the
    # very start of the name also counts.
    cros_machine = machine
    if 'rack' in machine and 'row' in machine and '.cros' not in machine:
      cros_machine = machine + '.cros'

    self.machines = [m for m in self.machines
                     if m not in (machine, cros_machine)]

  def CheckMachineLocks(self, machine_states, cmd):
    """Check that every machine in requested list is in the proper state.

    If the cmd is 'unlock' verify that every machine is locked by requestor.
    If the cmd is 'lock' verify that every machine is currently unlocked.
    Machines that are already in the requested state are dropped from the
    internal list of machines, with a warning.

    Args:
      machine_states: A dictionary of the current state of every machine in
        the current AFELockManager's list of machines.  Normally obtained by
        calling AFELockManager::GetMachineStates.
      cmd: The user-requested action for the machines: 'lock' or 'unlock'.

    Raises:
      DontOwnLock: The lock on a requested machine is owned by someone else.
    """
    # items() (rather than iteritems()) works on both Python 2 and 3.
    for k, state in machine_states.items():
      if cmd == 'unlock':
        if not state['locked']:
          self.logger.LogWarning('Attempt to unlock already unlocked machine '
                                 '(%s).' % k)
          self._InternalRemoveMachine(k)
        elif state['locked_by'] != self.user:
          raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
                            'else (%s).' % (k, state['locked_by']))
      elif cmd == 'lock':
        if state['locked']:
          self.logger.LogWarning('Attempt to lock already locked machine (%s)' %
                                 k)
          self._InternalRemoveMachine(k)

  def HasAFEServer(self, local):
    """Verifies that the AFELockManager has appropriate AFE server.

    Args:
      local: Boolean indicating whether we are checking for the local server
        (True) or for the global server (False).

    Returns:
      A boolean indicating if the AFELockManager has the requested AFE server.
    """
    if local:
      return self.local_afe is not None
    return self.afe is not None

  def GetMachineStates(self, cmd=''):
    """Gets the current state of all the requested machines.

    Gets the current state of all the requested machines, both from the HW lab
    server and from the local server.  Stores the data in a dictionary keyed
    by machine name.

    Args:
      cmd: The command for which we are getting the machine states. This is
        important because if one of the requested machines is missing we raise
        an exception, unless the requested command is 'add'.

    Returns:
      A dictionary of machine states for all the machines in the AFELockManager
      object.  A machine that was found is keyed by the hostname reported by
      the server and maps to a dictionary with the keys 'board', 'locked',
      'locked_by' and 'lock_time'.  A machine that was not found (only
      possible when cmd is 'add') is keyed by the requested name and maps to
      an empty dictionary.

    Raises:
      NoAFEServer:  Cannot find the HW Lab or local AFE server.
      AFEAccessError:  An error occurred when querying the server about a
        machine.
    """
    if not self.HasAFEServer(False):
      raise NoAFEServer('Error: Cannot connect to main AFE server.')

    if self.local and not self.HasAFEServer(True):
      raise NoAFEServer('Error: Cannot connect to local AFE server.')

    machine_list = {}
    for m in self.machines:
      host_info = None
      cros_name = m + '.cros'
      if (m in self.toolchain_lab_machines or
          cros_name in self.toolchain_lab_machines):
        mod_host = m.split('.')[0]
        host_info = self.afe.get_hosts(hostname=mod_host)
        if not host_info:
          raise AFEAccessError('Unable to get information about %s from main'
                               ' autotest server.' % m)
      else:
        # A non-lab machine can only be looked up on the local server.
        if not self.local_afe:
          raise NoAFEServer('Error: %s is not a toolchain lab machine and no '
                            'local AFE server is in use.' % m)
        host_info = self.local_afe.get_hosts(hostname=m)
        if not host_info and cmd != 'add':
          raise AFEAccessError('Unable to get information about %s from '
                               'local autotest server.' % m)

      if not host_info:
        machine_list[m] = {}
        continue

      host_info = host_info[0]
      is_locked = host_info.locked
      machine_list[host_info.hostname] = {
          'board': host_info.platform if host_info.platform else '??',
          'locked': is_locked,
          'locked_by': host_info.locked_by if is_locked else '',
          'lock_time': host_info.lock_time if is_locked else '',
      }
    return machine_list


def Main(argv):
  """Parse the options, initialize lock manager and dispatch proper method.

  Args:
    argv: The options with which this script was invoked.

  Returns:
    0 unless an exception is raised.
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--list',
                      dest='cmd',
                      action='store_const',
                      const='status',
                      help='List current status of all known machines.')
  parser.add_argument('--lock',
                      dest='cmd',
                      action='store_const',
                      const='lock',
                      help='Lock given machine(s).')
  parser.add_argument('--unlock',
                      dest='cmd',
                      action='store_const',
                      const='unlock',
                      help='Unlock given machine(s).')
  parser.add_argument('--status',
                      dest='cmd',
                      action='store_const',
                      const='status',
                      help='List current status of given machine(s).')
  parser.add_argument('--add_machine',
                      dest='cmd',
                      action='store_const',
                      const='add',
                      help='Add machine to local machine server.')
  parser.add_argument('--remove_machine',
                      dest='cmd',
                      action='store_const',
                      const='remove',
                      help='Remove machine from the local machine server.')
  parser.add_argument('--nolocal',
                      dest='local',
                      action='store_false',
                      default=True,
                      help='Do not try to use local machine server.')
  parser.add_argument('--remote',
                      dest='remote',
                      help='machines on which to operate')
  parser.add_argument('--chromeos_root',
                      dest='chromeos_root',
                      required=True,
                      help='ChromeOS root to use for autotest scripts.')
  parser.add_argument('--local_server',
                      dest='local_server',
                      default=None,
                      help='Alternate local autotest server to use.')
  parser.add_argument('--force',
                      dest='force',
                      action='store_true',
                      default=False,
                      help='Force lock/unlock of machines, even if not'
                      ' current lock owner.')

  options = parser.parse_args(argv)

  # Check for a missing operation first; otherwise a user who supplied
  # neither an operation nor machines is told only about the machines.
  if not options.cmd:
    parser.error('No operation selected (--list, --status, --lock, --unlock,'
                 ' --add_machine, --remove_machine).')

  if not options.remote and options.cmd != 'status':
    parser.error('No machines specified for operation.')

  if not os.path.isdir(options.chromeos_root):
    parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)

  machine_list = []
  if options.remote:
    machine_list = options.remote.split()

  lock_manager = AFELockManager(machine_list, options.force,
                                options.chromeos_root, options.local_server,
                                options.local)

  machine_states = lock_manager.GetMachineStates(cmd=options.cmd)
  cmd = options.cmd

  if cmd == 'status':
    lock_manager.ListMachineStates(machine_states)

  elif cmd == 'lock':
    if not lock_manager.force:
      lock_manager.CheckMachineLocks(machine_states, cmd)
    lock_manager.UpdateMachines(True)

  elif cmd == 'unlock':
    if not lock_manager.force:
      lock_manager.CheckMachineLocks(machine_states, cmd)
    lock_manager.UpdateMachines(False)

  elif cmd == 'add':
    lock_manager.AddMachinesToLocalServer()

  elif cmd == 'remove':
    lock_manager.RemoveMachinesFromLocalServer()

  return 0


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))