afe_lock_machine.py revision f3eb80354a8d7dd386c2880c282a767abad6a18a
1#!/usr/bin/python 2# 3# Copyright 2015 Google INc. All Rights Reserved. 4 5import argparse 6import getpass 7import os 8import sys 9import traceback 10 11from utils import logger 12from utils import machines 13from utils import misc 14 15 16class AFELockException(Exception): 17 """Base class for exceptions in this module.""" 18 19 20class MachineNotPingable(AFELockException): 21 """Raised when machine does not respond to ping.""" 22 23 24class MissingHostInfo(AFELockException): 25 """Raised when cannot find info about machine on machine servers.""" 26 27 28class UpdateNonLocalMachine(AFELockException): 29 """Raised when user requests to add/remove a ChromeOS HW Lab machine..""" 30 31 32class DuplicateAdd(AFELockException): 33 """Raised when user requests to add a machine that's already on the server.""" 34 35 36class UpdateServerError(AFELockException): 37 """Raised when attempt to add/remove a machine from local server fails.""" 38 39 40class LockingError(AFELockException): 41 """Raised when server fails to lock/unlock machine as requested.""" 42 43 44class DontOwnLock(AFELockException): 45 """Raised when user attmepts to unlock machine locked by someone else.""" 46 # This should not be raised if the user specified '--force' 47 48 49class NoAFEServer(AFELockException): 50 """Raised when cannot find/access the autotest server.""" 51 52 53class AFEAccessError(AFELockException): 54 """Raised when cannot get information about lab machine from lab server.""" 55 56 57class AFELockManager(object): 58 """Class for locking/unlocking machines vie Autotest Front End servers. 59 60 This class contains methods for checking the locked status of machines 61 on both the ChromeOS HW Lab AFE server and a local AFE server. It also 62 has methods for adding/removing machines from the local server, and for 63 changing the lock status of machines on either server. For the ChromeOS 64 HW Lab, it only allows access to the toolchain team lab machines, as 65 defined in toolchain-utils/crosperf/default_remotes. By default it will 66 look for a local server on chrotomation2.mtv.corp.google.com, but an 67 alternative local AFE server can be supplied, if desired. 68 69 !!!IMPORTANT NOTE!!! The AFE server can only be called from the main 70 thread/process of a program. If you launch threads and try to call it 71 from a thread, you will get an error. This has to do with restrictions 72 in the Python virtual machine (and signal handling) and cannot be changed. 73 """ 74 75 LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com' 76 77 def __init__(self, remotes, force_option, chromeos_root, local_server, 78 local=True, log=None): 79 """Initializes an AFELockManager object. 80 81 Args: 82 remotes: A list of machine names or ip addresses to be managed. Names 83 and ip addresses should be represented as strings. If the list is empty, 84 the lock manager will get all known machines. 85 force_option: A Boolean indicating whether or not to force an unlock of 86 a machine that was locked by someone else. 87 chromeos_root: The ChromeOS chroot to use for the autotest scripts. 88 local_server: A string containing the name or ip address of the machine 89 that is running an AFE server, which is to be used for managing 90 machines that are not in the ChromeOS HW lab. 91 local: A Boolean indicating whether or not to use/allow a local AFE 92 server to be used (see local_server argument). 93 log: If not None, this is the logger object to be used for writing out 94 informational output messages. It is expected to be an instance of 95 Logger class from utils/logger.py. 96 """ 97 self.chromeos_root = chromeos_root 98 self.user = getpass.getuser() 99 self.logger = log or logger.GetLogger() 100 autotest_path = os.path.join(chromeos_root, 101 'src/third_party/autotest/files') 102 103 sys.path.append(chromeos_root) 104 sys.path.append(autotest_path) 105 sys.path.append(os.path.join(autotest_path, 'server', 'cros')) 106 107 # We have to wait to do these imports until the paths above have 108 # been fixed. 109 from client import setup_modules 110 setup_modules.setup(base_path=autotest_path, 111 root_module_name='autotest_lib') 112 113 from dynamic_suite import frontend_wrappers 114 115 self.afe = frontend_wrappers.RetryingAFE(timeout_min=30, 116 delay_sec=10, 117 debug=False, 118 server='cautotest') 119 if not local: 120 self.local_afe = None 121 else: 122 dargs = {} 123 dargs['server'] = local_server or AFELockManager.LOCAL_SERVER 124 # Make sure local server is pingable. 125 error_msg = ('Local autotest server machine %s not responding to ping.' 126 % dargs['server']) 127 self.CheckMachine(dargs['server'], error_msg) 128 self.local_afe = frontend_wrappers.RetryingAFE(timeout_min=30, 129 delay_sec=10, 130 debug=False, 131 **dargs) 132 self.local = local 133 self.machines = list(set(remotes)) or [] 134 self.force = force_option 135 self.toolchain_lab_machines = self.GetAllToolchainLabMachines() 136 if not self.machines: 137 self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines() 138 139 def CheckMachine(self, machine, error_msg): 140 """Verifies that machine is responding to ping. 141 142 Args: 143 machine: String containing the name or ip address of machine to check. 144 error_msg: Message to print if ping fails. 145 146 Raises: 147 MachineNotPingable: If machine is not responding to 'ping' 148 """ 149 if not machines.MachineIsPingable(machine, logging_level='none'): 150 raise MachineNotPingable(error_msg) 151 152 def MachineIsKnown(self, machine): 153 """Checks to see if either AFE server knows the given machine. 154 155 Args: 156 machine: String containing name or ip address of machine to check. 157 158 Returns: 159 Boolean indicating if the machine is in the list of known machines for 160 either AFE server. 161 """ 162 if machine in self.toolchain_lab_machines: 163 return True 164 elif self.local_afe and machine in self.GetAllNonlabMachines(): 165 return True 166 167 return False 168 169 def GetAllToolchainLabMachines(self): 170 """Gets a list of all the toolchain machines in the ChromeOS HW lab. 171 172 Returns: 173 A list of names of the toolchain machines in the ChromeOS HW lab. 174 """ 175 machines_file = os.path.join(os.path.dirname(__file__), 176 'crosperf', 'default_remotes') 177 machine_list = [] 178 with open(machines_file, 'r') as input_file: 179 lines = input_file.readlines() 180 for line in lines: 181 board, remotes = line.split(':') 182 remotes = remotes.strip() 183 for r in remotes.split(): 184 machine_list.append(r.strip()) 185 return machine_list 186 187 def GetAllNonlabMachines(self): 188 """Gets a list of all known machines on the local AFE server. 189 190 Returns: 191 A list of the names of the machines on the local AFE server. 192 """ 193 non_lab_machines = [] 194 if self.local_afe: 195 non_lab_machines = self.local_afe.get_hostnames() 196 return non_lab_machines 197 198 def PrintStatusHeader(self, is_lab_machine): 199 """Prints the status header lines for machines. 200 201 Args: Boolean indicating whether to print HW Lab header or local 202 machine header (different spacing). 203 """ 204 if is_lab_machine: 205 print '\nMachine (Board)\t\t\t\tStatus' 206 print '---------------\t\t\t\t------\n' 207 else: 208 print '\nMachine (Board)\t\tStatus' 209 print '---------------\t\t------\n' 210 211 def RemoveLocalMachine(self, m): 212 """Removes a machine from the local AFE server. 213 214 Args: 215 m: The machine to remove. 216 217 Raises: 218 MissingHostInfo: Can't find machine to be removed. 219 """ 220 if self.local_afe: 221 host_info = self.local_afe.get_hosts(hostname=m) 222 if host_info: 223 host_info = host_info[0] 224 host_info.delete() 225 else: 226 raise MissingHostInfo('Cannot find/delete machine %s.' % m) 227 228 def AddLocalMachine(self, m): 229 """Adds a machine to the local AFE server. 230 231 Args: 232 m: The machine to be added. 233 """ 234 if self.local_afe: 235 error_msg = 'Machine %s is not responding to ping.' % m 236 self.CheckMachine(m, error_msg) 237 host = self.local_afe.create_host(m) 238 239 def AddMachinesToLocalServer(self): 240 """Adds one or more machines to the local AFE server. 241 242 Verify that the requested machines are legal to add to the local server, 243 i.e. that they are not ChromeOS HW lab machines, and they are not already 244 on the local server. Call AddLocalMachine for each valid machine. 245 246 Raises: 247 DuplicateAdd: Attempt to add a machine that is already on the server. 248 UpdateNonLocalMachine: Attempt to add a ChromeOS HW lab machine. 249 UpdateServerError: Something went wrong while attempting to add a 250 machine. 251 """ 252 for m in self.machines: 253 if m in self.toolchain_lab_machines: 254 raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS HW ' 255 'Lab. Cannot add it to local server.' % m) 256 host_info = self.local_afe.get_hosts(hostname=m) 257 if host_info: 258 raise DuplicateAdd('Machine %s is already on the local server.' % m) 259 try: 260 self.AddLocalMachine(m) 261 self.logger.LogOutput('Successfully added %s to local server.' % m) 262 except Exception as e: 263 traceback.print_exc() 264 raise UpdateServerError('Error occurred while attempting to add %s. %s' 265 % (m, str(e))) 266 267 def RemoveMachinesFromLocalServer(self): 268 """Removes one or more machines from the local AFE server. 269 270 Verify that the requested machines are legal to remove from the local 271 server, i.e. that they are not ChromeOS HW lab machines. Call 272 RemoveLocalMachine for each valid machine. 273 274 Raises: 275 UpdateServerError: Something went wrong while attempting to remove a 276 machine. 277 """ 278 for m in self.machines: 279 if m in self.toolchain_lab_machines: 280 raise UpdateNonLocalMachine('Machine %s is in the ChromeOS HW Lab. ' 281 'This script cannot remove lab machines.' 282 % m) 283 try: 284 self.RemoveLocalMachine(m) 285 self.logger.LogOutput('Successfully removed %s from local server.' % m) 286 except Exception as e: 287 traceback.print_exc() 288 raise UpdateServerError('Error occurred while attempting to remove %s ' 289 '(%s).' % (m, str(e))) 290 291 def ListMachineStates(self, machine_states): 292 """Gets and prints the current status for a list of machines. 293 294 Prints out the current status for all of the machines in the current 295 AFELockManager's list of machines (set when the object is initialized). 296 297 Args: 298 machine_states: A dictionary of the current state of every machine in 299 the current AFELockManager's list of machines. Normally obtained by 300 calling AFELockManager::GetMachineStates. 301 """ 302 local_machines = [] 303 printed_hdr = False 304 for m in machine_states: 305 cros_name = m + '.cros' 306 if (m in self.toolchain_lab_machines or 307 cros_name in self.toolchain_lab_machines): 308 if not printed_hdr: 309 self.PrintStatusHeader(True) 310 printed_hdr = True 311 state = machine_states[m] 312 if state['locked']: 313 print ('%s (%s)\tlocked by %s since %s' % 314 (m, state['board'], state['locked_by'], state['lock_time'])) 315 else: 316 print '%s (%s)\tunlocked' % (m, state['board']) 317 else: 318 local_machines.append(m) 319 320 if local_machines: 321 self.PrintStatusHeader(False) 322 for m in local_machines: 323 state = machine_states[m] 324 if state['locked']: 325 print ('%s (%s)\tlocked by %s since %s' % 326 (m, state['board'], state['locked_by'], state['lock_time'])) 327 else: 328 print '%s (%s)\tunlocked' % (m, state['board']) 329 330 331 def UpdateLockInAFE(self, should_lock_machine, machine): 332 """Calls an AFE server to lock/unlock a machine. 333 334 Args: 335 should_lock_machine: Boolean indicating whether to lock the machine (True) 336 or unlock the machine (False). 337 machine: The machine to update. 338 339 Raises: 340 LockingError: An error occurred while attempting to update the machine 341 state. 342 """ 343 action = 'lock' 344 if not should_lock_machine: 345 action = 'unlock' 346 kwargs = {'locked': should_lock_machine} 347 kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user 348 349 if machine in self.toolchain_lab_machines: 350 m = machine.split('.')[0] 351 afe_server = self.afe 352 else: 353 m = machine 354 afe_server = self.local_afe 355 356 try: 357 afe_server.run('modify_hosts', 358 host_filter_data={'hostname__in': [m]}, 359 update_data=kwargs) 360 except Exception as e: 361 traceback.print_exc() 362 raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e))) 363 364 def UpdateMachines(self, lock_machines): 365 """Sets the locked state of the machines to the requested value. 366 367 The machines updated are the ones in self.machines (specified when the 368 class object was intialized). 369 370 Args: 371 lock_machines: Boolean indicating whether to lock the machines (True) or 372 unlock the machines (False). 373 374 Returns: 375 A list of the machines whose state was successfully updated. 376 """ 377 updated_machines = [] 378 for m in self.machines: 379 self.UpdateLockInAFE(lock_machines, m) 380 381 # Since we returned from self.UpdateLockInAFE we assume the request 382 # succeeded. 383 if lock_machines: 384 self.logger.LogOutput('Locked machine(s) %s.' % m) 385 else: 386 self.logger.LogOutput('Unlocked machine(s) %s.' % m) 387 updated_machines.append(m) 388 389 return updated_machines 390 391 def _InternalRemoveMachine(self, machine): 392 """Remove machine from internal list of machines. 393 394 Args: 395 machine: Name of machine to be removed from internal list. 396 """ 397 # Check to see if machine is lab machine and if so, make sure it has 398 # ".cros" on the end. 399 cros_machine = machine 400 if machine.find('rack') > 0 and machine.find('row') > 0: 401 if machine.find('.cros') == -1: 402 cros_machine = cros_machine + '.cros' 403 404 self.machines = [m for m in self.machines if m != cros_machine and 405 m != machine] 406 407 def CheckMachineLocks(self, machine_states, cmd): 408 """Check that every machine in requested list is in the proper state. 409 410 If the cmd is 'unlock' verify that every machine is locked by requestor. 411 If the cmd is 'lock' verify that every machine is currently unlocked. 412 413 Args: 414 machine_states: A dictionary of the current state of every machine in 415 the current AFELockManager's list of machines. Normally obtained by 416 calling AFELockManager::GetMachineStates. 417 cmd: 'lock' or 'unlock'. The user-requested action for the machines. 418 419 Raises: 420 DontOwnLock: The lock on a requested machine is owned by someone else. 421 """ 422 for k, state in machine_states.iteritems(): 423 if cmd == 'unlock': 424 if not state['locked']: 425 self.logger.LogWarning('Attempt to unlock already unlocked machine ' 426 '(%s).' % k) 427 self._InternalRemoveMachine(k) 428 429 if state['locked'] and state['locked_by'] != self.user: 430 raise DontOwnLock('Attempt to unlock machine (%s) locked by someone ' 431 'else (%s).' % (k, state['locked_by'])) 432 elif cmd == 'lock': 433 if state['locked']: 434 self.logger.LogWarning('Attempt to lock already locked machine (%s)' % k) 435 self._InternalRemoveMachine(k) 436 437 def HasAFEServer(self, local): 438 """Verifies that the AFELockManager has appropriate AFE server. 439 440 Args: 441 local: Boolean indicating whether we are checking for the local server 442 (True) or for the global server (False). 443 444 Returns: 445 A boolean indicating if the AFELockManager has the requested AFE server. 446 """ 447 if local: 448 return self.local_afe is not None 449 else: 450 return self.afe is not None 451 452 def GetMachineStates(self, cmd=''): 453 """Gets the current state of all the requested machines. 454 455 Gets the current state of all the requested machines, both from the HW lab 456 sever and from the local server. Stores the data in a dictionary keyed 457 by machine name. 458 459 Args: 460 cmd: The command for which we are getting the machine states. This is 461 important because if one of the requested machines is missing we raise 462 an exception, unless the requested command is 'add'. 463 464 Returns: 465 A dictionary of machine states for all the machines in the AFELockManager 466 object. 467 468 Raises: 469 NoAFEServer: Cannot find the HW Lab or local AFE server. 470 AFEAccessError: An error occurred when querying the server about a 471 machine. 472 """ 473 if not self.HasAFEServer(False): 474 raise NoAFEServer('Error: Cannot connect to main AFE server.') 475 476 if self.local and not self.HasAFEServer(True): 477 raise NoAFEServer('Error: Cannot connect to local AFE server.') 478 479 machines = {} 480 for m in self.machines: 481 host_info = None 482 if m in self.toolchain_lab_machines: 483 mod_host = m.split('.')[0] 484 host_info = self.afe.get_hosts(hostname=mod_host) 485 if not host_info: 486 raise AFEAccessError('Unable to get information about %s from main' 487 ' autotest server.' % m) 488 else: 489 host_info = self.local_afe.get_hosts(hostname=m) 490 if not host_info and cmd != 'add': 491 raise AFEAccessError('Unable to get information about %s from ' 492 'local autotest server.' % m) 493 if host_info: 494 host_info = host_info[0] 495 name = host_info.hostname 496 values = {} 497 values['board'] = host_info.platform if host_info.platform else '??' 498 values['locked'] = host_info.locked 499 if host_info.locked: 500 values['locked_by'] = host_info.locked_by 501 values['lock_time'] = host_info.lock_time 502 else: 503 values['locked_by'] = '' 504 values['lock_time'] = '' 505 machines[name] = values 506 else: 507 machines[m] = {} 508 return machines 509 510 511def Main(argv): 512 """ 513 Parse the options, initialize lock manager and dispatch proper method. 514 515 Args: 516 argv: The options with which this script was invoked. 517 518 Returns: 519 0 unless an exception is raised. 520 """ 521 parser = argparse.ArgumentParser() 522 523 parser.add_argument('--list', dest='cmd', action='store_const', 524 const='status', 525 help='List current status of all known machines.') 526 parser.add_argument('--lock', dest='cmd', action='store_const', 527 const='lock', help='Lock given machine(s).') 528 parser.add_argument('--unlock', dest='cmd', action='store_const', 529 const='unlock', help='Unlock given machine(s).') 530 parser.add_argument('--status', dest='cmd', action='store_const', 531 const='status', 532 help='List current status of given machine(s).') 533 parser.add_argument('--add_machine', dest='cmd', action='store_const', 534 const='add', 535 help='Add machine to local machine server.') 536 parser.add_argument('--remove_machine', dest='cmd', 537 action='store_const', const='remove', 538 help='Remove machine from the local machine server.') 539 parser.add_argument('--nolocal', dest='local', 540 action='store_false', default=True, 541 help='Do not try to use local machine server.') 542 parser.add_argument('--remote', dest='remote', 543 help='machines on which to operate') 544 parser.add_argument('--chromeos_root', dest='chromeos_root', required=True, 545 help='ChromeOS root to use for autotest scripts.') 546 parser.add_argument('--local_server', dest='local_server', default=None, 547 help='Alternate local autotest server to use.') 548 parser.add_argument('--force', dest='force', action='store_true', 549 default=False, 550 help='Force lock/unlock of machines, even if not' 551 ' current lock owner.') 552 553 options = parser.parse_args(argv) 554 555 if not options.remote and options.cmd != 'status': 556 parser.error('No machines specified for operation.') 557 558 if not os.path.isdir(options.chromeos_root): 559 parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root) 560 561 if not options.cmd: 562 parser.error('No operation selected (--list, --status, --lock, --unlock,' 563 ' --add_machine, --remove_machine).') 564 565 machine_list = [] 566 if options.remote: 567 machine_list = options.remote.split() 568 569 lock_manager = AFELockManager(machine_list, options.force, 570 options.chromeos_root, options.local_server, 571 options.local) 572 573 machine_states = lock_manager.GetMachineStates(cmd=options.cmd) 574 cmd = options.cmd 575 576 if cmd == 'status': 577 lock_manager.ListMachineStates(machine_states) 578 579 elif cmd == 'lock': 580 if not lock_manager.force: 581 lock_manager.CheckMachineLocks(machine_states, cmd) 582 lock_manager.UpdateMachines(True) 583 584 elif cmd == 'unlock': 585 if not lock_manager.force: 586 lock_manager.CheckMachineLocks(machine_states, cmd) 587 lock_manager.UpdateMachines(False) 588 589 elif cmd == 'add': 590 lock_manager.AddMachinesToLocalServer() 591 592 elif cmd == 'remove': 593 lock_manager.RemoveMachinesFromLocalServer() 594 595 return 0 596 597 598if __name__ == '__main__': 599 sys.exit(Main(sys.argv[1:])) 600