afe_lock_machine.py revision e5bc63bbed4e001b080c4ce0b18c5c78900d4786
#!/usr/bin/python
#
# Copyright 2015 Google INc.  All Rights Reserved.
"""Lock, unlock, list, add and remove machines via Autotest Front End servers.

This script talks to two AFE servers: the ChromeOS HW Lab server (for the
toolchain team's lab machines) and an optional local server (for everything
else).  See AFELockManager for details and Main for the command line options.
"""

import argparse
import getpass
import os
import sys
import traceback

from utils import logger
from utils import machines
from utils import misc


class AFELockException(Exception):
  """Base class for exceptions in this module."""


class MachineNotPingable(AFELockException):
  """Raised when machine does not respond to ping."""


class MissingHostInfo(AFELockException):
  """Raised when cannot find info about machine on machine servers."""


class UpdateNonLocalMachine(AFELockException):
  """Raised when user requests to add/remove a ChromeOS HW Lab machine."""


class DuplicateAdd(AFELockException):
  """Raised when user requests to add a machine that's already on the server."""


class UpdateServerError(AFELockException):
  """Raised when attempt to add/remove a machine from local server fails."""


class LockingError(AFELockException):
  """Raised when server fails to lock/unlock machine as requested."""


class DuplicateLock(AFELockException):
  """Raised when user attempts to lock an already locked machine."""


class DuplicateUnlock(AFELockException):
  """Raised when user attempts to unlock an already unlocked machine."""


class DontOwnLock(AFELockException):
  """Raised when user attempts to unlock machine locked by someone else."""
  # This should not be raised if the user specified '--force'


class NoAFEServer(AFELockException):
  """Raised when cannot find/access the autotest server."""


class AFEAccessError(AFELockException):
  """Raised when cannot get information about lab machine from lab server."""


class AFELockManager(object):
  """Class for locking/unlocking machines via Autotest Front End servers.

  This class contains methods for checking the locked status of machines
  on both the ChromeOS HW Lab AFE server and a local AFE server.  It also
  has methods for adding/removing machines from the local server, and for
  changing the lock status of machines on either server.  For the ChromeOS
  HW Lab, it only allows access to the toolchain team lab machines, as
  defined in toolchain-utils/crosperf/default_remotes.  By default it will
  look for a local server on chrotomation2.mtv.corp.google.com, but an
  alternative local AFE server can be supplied, if desired.

  !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
  thread/process of a program.  If you launch threads and try to call it
  from a thread, you will get an error.  This has to do with restrictions
  in the Python virtual machine (and signal handling) and cannot be changed.
  """

  # Host used for the local AFE server when the caller does not supply one.
  LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com'

  def __init__(self, remotes, force_option, chromeos_root, local_server,
               local=True, log=None):
    """Initializes an AFELockManager object.

    Args:
      remotes: A list of machine names or ip addresses to be managed.  Names
        and ip addresses should be represented as strings.  If the list is
        empty (or None), the lock manager will get all known machines.
      force_option: A Boolean indicating whether or not to force an unlock of
        a machine that was locked by someone else.
      chromeos_root: The ChromeOS chroot to use for the autotest scripts.
      local_server: A string containing the name or ip address of the machine
        that is running an AFE server, which is to be used for managing
        machines that are not in the ChromeOS HW lab.
      local: A Boolean indicating whether or not to use/allow a local AFE
        server to be used (see local_server argument).
      log: If not None, this is the logger object to be used for writing out
        informational output messages.  It is expected to be an instance of
        Logger class from utils/logger.py.

    Raises:
      MachineNotPingable: The local AFE server machine does not answer ping.
    """
    self.chromeos_root = chromeos_root
    self.user = getpass.getuser()
    self.logger = log or logger.GetLogger()
    autotest_path = os.path.join(chromeos_root,
                                 'src/third_party/autotest/files')

    # Extend the module search path once per directory, so that building
    # several managers in one process does not pile up duplicate entries.
    for extra_path in (autotest_path,
                       os.path.join(autotest_path, 'server', 'cros')):
      if extra_path not in sys.path:
        sys.path.append(extra_path)

    # We have to wait to do these imports until the paths above have
    # been fixed.
    from client import setup_modules
    setup_modules.setup(base_path=autotest_path,
                        root_module_name='autotest_lib')

    from dynamic_suite import frontend_wrappers

    self.afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                             delay_sec=10,
                                             debug=False)
    if not local:
      self.local_afe = None
    else:
      server = local_server or AFELockManager.LOCAL_SERVER
      # Make sure local server is pingable.
      error_msg = ('Local autotest server machine %s not responding to ping.'
                   % server)
      self.CheckMachine(server, error_msg)
      self.local_afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                                     delay_sec=10,
                                                     debug=False,
                                                     server=server)
    self.local = local
    # De-duplicate the requested machines; tolerate remotes=None.
    self.machines = list(set(remotes or []))
    self.force = force_option
    self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
    if not self.machines:
      # Nothing requested explicitly: operate on every known machine.
      self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines()

  def CheckMachine(self, machine, error_msg):
    """Verifies that machine is responding to ping.

    Args:
      machine: String containing the name or ip address of machine to check.
      error_msg: Message to print if ping fails.

    Raises:
      MachineNotPingable:  If machine is not responding to 'ping'
    """
    if not machines.MachineIsPingable(machine, logging_level='none'):
      raise MachineNotPingable(error_msg)

  def MachineIsKnown(self, machine):
    """Checks to see if either AFE server knows the given machine.

    Args:
      machine: String containing name or ip address of machine to check.

    Returns:
      Boolean indicating if the machine is in the list of known machines for
        either AFE server.
    """
    if machine in self.toolchain_lab_machines:
      return True
    # Only query the local server when one is actually in use.
    if self.local_afe and machine in self.GetAllNonlabMachines():
      return True
    return False

  def GetAllToolchainLabMachines(self):
    """Gets a list of all the toolchain machines in the ChromeOS HW lab.

    Reads crosperf/default_remotes, whose lines have the form
    '<board>: <machine> <machine> ...'.  The file is looked up relative to
    the current working directory first; if it is not there, the directory
    containing this script is tried instead.  Blank lines are ignored.

    Returns:
      A list of names of the toolchain machines in the ChromeOS HW lab.

    Raises:
      IOError: The default_remotes file cannot be opened.
      ValueError: A non-blank line contains no ':' separator.
    """
    relative_path = os.path.join('crosperf', 'default_remotes')
    machines_file = os.path.join(os.getcwd(), relative_path)
    if not os.path.isfile(machines_file):
      # Fall back to the copy that sits next to this script, so the tool
      # also works when it is not launched from the toolchain-utils root.
      script_dir = os.path.dirname(os.path.abspath(__file__))
      fallback_file = os.path.join(script_dir, relative_path)
      if os.path.isfile(fallback_file):
        machines_file = fallback_file

    machine_list = []
    with open(machines_file, 'r') as input_file:
      for line in input_file:
        if not line.strip():
          continue
        # Split on the first ':' only; the board name itself is not needed.
        _, remotes = line.split(':', 1)
        machine_list.extend(remotes.split())
    return machine_list

  def GetAllNonlabMachines(self):
    """Gets a list of all known machines on the local AFE server.

    Returns:
      A list of the names of the machines on the local AFE server.  The list
      is empty when no local AFE server is in use.
    """
    if not self.local_afe:
      return []
    return self.local_afe.get_hostnames()

  def PrintStatusHeader(self, is_lab_machine):
    """Prints the status header lines for machines.

    Args:
      is_lab_machine: Boolean indicating whether to print HW Lab header or
        local machine header (different spacing).
    """
    # Single-argument parenthesized print behaves the same under the
    # Python 2 print statement and the Python 3 print function.
    if is_lab_machine:
      print('\nMachine (Board)\t\t\t\tStatus')
      print('---------------\t\t\t\t------\n')
    else:
      print('\nMachine (Board)\t\tStatus')
      print('---------------\t\t------\n')

  def RemoveLocalMachine(self, m):
    """Removes a machine from the local AFE server.

    Does nothing when no local AFE server is in use.

    Args:
      m: The machine to remove.

    Raises:
      MissingHostInfo:  Can't find machine to be removed.
    """
    if not self.local_afe:
      return
    host_info = self.local_afe.get_hosts(hostname=m)
    if not host_info:
      raise MissingHostInfo('Cannot find/delete machine %s.' % m)
    host_info[0].delete()

  def AddLocalMachine(self, m):
    """Adds a machine to the local AFE server.

    Does nothing when no local AFE server is in use.

    Args:
      m: The machine to be added.

    Raises:
      MachineNotPingable: The machine does not respond to ping.
    """
    if not self.local_afe:
      return
    error_msg = 'Machine %s is not responding to ping.' % m
    self.CheckMachine(m, error_msg)
    self.local_afe.create_host(m)

  def AddMachinesToLocalServer(self):
    """Adds one or more machines to the local AFE server.

    Verify that the requested machines are legal to add to the local server,
    i.e. that they are not ChromeOS HW lab machines, and they are not already
    on the local server.  Call AddLocalMachine for each valid machine.

    Raises:
      DuplicateAdd:  Attempt to add a machine that is already on the server.
      UpdateNonLocalMachine:  Attempt to add a ChromeOS HW lab machine.
      NoAFEServer:  No local AFE server is in use.
      UpdateServerError:  Something went wrong while attempting to add a
        machine.
    """
    for m in self.machines:
      if m in self.toolchain_lab_machines:
        raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS HW '
                                    'Lab.  Cannot add it to local server.' % m)
      # Without this check the lookup below would fail with an
      # AttributeError on None.
      if not self.HasAFEServer(True):
        raise NoAFEServer('Error: Cannot add %s; no local AFE server is in '
                          'use.' % m)
      host_info = self.local_afe.get_hosts(hostname=m)
      if host_info:
        raise DuplicateAdd('Machine %s is already on the local server.' % m)
      try:
        self.AddLocalMachine(m)
        self.logger.LogOutput('Successfully added %s to local server.' % m)
      except Exception as e:
        traceback.print_exc()
        raise UpdateServerError('Error occurred while attempting to add %s. %s'
                                % (m, str(e)))

  def RemoveMachinesFromLocalServer(self):
    """Removes one or more machines from the local AFE server.

    Verify that the requested machines are legal to remove from the local
    server, i.e. that they are not ChromeOS HW lab machines.  Call
    RemoveLocalMachine for each valid machine.

    Raises:
      UpdateNonLocalMachine:  Attempt to remove a ChromeOS HW lab machine.
      NoAFEServer:  No local AFE server is in use.
      UpdateServerError:  Something went wrong while attempting to remove a
        machine.
    """
    for m in self.machines:
      if m in self.toolchain_lab_machines:
        raise UpdateNonLocalMachine('Machine %s is in the ChromeOS HW Lab. '
                                    'This script cannot remove lab machines.'
                                    % m)
      # RemoveLocalMachine is a no-op without a local server; fail here
      # instead of reporting a removal that never happened.
      if not self.HasAFEServer(True):
        raise NoAFEServer('Error: Cannot remove %s; no local AFE server is in '
                          'use.' % m)
      try:
        self.RemoveLocalMachine(m)
        self.logger.LogOutput('Successfully removed %s from local server.' % m)
      except Exception as e:
        traceback.print_exc()
        raise UpdateServerError('Error occurred while attempting to remove %s '
                                '(%s).' % (m, str(e)))

  def _PrintMachineState(self, machine, state):
    """Prints the one-line lock status of a single machine.

    Args:
      machine: Name of the machine, as keyed in the machine states dictionary.
      state: The state dictionary for that machine, with keys 'locked',
        'board', 'locked_by' and 'lock_time'.
    """
    if state['locked']:
      print('%s (%s)\tlocked by %s since %s' %
            (machine, state['board'], state['locked_by'], state['lock_time']))
    else:
      print('%s (%s)\tunlocked' % (machine, state['board']))

  def ListMachineStates(self, machine_states):
    """Gets and prints the current status for a list of machines.

    Prints out the current status for all of the machines in the current
    AFELockManager's list of machines (set when the object is initialized).
    HW Lab machines are printed first, as they are encountered; local
    machines are printed afterwards under their own header.

    Args:
      machine_states: A dictionary of the current state of every machine in
        the current AFELockManager's list of machines.  Normally obtained by
        calling AFELockManager::GetMachineStates.
    """
    local_machines = []
    printed_hdr = False
    for m in machine_states:
      # The states dictionary is keyed by the name the AFE server reports,
      # which for lab machines lacks the '.cros' suffix used in
      # toolchain_lab_machines; so test both spellings.
      cros_name = m + '.cros'
      if (m in self.toolchain_lab_machines or
          cros_name in self.toolchain_lab_machines):
        if not printed_hdr:
          self.PrintStatusHeader(True)
          printed_hdr = True
        self._PrintMachineState(m, machine_states[m])
      else:
        local_machines.append(m)

    if local_machines:
      self.PrintStatusHeader(False)
      for m in local_machines:
        self._PrintMachineState(m, machine_states[m])

  def UpdateLockInAFE(self, should_lock_machine, machine):
    """Calls an AFE server to lock/unlock a machine.

    Args:
      should_lock_machine: Boolean indicating whether to lock the machine (True)
        or unlock the machine (False).
      machine: The machine to update.

    Raises:
      LockingError:  An error occurred while attempting to update the machine
        state.
    """
    action = 'lock' if should_lock_machine else 'unlock'
    kwargs = {'locked': should_lock_machine}

    if machine in self.toolchain_lab_machines:
      # The main AFE server is queried with the name up to the first '.'.
      m = machine.split('.')[0]
      kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user
      afe_server = self.afe
    else:
      m = machine
      afe_server = self.local_afe

    if afe_server is None:
      # Report the real cause rather than the AttributeError on None that
      # the call below would produce.
      raise LockingError('Unable to %s machine %s. No AFE server is available '
                         'for this machine.' % (action, m))

    try:
      afe_server.run('modify_hosts',
                     host_filter_data={'hostname__in': [m]},
                     update_data=kwargs)
    except Exception as e:
      traceback.print_exc()
      raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e)))

  def UpdateMachines(self, lock_machines):
    """Sets the locked state of the machines to the requested value.

    The machines updated are the ones in self.machines (specified when the
    class object was initialized).

    Args:
      lock_machines: Boolean indicating whether to lock the machines (True) or
        unlock the machines (False).

    Raises:
      LockingError:  Propagated from UpdateLockInAFE; machines after the
        failing one are not updated.
    """
    for m in self.machines:
      self.UpdateLockInAFE(lock_machines, m)

      # Since we returned from self.UpdateLockInAFE we assume the request
      # succeeded.
      if lock_machines:
        self.logger.LogOutput('Locked machine(s) %s.' % m)
      else:
        self.logger.LogOutput('Unlocked machine(s) %s.' % m)

  def CheckMachineLocks(self, machine_states, cmd):
    """Check that every machine in requested list is in the proper state.

    If the cmd is 'unlock' verify that every machine is locked by requestor.
    If the cmd is 'lock' verify that every machine is currently unlocked.
    Any other cmd is accepted without checks.

    Args:
      machine_states: A dictionary of the current state of every machine in
        the current AFELockManager's list of machines.  Normally obtained by
        calling AFELockManager::GetMachineStates.
      cmd:  'lock' or 'unlock'.  The user-requested action for the machines.

    Raises:
      DuplicateLock: A machine requested to be locked is already locked.
      DuplicateUnlock: A machine requested to be unlocked is already unlocked.
      DontOwnLock: The lock on a requested machine is owned by someone else.
    """
    # items() rather than iteritems(): works on both Python 2 and Python 3.
    for k, state in machine_states.items():
      if cmd == 'unlock':
        if not state['locked']:
          raise DuplicateUnlock('Attempt to unlock already unlocked machine '
                                '(%s).' % k)

        if state['locked_by'] != self.user:
          raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
                            'else (%s).' % (k, state['locked_by']))
      elif cmd == 'lock':
        if state['locked']:
          raise DuplicateLock('Attempt to lock already locked machine (%s)' % k)

  def HasAFEServer(self, local):
    """Verifies that the AFELockManager has appropriate AFE server.

    Args:
      local:  Boolean indicating whether we are checking for the local server
        (True) or for the global server (False).

    Returns:
      A boolean indicating if the AFELockManager has the requested AFE server.
    """
    server = self.local_afe if local else self.afe
    return server is not None

  def GetMachineStates(self, cmd=''):
    """Gets the current state of all the requested machines.

    Gets the current state of all the requested machines, both from the HW lab
    server and from the local server.  Stores the data in a dictionary keyed
    by machine name.

    Args:
      cmd: The command for which we are getting the machine states. This is
        important because if one of the requested machines is missing we raise
        an exception, unless the requested command is 'add'.

    Returns:
      A dictionary of machine states for all the machines in the AFELockManager
      object.  Machines found on a server are keyed by the hostname that the
      server reports and map to a dictionary with keys 'board', 'locked',
      'locked_by' and 'lock_time'.  A machine that is missing from the local
      server during an 'add' is keyed by the requested name and maps to an
      empty dictionary.

    Raises:
      NoAFEServer:  Cannot find the HW Lab or local AFE server.
      AFEAccessError:  An error occurred when querying the server about a
        machine.
    """
    if not self.HasAFEServer(False):
      raise NoAFEServer('Error: Cannot connect to main AFE server.')

    if self.local and not self.HasAFEServer(True):
      raise NoAFEServer('Error: Cannot connect to local AFE server.')

    # Named machine_states (not 'machines') so the imported utils.machines
    # module is not shadowed.
    machine_states = {}
    for m in self.machines:
      host_info = None
      if m in self.toolchain_lab_machines:
        # The main AFE server is queried with the name up to the first '.'.
        mod_host = m.split('.')[0]
        host_info = self.afe.get_hosts(hostname=mod_host)
        if not host_info:
          raise AFEAccessError('Unable to get information about %s from main'
                               ' autotest server.' % m)
      else:
        if not self.HasAFEServer(True):
          # A non-lab machine can only be looked up on the local server.
          raise NoAFEServer('Error: %s is not a ChromeOS HW Lab machine and '
                            'no local AFE server is in use.' % m)
        host_info = self.local_afe.get_hosts(hostname=m)
        if not host_info and cmd != 'add':
          raise AFEAccessError('Unable to get information about %s from '
                               'local autotest server.' % m)

      if not host_info:
        # Only reachable for 'add': the machine is not on the server yet.
        machine_states[m] = {}
        continue

      host_info = host_info[0]
      values = {}
      values['board'] = host_info.platform if host_info.platform else '??'
      values['locked'] = host_info.locked
      if host_info.locked:
        values['locked_by'] = host_info.locked_by
        values['lock_time'] = host_info.lock_time
      else:
        values['locked_by'] = ''
        values['lock_time'] = ''
      machine_states[host_info.hostname] = values
    return machine_states


def Main(argv):
  """Parse the options, initialize lock manager and dispatch proper method.

  Args:
    argv:  The options with which this script was invoked.

  Returns:
    0 unless an exception is raised.
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--list', dest='cmd', action='store_const',
                      const='status',
                      help='List current status of all known machines.')
  parser.add_argument('--lock', dest='cmd', action='store_const',
                      const='lock', help='Lock given machine(s).')
  parser.add_argument('--unlock', dest='cmd', action='store_const',
                      const='unlock', help='Unlock given machine(s).')
  parser.add_argument('--status', dest='cmd', action='store_const',
                      const='status',
                      help='List current status of given machine(s).')
  parser.add_argument('--add_machine', dest='cmd', action='store_const',
                      const='add',
                      help='Add machine to local machine server.')
  parser.add_argument('--remove_machine', dest='cmd',
                      action='store_const', const='remove',
                      help='Remove machine from the local machine server.')
  parser.add_argument('--nolocal', dest='local',
                      action='store_false', default=True,
                      help='Do not try to use local machine server.')
  parser.add_argument('--remote', dest='remote',
                      help='machines on which to operate')
  parser.add_argument('--chromeos_root', dest='chromeos_root', required=True,
                      help='ChromeOS root to use for autotest scripts.')
  parser.add_argument('--local_server', dest='local_server', default=None,
                      help='Alternate local autotest server to use.')
  parser.add_argument('--force', dest='force', action='store_true',
                      default=False,
                      help='Force lock/unlock of machines, even if not'
                      ' current lock owner.')

  options = parser.parse_args(argv)

  # Check for a missing operation first; otherwise a bare invocation is
  # reported as "No machines specified", which hides the real problem.
  if not options.cmd:
    parser.error('No operation selected (--list, --status, --lock, --unlock,'
                 ' --add_machine, --remove_machine).')

  if not options.remote and options.cmd != 'status':
    parser.error('No machines specified for operation.')

  if not os.path.isdir(options.chromeos_root):
    parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)

  machine_list = []
  if options.remote:
    machine_list = options.remote.split()

  lock_manager = AFELockManager(machine_list, options.force,
                                options.chromeos_root, options.local_server,
                                options.local)

  cmd = options.cmd
  machine_states = lock_manager.GetMachineStates(cmd=cmd)

  if cmd == 'status':
    lock_manager.ListMachineStates(machine_states)

  elif cmd == 'lock':
    # --force skips the ownership/state checks.
    if not lock_manager.force:
      lock_manager.CheckMachineLocks(machine_states, cmd)
    lock_manager.UpdateMachines(True)

  elif cmd == 'unlock':
    if not lock_manager.force:
      lock_manager.CheckMachineLocks(machine_states, cmd)
    lock_manager.UpdateMachines(False)

  elif cmd == 'add':
    lock_manager.AddMachinesToLocalServer()

  elif cmd == 'remove':
    lock_manager.RemoveMachinesFromLocalServer()

  return 0


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))