afe_lock_machine.py revision d1172b4df650c89b6fef7719db8225c31a4eba0e
1#!/usr/bin/python 2# 3# Copyright 2015 Google INc. All Rights Reserved. 4 5import argparse 6import getpass 7import os 8import sys 9import traceback 10 11from utils import logger 12from utils import machines 13from utils import misc 14 15 16class AFELockException(Exception): 17 """Base class for exceptions in this module.""" 18 19 20class MachineNotPingable(AFELockException): 21 """Raised when machine does not respond to ping.""" 22 23 24class MissingHostInfo(AFELockException): 25 """Raised when cannot find info about machine on machine servers.""" 26 27 28class UpdateNonLocalMachine(AFELockException): 29 """Raised when user requests to add/remove a ChromeOS HW Lab machine..""" 30 31 32class DuplicateAdd(AFELockException): 33 """Raised when user requests to add a machine that's already on the server.""" 34 35 36class UpdateServerError(AFELockException): 37 """Raised when attempt to add/remove a machine from local server fails.""" 38 39 40class LockingError(AFELockException): 41 """Raised when server fails to lock/unlock machine as requested.""" 42 43 44class DuplicateLock(AFELockException): 45 """Raised when user attempts to lock an already locked machine.""" 46 47 48class DuplicateUnlock(AFELockException): 49 """Raised when user attempts to unlock an already unlocked machine.""" 50 51 52class DontOwnLock(AFELockException): 53 """Raised when user attmepts to unlock machine locked by someone else.""" 54 # This should not be raised if the user specified '--force' 55 56 57class NoAFEServer(AFELockException): 58 """Raised when cannot find/access the autotest server.""" 59 60 61class AFEAccessError(AFELockException): 62 """Raised when cannot get information about lab machine from lab server.""" 63 64 65class AFELockManager(object): 66 """Class for locking/unlocking machines vie Autotest Front End servers. 67 68 This class contains methods for checking the locked status of machines 69 on both the ChromeOS HW Lab AFE server and a local AFE server. It also 70 has methods for adding/removing machines from the local server, and for 71 changing the lock status of machines on either server. For the ChromeOS 72 HW Lab, it only allows access to the toolchain team lab machines, as 73 defined in toolchain-utils/crosperf/default_remotes. By default it will 74 look for a local server on chrotomation2.mtv.corp.google.com, but an 75 alternative local AFE server can be supplied, if desired. 76 77 !!!IMPORTANT NOTE!!! The AFE server can only be called from the main 78 thread/process of a program. If you launch threads and try to call it 79 from a thread, you will get an error. This has to do with restrictions 80 in the Python virtual machine (and signal handling) and cannot be changed. 81 """ 82 83 LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com' 84 85 def __init__(self, remotes, force_option, chromeos_root, local_server, 86 local=True, log=None): 87 """Initializes an AFELockManager object. 88 89 Args: 90 remotes: A list of machine names or ip addresses to be managed. Names 91 and ip addresses should be represented as strings. If the list is empty, 92 the lock manager will get all known machines. 93 force_option: A Boolean indicating whether or not to force an unlock of 94 a machine that was locked by someone else. 95 chromeos_root: The ChromeOS chroot to use for the autotest scripts. 96 local_server: A string containing the name or ip address of the machine 97 that is running an AFE server, which is to be used for managing 98 machines that are not in the ChromeOS HW lab. 99 local: A Boolean indicating whether or not to use/allow a local AFE 100 server to be used (see local_server argument). 101 log: If not None, this is the logger object to be used for writing out 102 informational output messages. It is expected to be an instance of 103 Logger class from utils/logger.py. 104 """ 105 self.chromeos_root = chromeos_root 106 self.user = getpass.getuser() 107 self.logger = log or logger.GetLogger() 108 autotest_path = os.path.join(chromeos_root, 109 'src/third_party/autotest/files') 110 111 sys.path.append(chromeos_root) 112 sys.path.append(autotest_path) 113 sys.path.append(os.path.join(autotest_path, 'server', 'cros')) 114 115 # We have to wait to do these imports until the paths above have 116 # been fixed. 117 from client import setup_modules 118 setup_modules.setup(base_path=autotest_path, 119 root_module_name='autotest_lib') 120 121 from dynamic_suite import frontend_wrappers 122 123 self.afe = frontend_wrappers.RetryingAFE(timeout_min=30, 124 delay_sec=10, 125 debug=False, 126 server='cautotest') 127 if not local: 128 self.local_afe = None 129 else: 130 dargs = {} 131 dargs['server'] = local_server or AFELockManager.LOCAL_SERVER 132 # Make sure local server is pingable. 133 error_msg = ('Local autotest server machine %s not responding to ping.' 134 % dargs['server']) 135 self.CheckMachine(dargs['server'], error_msg) 136 self.local_afe = frontend_wrappers.RetryingAFE(timeout_min=30, 137 delay_sec=10, 138 debug=False, 139 **dargs) 140 self.local = local 141 self.machines = list(set(remotes)) or [] 142 self.force = force_option 143 self.toolchain_lab_machines = self.GetAllToolchainLabMachines() 144 if not self.machines: 145 self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines() 146 147 def CheckMachine(self, machine, error_msg): 148 """Verifies that machine is responding to ping. 149 150 Args: 151 machine: String containing the name or ip address of machine to check. 152 error_msg: Message to print if ping fails. 153 154 Raises: 155 MachineNotPingable: If machine is not responding to 'ping' 156 """ 157 if not machines.MachineIsPingable(machine, logging_level='none'): 158 raise MachineNotPingable(error_msg) 159 160 def MachineIsKnown(self, machine): 161 """Checks to see if either AFE server knows the given machine. 162 163 Args: 164 machine: String containing name or ip address of machine to check. 165 166 Returns: 167 Boolean indicating if the machine is in the list of known machines for 168 either AFE server. 169 """ 170 if machine in self.toolchain_lab_machines: 171 return True 172 elif self.local_afe and machine in self.GetAllNonlabMachines(): 173 return True 174 175 return False 176 177 def GetAllToolchainLabMachines(self): 178 """Gets a list of all the toolchain machines in the ChromeOS HW lab. 179 180 Returns: 181 A list of names of the toolchain machines in the ChromeOS HW lab. 182 """ 183 machines_file = os.path.join(os.path.dirname(__file__), 184 'crosperf', 'default_remotes') 185 machine_list = [] 186 with open(machines_file, 'r') as input_file: 187 lines = input_file.readlines() 188 for line in lines: 189 board, remotes = line.split(':') 190 remotes = remotes.strip() 191 for r in remotes.split(): 192 machine_list.append(r.strip()) 193 return machine_list 194 195 def GetAllNonlabMachines(self): 196 """Gets a list of all known machines on the local AFE server. 197 198 Returns: 199 A list of the names of the machines on the local AFE server. 200 """ 201 non_lab_machines = [] 202 if self.local_afe: 203 non_lab_machines = self.local_afe.get_hostnames() 204 return non_lab_machines 205 206 def PrintStatusHeader(self, is_lab_machine): 207 """Prints the status header lines for machines. 208 209 Args: Boolean indicating whether to print HW Lab header or local 210 machine header (different spacing). 211 """ 212 if is_lab_machine: 213 print '\nMachine (Board)\t\t\t\tStatus' 214 print '---------------\t\t\t\t------\n' 215 else: 216 print '\nMachine (Board)\t\tStatus' 217 print '---------------\t\t------\n' 218 219 def RemoveLocalMachine(self, m): 220 """Removes a machine from the local AFE server. 221 222 Args: 223 m: The machine to remove. 224 225 Raises: 226 MissingHostInfo: Can't find machine to be removed. 227 """ 228 if self.local_afe: 229 host_info = self.local_afe.get_hosts(hostname=m) 230 if host_info: 231 host_info = host_info[0] 232 host_info.delete() 233 else: 234 raise MissingHostInfo('Cannot find/delete machine %s.' % m) 235 236 def AddLocalMachine(self, m): 237 """Adds a machine to the local AFE server. 238 239 Args: 240 m: The machine to be added. 241 """ 242 if self.local_afe: 243 error_msg = 'Machine %s is not responding to ping.' % m 244 self.CheckMachine(m, error_msg) 245 host = self.local_afe.create_host(m) 246 247 def AddMachinesToLocalServer(self): 248 """Adds one or more machines to the local AFE server. 249 250 Verify that the requested machines are legal to add to the local server, 251 i.e. that they are not ChromeOS HW lab machines, and they are not already 252 on the local server. Call AddLocalMachine for each valid machine. 253 254 Raises: 255 DuplicateAdd: Attempt to add a machine that is already on the server. 256 UpdateNonLocalMachine: Attempt to add a ChromeOS HW lab machine. 257 UpdateServerError: Something went wrong while attempting to add a 258 machine. 259 """ 260 for m in self.machines: 261 if m in self.toolchain_lab_machines: 262 raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS HW ' 263 'Lab. Cannot add it to local server.' % m) 264 host_info = self.local_afe.get_hosts(hostname=m) 265 if host_info: 266 raise DuplicateAdd('Machine %s is already on the local server.' % m) 267 try: 268 self.AddLocalMachine(m) 269 self.logger.LogOutput('Successfully added %s to local server.' % m) 270 except Exception as e: 271 traceback.print_exc() 272 raise UpdateServerError('Error occurred while attempting to add %s. %s' 273 % (m, str(e))) 274 275 def RemoveMachinesFromLocalServer(self): 276 """Removes one or more machines from the local AFE server. 277 278 Verify that the requested machines are legal to remove from the local 279 server, i.e. that they are not ChromeOS HW lab machines. Call 280 RemoveLocalMachine for each valid machine. 281 282 Raises: 283 UpdateServerError: Something went wrong while attempting to remove a 284 machine. 285 """ 286 for m in self.machines: 287 if m in self.toolchain_lab_machines: 288 raise UpdateNonLocalMachine('Machine %s is in the ChromeOS HW Lab. ' 289 'This script cannot remove lab machines.' 290 % m) 291 try: 292 self.RemoveLocalMachine(m) 293 self.logger.LogOutput('Successfully removed %s from local server.' % m) 294 except Exception as e: 295 traceback.print_exc() 296 raise UpdateServerError('Error occurred while attempting to remove %s ' 297 '(%s).' % (m, str(e))) 298 299 def ListMachineStates(self, machine_states): 300 """Gets and prints the current status for a list of machines. 301 302 Prints out the current status for all of the machines in the current 303 AFELockManager's list of machines (set when the object is initialized). 304 305 Args: 306 machine_states: A dictionary of the current state of every machine in 307 the current AFELockManager's list of machines. Normally obtained by 308 calling AFELockManager::GetMachineStates. 309 """ 310 local_machines = [] 311 printed_hdr = False 312 for m in machine_states: 313 cros_name = m + '.cros' 314 if (m in self.toolchain_lab_machines or 315 cros_name in self.toolchain_lab_machines): 316 if not printed_hdr: 317 self.PrintStatusHeader(True) 318 printed_hdr = True 319 state = machine_states[m] 320 if state['locked']: 321 print ('%s (%s)\tlocked by %s since %s' % 322 (m, state['board'], state['locked_by'], state['lock_time'])) 323 else: 324 print '%s (%s)\tunlocked' % (m, state['board']) 325 else: 326 local_machines.append(m) 327 328 if local_machines: 329 self.PrintStatusHeader(False) 330 for m in local_machines: 331 state = machine_states[m] 332 if state['locked']: 333 print ('%s (%s)\tlocked by %s since %s' % 334 (m, state['board'], state['locked_by'], state['lock_time'])) 335 else: 336 print '%s (%s)\tunlocked' % (m, state['board']) 337 338 339 def UpdateLockInAFE(self, should_lock_machine, machine): 340 """Calls an AFE server to lock/unlock a machine. 341 342 Args: 343 should_lock_machine: Boolean indicating whether to lock the machine (True) 344 or unlock the machine (False). 345 machine: The machine to update. 346 347 Raises: 348 LockingError: An error occurred while attempting to update the machine 349 state. 350 """ 351 action = 'lock' 352 if not should_lock_machine: 353 action = 'unlock' 354 kwargs = {'locked': should_lock_machine} 355 356 if machine in self.toolchain_lab_machines: 357 m = machine.split('.')[0] 358 kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user 359 afe_server = self.afe 360 else: 361 m = machine 362 afe_server = self.local_afe 363 364 try: 365 afe_server.run('modify_hosts', 366 host_filter_data={'hostname__in': [m]}, 367 update_data=kwargs) 368 except Exception as e: 369 traceback.print_exc() 370 raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e))) 371 372 def UpdateMachines(self, lock_machines): 373 """Sets the locked state of the machines to the requested value. 374 375 The machines updated are the ones in self.machines (specified when the 376 class object was intialized). 377 378 Args: 379 lock_machines: Boolean indicating whether to lock the machines (True) or 380 unlock the machines (False). 381 """ 382 for m in self.machines: 383 self.UpdateLockInAFE(lock_machines, m) 384 385 # Since we returned from self.UpdateLockInAFE we assume the request 386 # succeeded. 387 if lock_machines: 388 self.logger.LogOutput('Locked machine(s) %s.' % m) 389 else: 390 self.logger.LogOutput('Unlocked machine(s) %s.' % m) 391 392 def CheckMachineLocks(self, machine_states, cmd): 393 """Check that every machine in requested list is in the proper state. 394 395 If the cmd is 'unlock' verify that every machine is locked by requestor. 396 If the cmd is 'lock' verify that every machine is currently unlocked. 397 398 Args: 399 machine_states: A dictionary of the current state of every machine in 400 the current AFELockManager's list of machines. Normally obtained by 401 calling AFELockManager::GetMachineStates. 402 cmd: 'lock' or 'unlock'. The user-requested action for the machines. 403 404 Raises: 405 DuplicateLock: A machine requested to be locked is already locked. 406 DuplicateUnlock: A machine requested to be unlocked is already unlocked. 407 DontOwnLock: The lock on a requested machine is owned by someone else. 408 """ 409 for k, state in machine_states.iteritems(): 410 if cmd == 'unlock': 411 if not state['locked']: 412 raise DuplicateUnlock('Attempt to unlock already unlocked machine ' 413 '(%s).' % k) 414 415 if state['locked_by'] != self.user: 416 raise DontOwnLock('Attempt to unlock machine (%s) locked by someone ' 417 'else (%s).' % (k, state['locked_by'])) 418 elif cmd == 'lock': 419 if state['locked']: 420 raise DuplicateLock('Attempt to lock already locked machine (%s)' % k) 421 422 def HasAFEServer(self, local): 423 """Verifies that the AFELockManager has appropriate AFE server. 424 425 Args: 426 local: Boolean indicating whether we are checking for the local server 427 (True) or for the global server (False). 428 429 Returns: 430 A boolean indicating if the AFELockManager has the requested AFE server. 431 """ 432 if local: 433 return self.local_afe is not None 434 else: 435 return self.afe is not None 436 437 def GetMachineStates(self, cmd=''): 438 """Gets the current state of all the requested machines. 439 440 Gets the current state of all the requested machines, both from the HW lab 441 sever and from the local server. Stores the data in a dictionary keyed 442 by machine name. 443 444 Args: 445 cmd: The command for which we are getting the machine states. This is 446 important because if one of the requested machines is missing we raise 447 an exception, unless the requested command is 'add'. 448 449 Returns: 450 A dictionary of machine states for all the machines in the AFELockManager 451 object. 452 453 Raises: 454 NoAFEServer: Cannot find the HW Lab or local AFE server. 455 AFEAccessError: An error occurred when querying the server about a 456 machine. 457 """ 458 if not self.HasAFEServer(False): 459 raise NoAFEServer('Error: Cannot connect to main AFE server.') 460 461 if self.local and not self.HasAFEServer(True): 462 raise NoAFEServer('Error: Cannot connect to local AFE server.') 463 464 machines = {} 465 for m in self.machines: 466 host_info = None 467 if m in self.toolchain_lab_machines: 468 mod_host = m.split('.')[0] 469 host_info = self.afe.get_hosts(hostname=mod_host) 470 if not host_info: 471 raise AFEAccessError('Unable to get information about %s from main' 472 ' autotest server.' % m) 473 else: 474 host_info = self.local_afe.get_hosts(hostname=m) 475 if not host_info and cmd != 'add': 476 raise AFEAccessError('Unable to get information about %s from ' 477 'local autotest server.' % m) 478 if host_info: 479 host_info = host_info[0] 480 name = host_info.hostname 481 values = {} 482 values['board'] = host_info.platform if host_info.platform else '??' 483 values['locked'] = host_info.locked 484 if host_info.locked: 485 values['locked_by'] = host_info.locked_by 486 values['lock_time'] = host_info.lock_time 487 else: 488 values['locked_by'] = '' 489 values['lock_time'] = '' 490 machines[name] = values 491 else: 492 machines[m] = {} 493 return machines 494 495 496def Main(argv): 497 """ 498 Parse the options, initialize lock manager and dispatch proper method. 499 500 Args: 501 argv: The options with which this script was invoked. 502 503 Returns: 504 0 unless an exception is raised. 505 """ 506 parser = argparse.ArgumentParser() 507 508 parser.add_argument('--list', dest='cmd', action='store_const', 509 const='status', 510 help='List current status of all known machines.') 511 parser.add_argument('--lock', dest='cmd', action='store_const', 512 const='lock', help='Lock given machine(s).') 513 parser.add_argument('--unlock', dest='cmd', action='store_const', 514 const='unlock', help='Unlock given machine(s).') 515 parser.add_argument('--status', dest='cmd', action='store_const', 516 const='status', 517 help='List current status of given machine(s).') 518 parser.add_argument('--add_machine', dest='cmd', action='store_const', 519 const='add', 520 help='Add machine to local machine server.') 521 parser.add_argument('--remove_machine', dest='cmd', 522 action='store_const', const='remove', 523 help='Remove machine from the local machine server.') 524 parser.add_argument('--nolocal', dest='local', 525 action='store_false', default=True, 526 help='Do not try to use local machine server.') 527 parser.add_argument('--remote', dest='remote', 528 help='machines on which to operate') 529 parser.add_argument('--chromeos_root', dest='chromeos_root', required=True, 530 help='ChromeOS root to use for autotest scripts.') 531 parser.add_argument('--local_server', dest='local_server', default=None, 532 help='Alternate local autotest server to use.') 533 parser.add_argument('--force', dest='force', action='store_true', 534 default=False, 535 help='Force lock/unlock of machines, even if not' 536 ' current lock owner.') 537 538 options = parser.parse_args(argv) 539 540 if not options.remote and options.cmd != 'status': 541 parser.error('No machines specified for operation.') 542 543 if not os.path.isdir(options.chromeos_root): 544 parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root) 545 546 if not options.cmd: 547 parser.error('No operation selected (--list, --status, --lock, --unlock,' 548 ' --add_machine, --remove_machine).') 549 550 machine_list = [] 551 if options.remote: 552 machine_list = options.remote.split() 553 554 lock_manager = AFELockManager(machine_list, options.force, 555 options.chromeos_root, options.local_server, 556 options.local) 557 558 machine_states = lock_manager.GetMachineStates(cmd=options.cmd) 559 cmd = options.cmd 560 561 if cmd == 'status': 562 lock_manager.ListMachineStates(machine_states) 563 564 elif cmd == 'lock': 565 if not lock_manager.force: 566 lock_manager.CheckMachineLocks(machine_states, cmd) 567 lock_manager.UpdateMachines(True) 568 569 elif cmd == 'unlock': 570 if not lock_manager.force: 571 lock_manager.CheckMachineLocks(machine_states, cmd) 572 lock_manager.UpdateMachines(False) 573 574 elif cmd == 'add': 575 lock_manager.AddMachinesToLocalServer() 576 577 elif cmd == 'remove': 578 lock_manager.RemoveMachinesFromLocalServer() 579 580 return 0 581 582 583if __name__ == '__main__': 584 sys.exit(Main(sys.argv[1:])) 585