afe_lock_machine.py revision e5bc63bbed4e001b080c4ce0b18c5c78900d4786
1#!/usr/bin/python
2#
# Copyright 2015 Google Inc.  All Rights Reserved.
4
5import argparse
6import getpass
7import os
8import sys
9import traceback
10
11from utils import logger
12from utils import machines
13from utils import misc
14
15
class AFELockException(Exception):
  """Base class for exceptions in this module.

  Every other exception class defined in this file derives from this one, so
  catching AFELockException catches all of this module's own error types.
  """
18
19
class MachineNotPingable(AFELockException):
  """Raised when a machine does not respond to ping.

  Raised by AFELockManager.CheckMachine when machines.MachineIsPingable
  reports that the machine is not reachable.
  """
22
23
class MissingHostInfo(AFELockException):
  """Raised when no information about a machine can be found on a server.

  In this file it is raised by AFELockManager.RemoveLocalMachine when the
  local AFE server returns no host record for the machine to be removed.
  """
26
27
class UpdateNonLocalMachine(AFELockException):
  """Raised when the user requests to add/remove a ChromeOS HW Lab machine.

  AddMachinesToLocalServer and RemoveMachinesFromLocalServer raise this for
  any requested machine that appears in the toolchain lab machine list.
  """
30
31
class DuplicateAdd(AFELockException):
  """Raised when the user asks to add a machine already on the local server.

  Raised by AFELockManager.AddMachinesToLocalServer when the local AFE server
  already returns a host record for the machine.
  """
34
35
class UpdateServerError(AFELockException):
  """Raised when adding/removing a machine on the local server fails.

  AddMachinesToLocalServer and RemoveMachinesFromLocalServer wrap any
  exception raised by the underlying add/remove step in this class.
  """
38
39
class LockingError(AFELockException):
  """Raised when a server fails to lock/unlock a machine as requested.

  Raised by AFELockManager.UpdateLockInAFE when the 'modify_hosts' call on
  the selected AFE server raises any exception.
  """
42
43
class DuplicateLock(AFELockException):
  """Raised when the user attempts to lock an already locked machine.

  Raised by AFELockManager.CheckMachineLocks for the 'lock' command.
  """
46
47
class DuplicateUnlock(AFELockException):
  """Raised when the user attempts to unlock an already unlocked machine.

  Raised by AFELockManager.CheckMachineLocks for the 'unlock' command.
  """
50
51
class DontOwnLock(AFELockException):
  """Raised when the user attempts to unlock a machine locked by someone else.

  Raised by AFELockManager.CheckMachineLocks when the recorded lock owner
  differs from the user running the script.
  """
  # This should not be raised if the user specified '--force': Main only calls
  # CheckMachineLocks when the force option is off.
55
56
class NoAFEServer(AFELockException):
  """Raised when the autotest (AFE) server cannot be found/accessed.

  Raised by AFELockManager.GetMachineStates when HasAFEServer reports that
  the main server object, or the local one when local use is enabled, is
  missing.
  """
59
60
class AFEAccessError(AFELockException):
  """Raised when information about a machine cannot be obtained from a server.

  Raised by AFELockManager.GetMachineStates when a server returns no host
  record for a requested machine (for the local server, only when the command
  is not 'add').
  """
63
64
65class AFELockManager(object):
  """Class for locking/unlocking machines via Autotest Front End servers.
67
68  This class contains methods for checking the locked status of machines
69  on both the ChromeOS HW Lab AFE server and a local AFE server.  It also
70  has methods for adding/removing machines from the local server, and for
71  changing the lock status of machines on either server.  For the ChromeOS
72  HW Lab, it only allows access to the toolchain team lab machines, as
73  defined in toolchain-utils/crosperf/default_remotes.  By default it will
74  look for a local server on chrotomation2.mtv.corp.google.com, but an
75  alternative local AFE server can be supplied, if desired.
76
77  !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
78  thread/process of a program.  If you launch threads and try to call it
79  from a thread, you will get an error.  This has to do with restrictions
80  in the Python virtual machine (and signal handling) and cannot be changed.
81  """
82
83  LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com'
84
85  def __init__(self, remotes, force_option, chromeos_root, local_server,
86               local=True, log=None):
87    """Initializes an AFELockManager object.
88
89    Args:
90      remotes: A list of machine names or ip addresses to be managed.  Names
91        and ip addresses should be represented as strings.  If the list is empty,
92        the lock manager will get all known machines.
93      force_option:  A Boolean indicating whether or not to force an unlock of
94        a machine that was locked by someone else.
95      chromeos_root: The ChromeOS chroot to use for the autotest scripts.
96      local_server:  A string containing the name or ip address of the machine
97        that is running an AFE server, which is to be used for managing
98        machines that are not in the ChromeOS HW lab.
99      local: A Boolean indicating whether or not to use/allow a local AFE
100        server to be used (see local_server argument).
101      log: If not None, this is the logger object to be used for writing out
102        informational output messages.  It is expected to be an instance of
103        Logger class from utils/logger.py.
104    """
105    self.chromeos_root = chromeos_root
106    self.user = getpass.getuser()
107    self.logger = log or logger.GetLogger()
108    autotest_path = os.path.join(chromeos_root,
109                                 'src/third_party/autotest/files')
110
111    sys.path.append(autotest_path)
112    sys.path.append(os.path.join(autotest_path, 'server', 'cros'))
113
114    # We have to wait to do these imports until the paths above have
115    # been fixed.
116    from client import setup_modules
117    setup_modules.setup(base_path=autotest_path,
118                        root_module_name='autotest_lib')
119
120    from dynamic_suite import frontend_wrappers
121
122    self.afe = frontend_wrappers.RetryingAFE(timeout_min=30,
123                                             delay_sec=10,
124                                             debug=False)
125    if not local:
126      self.local_afe = None
127    else:
128      dargs = {}
129      dargs['server'] = local_server or AFELockManager.LOCAL_SERVER
130      # Make sure local server is pingable.
131      error_msg = ('Local autotest server machine %s not responding to ping.'
132                 % dargs['server'])
133      self.CheckMachine(dargs['server'], error_msg)
134      self.local_afe = frontend_wrappers.RetryingAFE(timeout_min=30,
135                                                     delay_sec=10,
136                                                     debug=False,
137                                                     **dargs)
138    self.local = local
139    self.machines = list(set(remotes)) or []
140    self.force = force_option
141    self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
142    if not self.machines:
143      self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines()
144
145  def CheckMachine(self, machine, error_msg):
146    """Verifies that machine is responding to ping.
147
148    Args:
149      machine: String containing the name or ip address of machine to check.
150      error_msg: Message to print if ping fails.
151
152    Raises:
153      MachineNotPingable:  If machine is not responding to 'ping'
154    """
155    if not machines.MachineIsPingable(machine, logging_level='none'):
156        raise MachineNotPingable(error_msg)
157
158  def MachineIsKnown(self, machine):
159    """Checks to see if either AFE server knows the given machine.
160
161    Args:
162      machine: String containing name or ip address of machine to check.
163
164    Returns:
165      Boolean indicating if the machine is in the list of known machines for
166        either AFE server.
167    """
168    if machine in self.toolchain_lab_machines:
169      return True
170    elif self.local_afe and machine in self.GetAllNonlabMachines():
171      return True
172
173    return False
174
175  def GetAllToolchainLabMachines(self):
176    """Gets a list of all the toolchain machines in the ChromeOS HW lab.
177
178    Returns:
179      A list of names of the toolchain machines in the ChromeOS HW lab.
180    """
181    machines_file = os.path.join(os.getcwd(), 'crosperf', 'default_remotes')
182    machine_list = []
183    with open(machines_file, 'r') as input_file:
184      lines = input_file.readlines()
185      for line in lines:
186        board, remotes = line.split(':')
187        remotes = remotes.strip()
188        for r in remotes.split():
189          machine_list.append(r.strip())
190    return machine_list
191
192  def GetAllNonlabMachines(self):
193    """Gets a list of all known machines on the local AFE server.
194
195    Returns:
196      A list of the names of the machines on the local AFE server.
197    """
198    non_lab_machines = []
199    if self.local_afe:
200      non_lab_machines = self.local_afe.get_hostnames()
201    return non_lab_machines
202
203  def PrintStatusHeader(self, is_lab_machine):
204    """Prints the status header lines for machines.
205
206    Args: Boolean indicating whether to print HW Lab header or local
207      machine header (different spacing).
208    """
209    if is_lab_machine:
210      print '\nMachine (Board)\t\t\t\tStatus'
211      print '---------------\t\t\t\t------\n'
212    else:
213      print '\nMachine (Board)\t\tStatus'
214      print '---------------\t\t------\n'
215
216  def RemoveLocalMachine(self, m):
217    """Removes a machine from the local AFE server.
218
219    Args:
220      m: The machine to remove.
221
222    Raises:
223      MissingHostInfo:  Can't find machine to be removed.
224    """
225    if self.local_afe:
226      host_info = self.local_afe.get_hosts(hostname=m)
227      if host_info:
228        host_info = host_info[0]
229        host_info.delete()
230      else:
231        raise MissingHostInfo('Cannot find/delete machine %s.' % m)
232
233  def AddLocalMachine(self, m):
234    """Adds a machine to the local AFE server.
235
236    Args:
237      m: The machine to be added.
238    """
239    if self.local_afe:
240      error_msg = 'Machine %s is not responding to ping.' % m
241      self.CheckMachine(m, error_msg)
242      host = self.local_afe.create_host(m)
243
244  def AddMachinesToLocalServer(self):
245    """Adds one or more machines to the local AFE server.
246
247    Verify that the requested machines are legal to add to the local server,
248    i.e. that they are not ChromeOS HW lab machines, and they are not already
249    on the local server.  Call AddLocalMachine for each valid machine.
250
251    Raises:
252      DuplicateAdd: Attempt to add a machine that is already on the server.
253      UpdateNonLocalMachine:  Attempt to add a ChromeOS HW lab machine.
254      UpdateServerError:  Something went wrong while attempting to add a
255        machine.
256    """
257    for m in self.machines:
258      if m in self.toolchain_lab_machines:
259        raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS HW '
260                                    'Lab.  Cannot add it to local server.' % m)
261      host_info = self.local_afe.get_hosts(hostname=m)
262      if host_info:
263        raise DuplicateAdd('Machine %s is already on the local server.' % m)
264      try:
265        self.AddLocalMachine(m)
266        self.logger.LogOutput('Successfully added %s to local server.' % m)
267      except Exception as e:
268        traceback.print_exc()
269        raise UpdateServerError('Error occurred while attempting to add %s. %s'
270                                % (m, str(e)))
271
272  def RemoveMachinesFromLocalServer(self):
273    """Removes one or more machines from the local AFE server.
274
275    Verify that the requested machines are legal to remove from the local
276    server, i.e. that they are not ChromeOS HW lab machines.  Call
277    RemoveLocalMachine for each valid machine.
278
279    Raises:
280      UpdateServerError:  Something went wrong while attempting to remove a
281        machine.
282    """
283    for m in self.machines:
284      if m in self.toolchain_lab_machines:
285        raise UpdateNonLocalMachine('Machine %s is in the ChromeOS HW Lab. '
286                                    'This script cannot remove lab machines.'
287                                    % m)
288      try:
289        self.RemoveLocalMachine(m)
290        self.logger.LogOutput('Successfully removed %s from local server.' % m)
291      except Exception as e:
292        traceback.print_exc()
293        raise UpdateServerError('Error occurred while attempting to remove %s '
294                                '(%s).' % (m, str(e)))
295
296  def ListMachineStates(self, machine_states):
297    """Gets and prints the current status for a list of machines.
298
299    Prints out the current status for all of the machines in the current
300    AFELockManager's list of machines (set when the object is initialized).
301
302    Args:
303      machine_states: A dictionary of the current state of every machine in
304        the current AFELockManager's list of machines.  Normally obtained by
305        calling AFELockManager::GetMachineStates.
306    """
307    local_machines = []
308    printed_hdr = False
309    for m in machine_states:
310      cros_name = m + '.cros'
311      if (m in self.toolchain_lab_machines or
312          cros_name in self.toolchain_lab_machines):
313        if not printed_hdr:
314          self.PrintStatusHeader(True)
315          printed_hdr = True
316        state = machine_states[m]
317        if state['locked']:
318          print ('%s (%s)\tlocked by %s since %s' %
319                 (m, state['board'], state['locked_by'], state['lock_time']))
320        else:
321          print '%s (%s)\tunlocked' % (m, state['board'])
322      else:
323        local_machines.append(m)
324
325    if local_machines:
326      self.PrintStatusHeader(False)
327      for m in local_machines:
328        state = machine_states[m]
329        if state['locked']:
330          print ('%s (%s)\tlocked by %s since %s' %
331                 (m, state['board'], state['locked_by'], state['lock_time']))
332        else:
333          print '%s (%s)\tunlocked' % (m, state['board'])
334
335
336  def UpdateLockInAFE(self, should_lock_machine, machine):
337    """Calls an AFE server to lock/unlock a machine.
338
339    Args:
340      should_lock_machine: Boolean indicating whether to lock the machine (True)
341        or unlock the machine (False).
342      machine: The machine to update.
343
344    Raises:
345      LockingError:  An error occurred while attempting to update the machine
346        state.
347    """
348    action = 'lock'
349    if not should_lock_machine:
350      action = 'unlock'
351    kwargs = {'locked': should_lock_machine}
352
353    if machine in self.toolchain_lab_machines:
354      m = machine.split('.')[0]
355      kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user
356      afe_server = self.afe
357    else:
358      m = machine
359      afe_server = self.local_afe
360
361    try:
362      afe_server.run('modify_hosts',
363                      host_filter_data={'hostname__in': [m]},
364                      update_data=kwargs)
365    except Exception as e:
366      traceback.print_exc()
367      raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e)))
368
369  def UpdateMachines(self, lock_machines):
370    """Sets the locked state of the machines to the requested value.
371
372    The machines updated are the ones in self.machines (specified when the
373    class object was intialized).
374
375    Args:
376      lock_machines:  Boolean indicating whether to lock the machines (True) or
377        unlock the machines (False).
378    """
379    for m in self.machines:
380      self.UpdateLockInAFE(lock_machines, m)
381
382      # Since we returned from self.UpdateLockInAFE we assume the request
383      # succeeded.
384      if lock_machines:
385        self.logger.LogOutput('Locked machine(s) %s.' % m)
386      else:
387        self.logger.LogOutput('Unlocked machine(s) %s.' % m)
388
389  def CheckMachineLocks(self, machine_states, cmd):
390    """Check that every machine in requested list is in the proper state.
391
392    If the cmd is 'unlock' verify that every machine is locked by requestor.
393    If the cmd is 'lock' verify that every machine is currently unlocked.
394
395    Args:
396      machine_states: A dictionary of the current state of every machine in
397        the current AFELockManager's list of machines.  Normally obtained by
398        calling AFELockManager::GetMachineStates.
399      cmd:  'lock' or 'unlock'.  The user-requested action for the machines.
400
401    Raises:
402      DuplicateLock: A machine requested to be locked is already locked.
403      DuplicateUnlock: A machine requested to be unlocked is already unlocked.
404      DontOwnLock: The lock on a requested machine is owned by someone else.
405    """
406    for k, state in machine_states.iteritems():
407      if cmd == 'unlock':
408        if not state['locked']:
409          raise DuplicateUnlock('Attempt to unlock already unlocked machine '
410                                '(%s).' % k)
411
412        if state['locked_by'] != self.user:
413          raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
414                            'else (%s).' % (k, state['locked_by']))
415      elif cmd == 'lock':
416        if state['locked']:
417          raise DuplicateLock('Attempt to lock already locked machine (%s)' % k)
418
419  def HasAFEServer(self, local):
420    """Verifies that the AFELockManager has appropriate AFE server.
421
422    Args:
423      local: Boolean indicating whether we are checking for the local server
424        (True) or for the global server (False).
425
426    Returns:
427      A boolean indicating if the AFELockManager has the requested AFE server.
428    """
429    if local:
430      return self.local_afe is not None
431    else:
432      return self.afe is not None
433
434  def GetMachineStates(self, cmd=''):
435    """Gets the current state of all the requested machines.
436
437    Gets the current state of all the requested machines, both from the HW lab
438    sever and from the local server.  Stores the data in a dictionary keyed
439    by machine name.
440
441    Args:
442      cmd: The command for which we are getting the machine states. This is
443        important because if one of the requested machines is missing we raise
444        an exception, unless the requested command is 'add'.
445
446    Returns:
447      A dictionary of machine states for all the machines in the AFELockManager
448      object.
449
450    Raises:
451      NoAFEServer:  Cannot find the HW Lab or local AFE server.
452      AFEAccessError:  An error occurred when querying the server about a
453        machine.
454    """
455    if not self.HasAFEServer(False):
456      raise NoAFEServer('Error: Cannot connect to main AFE server.')
457
458    if self.local and not self.HasAFEServer(True):
459      raise NoAFEServer('Error: Cannot connect to local AFE server.')
460
461    machines = {}
462    for m in self.machines:
463      host_info = None
464      if m in self.toolchain_lab_machines:
465        mod_host = m.split('.')[0]
466        host_info = self.afe.get_hosts(hostname=mod_host)
467        if not host_info:
468          raise AFEAccessError('Unable to get information about %s from main'
469                               ' autotest server.' % m)
470      else:
471        host_info = self.local_afe.get_hosts(hostname=m)
472        if not host_info and cmd != 'add':
473          raise AFEAccessError('Unable to get information about %s from '
474                               'local autotest server.' % m)
475      if host_info:
476        host_info = host_info[0]
477        name = host_info.hostname
478        values = {}
479        values['board'] = host_info.platform if host_info.platform else '??'
480        values['locked'] = host_info.locked
481        if host_info.locked:
482            values['locked_by'] = host_info.locked_by
483            values['lock_time'] = host_info.lock_time
484        else:
485            values['locked_by'] = ''
486            values['lock_time'] = ''
487        machines[name] = values
488      else:
489        machines[m] = {}
490    return machines
491
492
def Main(argv):
    """Parse the options, initialize lock manager and dispatch proper method.

    The operation is validated first, then the machine list and the
    chromeos_root directory.  For 'lock' and 'unlock' the current lock state
    of the machines is verified with CheckMachineLocks unless --force was
    given; the lock state is then updated in either case.

    Args:
      argv:  The options with which this script was invoked.

    Returns:
      0 unless an exception is raised.  Invalid options terminate the
      process through argparse's error handling instead of returning.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--list', dest='cmd', action='store_const',
                        const='status',
                        help='List current status of all known machines.')
    parser.add_argument('--lock', dest='cmd', action='store_const',
                        const='lock', help='Lock given machine(s).')
    parser.add_argument('--unlock', dest='cmd', action='store_const',
                        const='unlock', help='Unlock given machine(s).')
    parser.add_argument('--status', dest='cmd', action='store_const',
                        const='status',
                        help='List current status of given machine(s).')
    parser.add_argument('--add_machine', dest='cmd', action='store_const',
                        const='add',
                        help='Add machine to local machine server.')
    parser.add_argument('--remove_machine', dest='cmd',
                        action='store_const', const='remove',
                        help='Remove machine from the local machine server.')
    parser.add_argument('--nolocal', dest='local',
                        action='store_false', default=True,
                        help='Do not try to use local machine server.')
    parser.add_argument('--remote', dest='remote',
                        help='machines on which to operate')
    parser.add_argument('--chromeos_root', dest='chromeos_root', required=True,
                        help='ChromeOS root to use for autotest scripts.')
    parser.add_argument('--local_server', dest='local_server', default=None,
                        help='Alternate local autotest server to use.')
    parser.add_argument('--force', dest='force', action='store_true',
                        default=False,
                        help='Force lock/unlock of machines, even if not'
                        ' current lock owner.')

    options = parser.parse_args(argv)

    # Check for a missing operation first; otherwise a bare invocation is
    # reported as "No machines specified", which hides the real problem.
    if not options.cmd:
        parser.error('No operation selected (--list, --status, --lock, '
                     '--unlock, --add_machine, --remove_machine).')

    # Only 'status' may run without machines: it then covers all known ones.
    if not options.remote and options.cmd != 'status':
        parser.error('No machines specified for operation.')

    if not os.path.isdir(options.chromeos_root):
        parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)

    machine_list = []
    if options.remote:
        machine_list = options.remote.split()

    lock_manager = AFELockManager(machine_list, options.force,
                                  options.chromeos_root, options.local_server,
                                  options.local)

    cmd = options.cmd
    machine_states = lock_manager.GetMachineStates(cmd=cmd)

    if cmd == 'status':
        lock_manager.ListMachineStates(machine_states)

    elif cmd in ('lock', 'unlock'):
        # --force only skips the ownership/state verification.  The update
        # itself must always run, otherwise --force would do nothing at all.
        if not lock_manager.force:
            lock_manager.CheckMachineLocks(machine_states, cmd)
        lock_manager.UpdateMachines(cmd == 'lock')

    elif cmd == 'add':
        lock_manager.AddMachinesToLocalServer()

    elif cmd == 'remove':
        lock_manager.RemoveMachinesFromLocalServer()

    return 0
578
579
# Script entry point: run Main on the command-line arguments (without the
# program name) and use its return value as the process exit status.
if __name__ == '__main__':
    exit_status = Main(sys.argv[1:])
    sys.exit(exit_status)
582