afe_lock_machine.py revision f2a3ef46f75d2196a93d3ed27f4d1fcf22b54fbe
1#!/usr/bin/python2
2#
3# Copyright 2015 Google INc.  All Rights Reserved.
4"""This module controls locking and unlocking of test machines."""
5
6from __future__ import print_function
7
8import argparse
9import getpass
10import os
11import sys
12import traceback
13
14from utils import logger
15from utils import machines
16
17
class AFELockException(Exception):
  """Base class for exceptions in this module.

  The other exception classes defined in this module derive from it.
  """
20
21
class MachineNotPingable(AFELockException):
  """Raised when a machine does not respond to ping."""
24
25
class MissingHostInfo(AFELockException):
  """Raised when info about a machine cannot be found on a machine server."""
28
29
class UpdateNonLocalMachine(AFELockException):
  """Raised when the user requests to add/remove a ChromeOS HW Lab machine."""
32
33
class DuplicateAdd(AFELockException):
  """Raised when the user adds a machine that is already on the server."""
36
37
class UpdateServerError(AFELockException):
  """Raised when adding/removing a machine on the local server fails."""
40
41
class LockingError(AFELockException):
  """Raised when a server fails to lock/unlock a machine as requested."""
44
45
class DontOwnLock(AFELockException):
  """Raised when user attempts to unlock machine locked by someone else."""
  # This should not be raised if the user specified '--force'
49
50
class NoAFEServer(AFELockException):
  """Raised when the autotest server cannot be found or accessed."""
53
54
class AFEAccessError(AFELockException):
  """Raised when info about a machine cannot be obtained from a server."""
57
58
59class AFELockManager(object):
  """Class for locking/unlocking machines via Autotest Front End servers.
61
62  This class contains methods for checking the locked status of machines
63  on both the ChromeOS HW Lab AFE server and a local AFE server.  It also
64  has methods for adding/removing machines from the local server, and for
65  changing the lock status of machines on either server.  For the ChromeOS
66  HW Lab, it only allows access to the toolchain team lab machines, as
67  defined in toolchain-utils/crosperf/default_remotes.  By default it will
68  look for a local server on chrotomation2.mtv.corp.google.com, but an
69  alternative local AFE server can be supplied, if desired.
70
71  !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
72  thread/process of a program.  If you launch threads and try to call it
73  from a thread, you will get an error.  This has to do with restrictions
74  in the Python virtual machine (and signal handling) and cannot be changed.
75  """
76
77  LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com'
78
  def __init__(self,
               remotes,
               force_option,
               chromeos_root,
               local_server,
               local=True,
               log=None):
    """Initializes an AFELockManager object.

    Besides storing its arguments, this extends sys.path with the autotest
    directories found under chromeos_root, imports the autotest frontend
    wrappers, creates the AFE client object(s) and reads the list of
    toolchain lab machines.

    Args:
      remotes: A list of machine names or ip addresses to be managed.  Names
        and ip addresses should be represented as strings.  If the list is
        empty, the lock manager will get all known machines.
      force_option: A Boolean indicating whether or not to force an unlock of
        a machine that was locked by someone else.
      chromeos_root: The ChromeOS chroot to use for the autotest scripts.
      local_server: A string containing the name or ip address of the machine
        that is running an AFE server, which is to be used for managing
        machines that are not in the ChromeOS HW lab.  When it is empty or
        None, AFELockManager.LOCAL_SERVER is used.
      local: A Boolean indicating whether or not to use/allow a local AFE
        server to be used (see local_server argument).
      log: If not None, this is the logger object to be used for writing out
        informational output messages.  It is expected to be an instance of
        Logger class from utils/logger.py.

    Raises:
      MachineNotPingable: 'local' is true and the local AFE server machine
        does not respond to ping.
    """
    self.chromeos_root = chromeos_root
    self.user = getpass.getuser()
    self.logger = log or logger.GetLogger()
    autotest_path = os.path.join(chromeos_root,
                                 'src/third_party/autotest/files')

    # The paths are appended (not inserted), so anything already importable
    # from an earlier sys.path entry keeps precedence.
    sys.path.append(chromeos_root)
    sys.path.append(autotest_path)
    sys.path.append(os.path.join(autotest_path, 'server', 'cros'))

    # We have to wait to do these imports until the paths above have
    # been fixed.
    from client import setup_modules
    setup_modules.setup(base_path=autotest_path,
                        root_module_name='autotest_lib')

    from dynamic_suite import frontend_wrappers

    # NOTE(review): 'cautotest' is presumably the ChromeOS HW Lab AFE server;
    # confirm.  This object is the one used for the toolchain lab machines.
    self.afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                             delay_sec=10,
                                             debug=False,
                                             server='cautotest')
    if not local:
      # No local server requested; methods check self.local_afe before use.
      self.local_afe = None
    else:
      dargs = {}
      dargs['server'] = local_server or AFELockManager.LOCAL_SERVER
      # Make sure local server is pingable.
      error_msg = ('Local autotest server machine %s not responding to ping.' %
                   dargs['server'])
      self.CheckMachine(dargs['server'], error_msg)
      self.local_afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                                     delay_sec=10,
                                                     debug=False,
                                                     **dargs)
    self.local = local
    # De-duplicate the requested machines.  Going through set() does not
    # preserve the order in which the caller listed them.
    self.machines = list(set(remotes)) or []
    self.force = force_option
    self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
    # No machines were requested: manage every machine known to either server.
    if not self.machines:
      self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines()
145
146  def CheckMachine(self, machine, error_msg):
147    """Verifies that machine is responding to ping.
148
149    Args:
150      machine: String containing the name or ip address of machine to check.
151      error_msg: Message to print if ping fails.
152
153    Raises:
154      MachineNotPingable:  If machine is not responding to 'ping'
155    """
156    if not machines.MachineIsPingable(machine, logging_level='none'):
157      cros_machine = machine + '.cros'
158      if not machines.MachineIsPingable(cros_machine, logging_level='none'):
159        raise MachineNotPingable(error_msg)
160
161  def MachineIsKnown(self, machine):
162    """Checks to see if either AFE server knows the given machine.
163
164    Args:
165      machine: String containing name or ip address of machine to check.
166
167    Returns:
168      Boolean indicating if the machine is in the list of known machines for
169        either AFE server.
170    """
171    if machine in self.toolchain_lab_machines:
172      return True
173    elif self.local_afe and machine in self.GetAllNonlabMachines():
174      return True
175
176    return False
177
178  def GetAllToolchainLabMachines(self):
179    """Gets a list of all the toolchain machines in the ChromeOS HW lab.
180
181    Returns:
182      A list of names of the toolchain machines in the ChromeOS HW lab.
183    """
184    machines_file = os.path.join(
185        os.path.dirname(__file__), 'crosperf', 'default_remotes')
186    machine_list = []
187    with open(machines_file, 'r') as input_file:
188      lines = input_file.readlines()
189      for line in lines:
190        _, remotes = line.split(':')
191        remotes = remotes.strip()
192        for r in remotes.split():
193          machine_list.append(r.strip())
194    return machine_list
195
196  def GetAllNonlabMachines(self):
197    """Gets a list of all known machines on the local AFE server.
198
199    Returns:
200      A list of the names of the machines on the local AFE server.
201    """
202    non_lab_machines = []
203    if self.local_afe:
204      non_lab_machines = self.local_afe.get_hostnames()
205    return non_lab_machines
206
207  def PrintStatusHeader(self, is_lab_machine):
208    """Prints the status header lines for machines.
209
210    Args:
211      is_lab_machine: Boolean indicating whether to print HW Lab header or
212        local machine header (different spacing).
213    """
214    if is_lab_machine:
215      print('\nMachine (Board)\t\t\t\t\tStatus')
216      print('---------------\t\t\t\t\t------\n')
217    else:
218      print('\nMachine (Board)\t\tStatus')
219      print('---------------\t\t------\n')
220
221  def RemoveLocalMachine(self, m):
222    """Removes a machine from the local AFE server.
223
224    Args:
225      m: The machine to remove.
226
227    Raises:
228      MissingHostInfo:  Can't find machine to be removed.
229    """
230    if self.local_afe:
231      host_info = self.local_afe.get_hosts(hostname=m)
232      if host_info:
233        host_info = host_info[0]
234        host_info.delete()
235      else:
236        raise MissingHostInfo('Cannot find/delete machine %s.' % m)
237
238  def AddLocalMachine(self, m):
239    """Adds a machine to the local AFE server.
240
241    Args:
242      m: The machine to be added.
243    """
244    if self.local_afe:
245      error_msg = 'Machine %s is not responding to ping.' % m
246      self.CheckMachine(m, error_msg)
247      self.local_afe.create_host(m)
248
249  def AddMachinesToLocalServer(self):
250    """Adds one or more machines to the local AFE server.
251
252    Verify that the requested machines are legal to add to the local server,
253    i.e. that they are not ChromeOS HW lab machines, and they are not already
254    on the local server.  Call AddLocalMachine for each valid machine.
255
256    Raises:
257      DuplicateAdd: Attempt to add a machine that is already on the server.
258      UpdateNonLocalMachine:  Attempt to add a ChromeOS HW lab machine.
259      UpdateServerError:  Something went wrong while attempting to add a
260        machine.
261    """
262    for m in self.machines:
263      for cros_name in [m, m + '.cros']:
264        if cros_name in self.toolchain_lab_machines:
265          raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS HW'
266                                      'Lab.  Cannot add it to local server.' %
267                                      cros_name)
268      host_info = self.local_afe.get_hosts(hostname=m)
269      if host_info:
270        raise DuplicateAdd('Machine %s is already on the local server.' % m)
271      try:
272        self.AddLocalMachine(m)
273        self.logger.LogOutput('Successfully added %s to local server.' % m)
274      except Exception as e:
275        traceback.print_exc()
276        raise UpdateServerError(
277            'Error occurred while attempting to add %s. %s' % (m, str(e)))
278
279  def RemoveMachinesFromLocalServer(self):
280    """Removes one or more machines from the local AFE server.
281
282    Verify that the requested machines are legal to remove from the local
283    server, i.e. that they are not ChromeOS HW lab machines.  Call
284    RemoveLocalMachine for each valid machine.
285
286    Raises:
287      UpdateServerError:  Something went wrong while attempting to remove a
288        machine.
289    """
290    for m in self.machines:
291      for cros_name in [m, m + '.cros']:
292        if cros_name in self.toolchain_lab_machines:
293          raise UpdateNonLocalMachine(
294              'Machine %s is in the ChromeOS HW Lab. '
295              'This script cannot remove lab machines.' % cros_name)
296      try:
297        self.RemoveLocalMachine(m)
298        self.logger.LogOutput('Successfully removed %s from local server.' % m)
299      except Exception as e:
300        traceback.print_exc()
301        raise UpdateServerError('Error occurred while attempting to remove %s '
302                                '(%s).' % (m, str(e)))
303
304  def ListMachineStates(self, machine_states):
305    """Gets and prints the current status for a list of machines.
306
307    Prints out the current status for all of the machines in the current
308    AFELockManager's list of machines (set when the object is initialized).
309
310    Args:
311      machine_states: A dictionary of the current state of every machine in
312        the current AFELockManager's list of machines.  Normally obtained by
313        calling AFELockManager::GetMachineStates.
314    """
315    local_machines = []
316    printed_hdr = False
317    for m in machine_states:
318      cros_name = m + '.cros'
319      if (m in self.toolchain_lab_machines or
320          cros_name in self.toolchain_lab_machines):
321        name = m if m in self.toolchain_lab_machines else cros_name
322        if not printed_hdr:
323          self.PrintStatusHeader(True)
324          printed_hdr = True
325        state = machine_states[m]
326        if state['locked']:
327          print('%s (%s)\tlocked by %s since %s' %
328                (name, state['board'], state['locked_by'], state['lock_time']))
329        else:
330          print('%s (%s)\tunlocked' % (name, state['board']))
331      else:
332        local_machines.append(m)
333
334    if local_machines:
335      self.PrintStatusHeader(False)
336      for m in local_machines:
337        state = machine_states[m]
338        if state['locked']:
339          print('%s (%s)\tlocked by %s since %s' %
340                (m, state['board'], state['locked_by'], state['lock_time']))
341        else:
342          print('%s (%s)\tunlocked' % (m, state['board']))
343
344  def UpdateLockInAFE(self, should_lock_machine, machine):
345    """Calls an AFE server to lock/unlock a machine.
346
347    Args:
348      should_lock_machine: Boolean indicating whether to lock the machine (True)
349        or unlock the machine (False).
350      machine: The machine to update.
351
352    Raises:
353      LockingError:  An error occurred while attempting to update the machine
354        state.
355    """
356    action = 'lock'
357    if not should_lock_machine:
358      action = 'unlock'
359    kwargs = {'locked': should_lock_machine}
360    kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user
361
362    cros_name = machine + '.cros'
363    if cros_name in self.toolchain_lab_machines:
364      machine = cros_name
365    if machine in self.toolchain_lab_machines:
366      m = machine.split('.')[0]
367      afe_server = self.afe
368    else:
369      m = machine
370      afe_server = self.local_afe
371
372    try:
373      afe_server.run('modify_hosts',
374                     host_filter_data={'hostname__in': [m]},
375                     update_data=kwargs)
376    except Exception as e:
377      traceback.print_exc()
378      raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e)))
379
380  def UpdateMachines(self, lock_machines):
381    """Sets the locked state of the machines to the requested value.
382
383    The machines updated are the ones in self.machines (specified when the
384    class object was intialized).
385
386    Args:
387      lock_machines: Boolean indicating whether to lock the machines (True) or
388        unlock the machines (False).
389
390    Returns:
391      A list of the machines whose state was successfully updated.
392    """
393    updated_machines = []
394    for m in self.machines:
395      self.UpdateLockInAFE(lock_machines, m)
396      # Since we returned from self.UpdateLockInAFE we assume the request
397      # succeeded.
398      if lock_machines:
399        self.logger.LogOutput('Locked machine(s) %s.' % m)
400      else:
401        self.logger.LogOutput('Unlocked machine(s) %s.' % m)
402      updated_machines.append(m)
403
404    return updated_machines
405
406  def _InternalRemoveMachine(self, machine):
407    """Remove machine from internal list of machines.
408
409    Args:
410      machine: Name of machine to be removed from internal list.
411    """
412    # Check to see if machine is lab machine and if so, make sure it has
413    # ".cros" on the end.
414    cros_machine = machine
415    if machine.find('rack') > 0 and machine.find('row') > 0:
416      if machine.find('.cros') == -1:
417        cros_machine = cros_machine + '.cros'
418
419    self.machines = [m
420                     for m in self.machines
421                     if m != cros_machine and m != machine]
422
423  def CheckMachineLocks(self, machine_states, cmd):
424    """Check that every machine in requested list is in the proper state.
425
426    If the cmd is 'unlock' verify that every machine is locked by requestor.
427    If the cmd is 'lock' verify that every machine is currently unlocked.
428
429    Args:
430      machine_states: A dictionary of the current state of every machine in
431        the current AFELockManager's list of machines.  Normally obtained by
432        calling AFELockManager::GetMachineStates.
433      cmd: The user-requested action for the machines: 'lock' or 'unlock'.
434
435    Raises:
436      DontOwnLock: The lock on a requested machine is owned by someone else.
437    """
438    for k, state in machine_states.iteritems():
439      if cmd == 'unlock':
440        if not state['locked']:
441          self.logger.LogWarning('Attempt to unlock already unlocked machine '
442                                 '(%s).' % k)
443          self._InternalRemoveMachine(k)
444
445        if state['locked'] and state['locked_by'] != self.user:
446          raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
447                            'else (%s).' % (k, state['locked_by']))
448      elif cmd == 'lock':
449        if state['locked']:
450          self.logger.LogWarning('Attempt to lock already locked machine (%s)' %
451                                 k)
452          self._InternalRemoveMachine(k)
453
454  def HasAFEServer(self, local):
455    """Verifies that the AFELockManager has appropriate AFE server.
456
457    Args:
458      local: Boolean indicating whether we are checking for the local server
459        (True) or for the global server (False).
460
461    Returns:
462      A boolean indicating if the AFELockManager has the requested AFE server.
463    """
464    if local:
465      return self.local_afe is not None
466    else:
467      return self.afe is not None
468
469  def GetMachineStates(self, cmd=''):
470    """Gets the current state of all the requested machines.
471
472    Gets the current state of all the requested machines, both from the HW lab
473    sever and from the local server.  Stores the data in a dictionary keyed
474    by machine name.
475
476    Args:
477      cmd: The command for which we are getting the machine states. This is
478        important because if one of the requested machines is missing we raise
479        an exception, unless the requested command is 'add'.
480
481    Returns:
482      A dictionary of machine states for all the machines in the AFELockManager
483      object.
484
485    Raises:
486      NoAFEServer:  Cannot find the HW Lab or local AFE server.
487      AFEAccessError:  An error occurred when querying the server about a
488        machine.
489    """
490    if not self.HasAFEServer(False):
491      raise NoAFEServer('Error: Cannot connect to main AFE server.')
492
493    if self.local and not self.HasAFEServer(True):
494      raise NoAFEServer('Error: Cannot connect to local AFE server.')
495
496    machine_list = {}
497    for m in self.machines:
498      host_info = None
499      cros_name = m + '.cros'
500      if (m in self.toolchain_lab_machines or
501          cros_name in self.toolchain_lab_machines):
502        mod_host = m.split('.')[0]
503        host_info = self.afe.get_hosts(hostname=mod_host)
504        if not host_info:
505          raise AFEAccessError('Unable to get information about %s from main'
506                               ' autotest server.' % m)
507      else:
508        host_info = self.local_afe.get_hosts(hostname=m)
509        if not host_info and cmd != 'add':
510          raise AFEAccessError('Unable to get information about %s from '
511                               'local autotest server.' % m)
512      if host_info:
513        host_info = host_info[0]
514        name = host_info.hostname
515        values = {}
516        values['board'] = host_info.platform if host_info.platform else '??'
517        values['locked'] = host_info.locked
518        if host_info.locked:
519          values['locked_by'] = host_info.locked_by
520          values['lock_time'] = host_info.lock_time
521        else:
522          values['locked_by'] = ''
523          values['lock_time'] = ''
524        machine_list[name] = values
525      else:
526        machine_list[m] = {}
527    return machine_list
528
529
def Main(argv):
  """Parse the options, initialize lock manager and dispatch proper method.

  Exactly one operation (--list, --status, --lock, --unlock, --add_machine or
  --remove_machine) must be selected, and every operation except the status
  ones needs --remote.  Invalid command lines are reported through
  parser.error, which exits the program.

  For --lock and --unlock, --force only skips the check of the machines'
  current lock state/owner; the lock update itself is always performed.

  Args:
    argv: The options with which this script was invoked.

  Returns:
    0 unless an exception is raised.
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--list',
                      dest='cmd',
                      action='store_const',
                      const='status',
                      help='List current status of all known machines.')
  parser.add_argument('--lock',
                      dest='cmd',
                      action='store_const',
                      const='lock',
                      help='Lock given machine(s).')
  parser.add_argument('--unlock',
                      dest='cmd',
                      action='store_const',
                      const='unlock',
                      help='Unlock given machine(s).')
  parser.add_argument('--status',
                      dest='cmd',
                      action='store_const',
                      const='status',
                      help='List current status of given machine(s).')
  parser.add_argument('--add_machine',
                      dest='cmd',
                      action='store_const',
                      const='add',
                      help='Add machine to local machine server.')
  parser.add_argument('--remove_machine',
                      dest='cmd',
                      action='store_const',
                      const='remove',
                      help='Remove machine from the local machine server.')
  parser.add_argument('--nolocal',
                      dest='local',
                      action='store_false',
                      default=True,
                      help='Do not try to use local machine server.')
  parser.add_argument('--remote',
                      dest='remote',
                      help='machines on which to operate')
  parser.add_argument('--chromeos_root',
                      dest='chromeos_root',
                      required=True,
                      help='ChromeOS root to use for autotest scripts.')
  parser.add_argument('--local_server',
                      dest='local_server',
                      default=None,
                      help='Alternate local autotest server to use.')
  parser.add_argument('--force',
                      dest='force',
                      action='store_true',
                      default=False,
                      help='Force lock/unlock of machines, even if not'
                      ' current lock owner.')

  options = parser.parse_args(argv)

  # Checked first so that a command line without any operation is reported
  # as such, rather than as a missing machine list.
  if not options.cmd:
    parser.error('No operation selected (--list, --status, --lock, --unlock,'
                 ' --add_machine, --remove_machine).')

  if not options.remote and options.cmd != 'status':
    parser.error('No machines specified for operation.')

  if not os.path.isdir(options.chromeos_root):
    parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)

  machine_list = []
  if options.remote:
    machine_list = options.remote.split()

  lock_manager = AFELockManager(machine_list, options.force,
                                options.chromeos_root, options.local_server,
                                options.local)

  machine_states = lock_manager.GetMachineStates(cmd=options.cmd)
  cmd = options.cmd

  if cmd == 'status':
    lock_manager.ListMachineStates(machine_states)

  elif cmd in ('lock', 'unlock'):
    # --force bypasses only the state/ownership check.  The update must run
    # in both cases, otherwise --force would turn the command into a no-op.
    if not lock_manager.force:
      lock_manager.CheckMachineLocks(machine_states, cmd)
    lock_manager.UpdateMachines(cmd == 'lock')

  elif cmd == 'add':
    lock_manager.AddMachinesToLocalServer()

  elif cmd == 'remove':
    lock_manager.RemoveMachinesFromLocalServer()

  return 0
637
638
if __name__ == '__main__':
  # Run as a script: hand Main's return value to the shell as exit status.
  exit_status = Main(sys.argv[1:])
  sys.exit(exit_status)
641