afe_lock_machine.py revision d97422aef5709f0a3b16b4efebc397c891940c95
1#!/usr/bin/python2
2#
3# Copyright 2015 Google INc.  All Rights Reserved.
4"""This module controls locking and unlocking of test machines."""
5
6from __future__ import print_function
7
8import argparse
9import getpass
10import os
11import sys
12import traceback
13
14from utils import logger
15from utils import machines
16
17
class AFELockException(Exception):
  """Common ancestor of every error type defined by this module."""
20
21
class MachineNotPingable(AFELockException):
  """A machine failed to answer 'ping' (see AFELockManager.CheckMachine)."""
24
25
class MissingHostInfo(AFELockException):
  """The machine server returned no host record for the requested machine."""
28
29
class UpdateNonLocalMachine(AFELockException):
  """The user asked to add or remove a ChromeOS HW Lab machine."""
32
33
class DuplicateAdd(AFELockException):
  """The machine the user asked to add is already on the local server."""
36
37
class UpdateServerError(AFELockException):
  """Adding a machine to, or removing it from, the local server failed."""
40
41
class LockingError(AFELockException):
  """The server could not lock/unlock a machine as it was asked to."""
44
45
class DontOwnLock(AFELockException):
  """The user attempted to unlock a machine that someone else has locked."""
  # Not expected to be raised when the user passed '--force'.
49
50
class NoAFEServer(AFELockException):
  """The autotest (AFE) server could not be found or accessed."""
53
54
class AFEAccessError(AFELockException):
  """Information about a machine could not be fetched from its AFE server."""
57
58
class AFELockManager(object):
  """Class for locking/unlocking machines via Autotest Front End servers.

  This class contains methods for checking the locked status of machines
  on both the ChromeOS HW Lab AFE server and a local AFE server.  It also
  has methods for adding/removing machines from the local server, and for
  changing the lock status of machines on either server.  For the ChromeOS
  HW Lab, it only allows access to the toolchain team lab machines, as
  defined in toolchain-utils/crosperf/default_remotes.  By default it will
  look for a local server on chrotomation2.mtv.corp.google.com, but an
  alternative local AFE server can be supplied, if desired.

  !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
  thread/process of a program.  If you launch threads and try to call it
  from a thread, you will get an error.  This has to do with restrictions
  in the Python virtual machine (and signal handling) and cannot be changed.
  """

  # Local AFE server used when the caller does not name one explicitly.
  LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com'

  def __init__(self,
               remotes,
               force_option,
               chromeos_root,
               local_server,
               local=True,
               log=None):
    """Initializes an AFELockManager object.

    Args:
      remotes: A list of machine names or ip addresses to be managed.  Names
        and ip addresses should be represented as strings.  If the list is
        empty (or None), the lock manager will get all known machines.
      force_option: A Boolean indicating whether or not to force an unlock of
        a machine that was locked by someone else.
      chromeos_root: The ChromeOS chroot to use for the autotest scripts.
      local_server: A string containing the name or ip address of the machine
        that is running an AFE server, which is to be used for managing
        machines that are not in the ChromeOS HW lab.
      local: A Boolean indicating whether or not to use/allow a local AFE
        server to be used (see local_server argument).
      log: If not None, this is the logger object to be used for writing out
        informational output messages.  It is expected to be an instance of
        Logger class from utils/logger.py.

    Raises:
      MachineNotPingable: The local AFE server does not respond to ping.
    """
    self.chromeos_root = chromeos_root
    self.user = getpass.getuser()
    self.logger = log or logger.GetLogger()
    autotest_path = os.path.join(chromeos_root,
                                 'src/third_party/autotest/files')

    sys.path.append(chromeos_root)
    sys.path.append(autotest_path)
    sys.path.append(os.path.join(autotest_path, 'server', 'cros'))

    # We have to wait to do these imports until the paths above have
    # been fixed.
    # pylint: disable=import-error
    from client import setup_modules
    setup_modules.setup(base_path=autotest_path,
                        root_module_name='autotest_lib')

    from dynamic_suite import frontend_wrappers

    self.afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                             delay_sec=10,
                                             debug=False,
                                             server='cautotest')
    self.local_afe = None
    if local:
      server = local_server or AFELockManager.LOCAL_SERVER
      # Make sure local server is pingable.
      error_msg = ('Local autotest server machine %s not responding to ping.' %
                   server)
      self.CheckMachine(server, error_msg)
      self.local_afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                                     delay_sec=10,
                                                     debug=False,
                                                     server=server)
    self.local = local

    # Drop duplicate names but keep the caller's order, so that processing
    # and log output are deterministic.  A None argument counts as empty.
    unique_remotes = []
    for remote in remotes or []:
      if remote not in unique_remotes:
        unique_remotes.append(remote)
    self.machines = unique_remotes

    self.force = force_option
    self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
    if not self.machines:
      self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines()

  def CheckMachine(self, machine, error_msg):
    """Verifies that machine is responding to ping.

    The name is tried as given first and then with '.cros' appended.

    Args:
      machine: String containing the name or ip address of machine to check.
      error_msg: Message to put in the exception if both pings fail.

    Raises:
      MachineNotPingable:  If machine is not responding to 'ping'
    """
    if machines.MachineIsPingable(machine, logging_level='none'):
      return
    if machines.MachineIsPingable(machine + '.cros', logging_level='none'):
      return
    raise MachineNotPingable(error_msg)

  def MachineIsKnown(self, machine):
    """Checks to see if either AFE server knows the given machine.

    Args:
      machine: String containing name or ip address of machine to check.

    Returns:
      Boolean indicating if the machine is in the list of known machines for
        either AFE server.
    """
    if machine in self.toolchain_lab_machines:
      return True
    # The local server is only consulted when one is in use.
    return bool(self.local_afe) and machine in self.GetAllNonlabMachines()

  @staticmethod
  def _ParseDefaultRemotes(lines):
    """Extracts the machine names from the lines of a default_remotes file.

    Each useful line looks like '<label>: <machine> <machine> ...'.  The text
    before the first ':' is ignored (presumably a board name -- verify
    against crosperf/default_remotes).  Blank lines, lines starting with '#'
    and lines without a ':' are skipped rather than treated as errors.

    Args:
      lines: An iterable of text lines.

    Returns:
      A list of machine names, in file order.
    """
    machine_list = []
    for line in lines:
      line = line.strip()
      if not line or line.startswith('#'):
        continue
      _, separator, remotes = line.partition(':')
      if not separator:
        continue
      machine_list.extend(remotes.split())
    return machine_list

  def GetAllToolchainLabMachines(self):
    """Gets a list of all the toolchain machines in the ChromeOS HW lab.

    The names are read from crosperf/default_remotes, located relative to
    the directory containing this file.

    Returns:
      A list of names of the toolchain machines in the ChromeOS HW lab.
    """
    machines_file = os.path.join(
        os.path.dirname(__file__), 'crosperf', 'default_remotes')
    with open(machines_file, 'r') as input_file:
      return self._ParseDefaultRemotes(input_file)

  def GetAllNonlabMachines(self):
    """Gets a list of all known machines on the local AFE server.

    Returns:
      A list of the names of the machines on the local AFE server; an empty
      list when no local server is in use.
    """
    if not self.local_afe:
      return []
    return self.local_afe.get_hostnames()

  def PrintStatusHeader(self, is_lab_machine):
    """Prints the status header lines for machines.

    Args:
      is_lab_machine: Boolean indicating whether to print HW Lab header or
        local machine header (different spacing).
    """
    if is_lab_machine:
      print('\nMachine (Board)\t\t\t\t\tStatus')
      print('---------------\t\t\t\t\t------\n')
    else:
      print('\nMachine (Board)\t\tStatus')
      print('---------------\t\t------\n')

  def RemoveLocalMachine(self, m):
    """Removes a machine from the local AFE server.

    Does nothing when no local AFE server is in use.

    Args:
      m: The machine to remove.

    Raises:
      MissingHostInfo:  Can't find machine to be removed.
    """
    if not self.local_afe:
      return
    host_info = self.local_afe.get_hosts(hostname=m)
    if not host_info:
      raise MissingHostInfo('Cannot find/delete machine %s.' % m)
    host_info[0].delete()

  def AddLocalMachine(self, m):
    """Adds a machine to the local AFE server.

    Does nothing when no local AFE server is in use.

    Args:
      m: The machine to be added.

    Raises:
      MachineNotPingable: The machine does not respond to ping.
    """
    if not self.local_afe:
      return
    error_msg = 'Machine %s is not responding to ping.' % m
    self.CheckMachine(m, error_msg)
    self.local_afe.create_host(m)

  def AddMachinesToLocalServer(self):
    """Adds one or more machines to the local AFE server.

    Verify that the requested machines are legal to add to the local server,
    i.e. that they are not ChromeOS HW lab machines, and they are not already
    on the local server.  Call AddLocalMachine for each valid machine.

    Raises:
      DuplicateAdd: Attempt to add a machine that is already on the server.
      UpdateNonLocalMachine:  Attempt to add a ChromeOS HW lab machine.
      UpdateServerError:  Something went wrong while attempting to add a
        machine, or no local AFE server is in use.
    """
    for m in self.machines:
      for cros_name in [m, m + '.cros']:
        if cros_name in self.toolchain_lab_machines:
          raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS HW '
                                      'Lab.  Cannot add it to local server.' %
                                      cros_name)
      # Without this check the lookup below would fail with an unhelpful
      # AttributeError on None.
      if not self.local_afe:
        raise UpdateServerError(
            'Cannot add %s: no local AFE server is in use.' % m)
      if self.local_afe.get_hosts(hostname=m):
        raise DuplicateAdd('Machine %s is already on the local server.' % m)
      try:
        self.AddLocalMachine(m)
        self.logger.LogOutput('Successfully added %s to local server.' % m)
      except Exception as e:
        # Any failure (ping, server RPC, ...) is reported uniformly.
        traceback.print_exc()
        raise UpdateServerError(
            'Error occurred while attempting to add %s. %s' % (m, str(e)))

  def RemoveMachinesFromLocalServer(self):
    """Removes one or more machines from the local AFE server.

    Verify that the requested machines are legal to remove from the local
    server, i.e. that they are not ChromeOS HW lab machines.  Call
    RemoveLocalMachine for each valid machine.

    Raises:
      UpdateNonLocalMachine:  Attempt to remove a ChromeOS HW lab machine.
      UpdateServerError:  Something went wrong while attempting to remove a
        machine, or no local AFE server is in use.
    """
    for m in self.machines:
      for cros_name in [m, m + '.cros']:
        if cros_name in self.toolchain_lab_machines:
          raise UpdateNonLocalMachine(
              'Machine %s is in the ChromeOS HW Lab. '
              'This script cannot remove lab machines.' % cros_name)
      # RemoveLocalMachine is a silent no-op without a local server; do not
      # report a removal that never happened.
      if not self.local_afe:
        raise UpdateServerError(
            'Cannot remove %s: no local AFE server is in use.' % m)
      try:
        self.RemoveLocalMachine(m)
        self.logger.LogOutput('Successfully removed %s from local server.' % m)
      except Exception as e:
        traceback.print_exc()
        raise UpdateServerError('Error occurred while attempting to remove %s '
                                '(%s).' % (m, str(e)))

  @staticmethod
  def _PrintMachineStatus(name, state):
    """Prints one status line for a machine.

    Args:
      name: The machine name to show.
      state: The machine's state dictionary; reads the 'locked', 'board',
        'locked_by' and 'lock_time' entries.
    """
    if state['locked']:
      print('%s (%s)\tlocked by %s since %s' %
            (name, state['board'], state['locked_by'], state['lock_time']))
    else:
      print('%s (%s)\tunlocked' % (name, state['board']))

  def ListMachineStates(self, machine_states):
    """Gets and prints the current status for a list of machines.

    Prints out the current status for all of the machines in the current
    AFELockManager's list of machines (set when the object is initialized).
    HW Lab machines are printed first, followed by local machines; within
    each group machines are listed in sorted name order.

    Args:
      machine_states: A dictionary of the current state of every machine in
        the current AFELockManager's list of machines.  Normally obtained by
        calling AFELockManager::GetMachineStates.
    """
    local_machines = []
    printed_hdr = False
    for m in sorted(machine_states):
      cros_name = m + '.cros'
      if m in self.toolchain_lab_machines:
        name = m
      elif cros_name in self.toolchain_lab_machines:
        name = cros_name
      else:
        # Not a lab machine; printed after all the lab machines.
        local_machines.append(m)
        continue
      if not printed_hdr:
        self.PrintStatusHeader(True)
        printed_hdr = True
      self._PrintMachineStatus(name, machine_states[m])

    if local_machines:
      self.PrintStatusHeader(False)
      for m in local_machines:
        self._PrintMachineStatus(m, machine_states[m])

  def UpdateLockInAFE(self, should_lock_machine, machine):
    """Calls an AFE server to lock/unlock a machine.

    Lab machines are updated on the HW Lab server, using the name with the
    domain part stripped; all other machines go to the local server.

    Args:
      should_lock_machine: Boolean indicating whether to lock the machine (True)
        or unlock the machine (False).
      machine: The machine to update.

    Raises:
      LockingError:  An error occurred while attempting to update the machine
        state.
    """
    action = 'lock' if should_lock_machine else 'unlock'
    kwargs = {
        'locked': should_lock_machine,
        'lock_reason': 'toolchain user request (%s)' % self.user,
    }

    cros_name = machine + '.cros'
    if cros_name in self.toolchain_lab_machines:
      machine = cros_name
    if machine in self.toolchain_lab_machines:
      m = machine.split('.')[0]
      afe_server = self.afe
    else:
      m = machine
      afe_server = self.local_afe

    if afe_server is None:
      # E.g. a non-lab machine while no local server is in use.
      raise LockingError('Unable to %s machine %s. No AFE server is available '
                         'for this machine.' % (action, m))

    try:
      afe_server.run('modify_hosts',
                     host_filter_data={'hostname__in': [m]},
                     update_data=kwargs)
    except Exception as e:
      traceback.print_exc()
      raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e)))

  def UpdateMachines(self, lock_machines):
    """Sets the locked state of the machines to the requested value.

    The machines updated are the ones in self.machines (specified when the
    class object was initialized).

    Args:
      lock_machines: Boolean indicating whether to lock the machines (True) or
        unlock the machines (False).

    Returns:
      A list of the machines whose state was successfully updated.

    Raises:
      LockingError:  Propagated from UpdateLockInAFE on the first failure.
    """
    message = 'Locked machine(s) %s.' if lock_machines else (
        'Unlocked machine(s) %s.')
    updated_machines = []
    for m in self.machines:
      self.UpdateLockInAFE(lock_machines, m)
      # Since we returned from self.UpdateLockInAFE we assume the request
      # succeeded.
      self.logger.LogOutput(message % m)
      updated_machines.append(m)

    return updated_machines

  def _InternalRemoveMachine(self, machine):
    """Remove machine from internal list of machines.

    Both the given name and, for names that look like lab machines, the
    name with '.cros' appended are removed.

    Args:
      machine: Name of machine to be removed from internal list.
    """
    # Check to see if machine is lab machine and if so, make sure it has
    # ".cros" on the end.  find() > 0 means the substring occurs somewhere
    # after the first character of the name.
    cros_machine = machine
    looks_like_lab_name = (machine.find('rack') > 0 and
                           machine.find('row') > 0)
    if looks_like_lab_name and '.cros' not in machine:
      cros_machine = machine + '.cros'

    self.machines = [m
                     for m in self.machines
                     if m not in (machine, cros_machine)]

  def CheckMachineLocks(self, machine_states, cmd):
    """Check that every machine in requested list is in the proper state.

    If the cmd is 'unlock' verify that every machine is locked by requestor.
    If the cmd is 'lock' verify that every machine is currently unlocked.
    Machines already in the requested state are dropped from self.machines
    with a warning, so a later UpdateMachines call skips them.

    Args:
      machine_states: A dictionary of the current state of every machine in
        the current AFELockManager's list of machines.  Normally obtained by
        calling AFELockManager::GetMachineStates.
      cmd: The user-requested action for the machines: 'lock' or 'unlock'.

    Raises:
      DontOwnLock: The lock on a requested machine is owned by someone else.
    """
    # items() rather than iteritems(): works on both Python 2 and Python 3.
    for k, state in machine_states.items():
      if cmd == 'unlock':
        if not state['locked']:
          self.logger.LogWarning('Attempt to unlock already unlocked machine '
                                 '(%s).' % k)
          self._InternalRemoveMachine(k)
        elif state['locked_by'] != self.user:
          raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
                            'else (%s).' % (k, state['locked_by']))
      elif cmd == 'lock':
        if state['locked']:
          self.logger.LogWarning('Attempt to lock already locked machine (%s)' %
                                 k)
          self._InternalRemoveMachine(k)

  def HasAFEServer(self, local):
    """Verifies that the AFELockManager has appropriate AFE server.

    Args:
      local: Boolean indicating whether we are checking for the local server
        (True) or for the global server (False).

    Returns:
      A boolean indicating if the AFELockManager has the requested AFE server.
    """
    server = self.local_afe if local else self.afe
    return server is not None

  def GetMachineStates(self, cmd=''):
    """Gets the current state of all the requested machines.

    Gets the current state of all the requested machines, both from the HW lab
    server and from the local server.  Stores the data in a dictionary keyed
    by machine name (the hostname reported by the server, or the requested
    name when no record exists).

    Args:
      cmd: The command for which we are getting the machine states. This is
        important because if one of the requested machines is missing we raise
        an exception, unless the requested command is 'add'.

    Returns:
      A dictionary of machine states for all the machines in the AFELockManager
      object.  Each state has 'board', 'locked', 'locked_by' and 'lock_time'
      entries; a machine with no server record maps to an empty dictionary.

    Raises:
      NoAFEServer:  Cannot find the HW Lab or local AFE server.
      AFEAccessError:  An error occurred when querying the server about a
        machine.
    """
    if not self.HasAFEServer(False):
      raise NoAFEServer('Error: Cannot connect to main AFE server.')

    if self.local and not self.HasAFEServer(True):
      raise NoAFEServer('Error: Cannot connect to local AFE server.')

    machine_list = {}
    for m in self.machines:
      cros_name = m + '.cros'
      if (m in self.toolchain_lab_machines or
          cros_name in self.toolchain_lab_machines):
        # The lab server is queried with the domain part stripped.
        mod_host = m.split('.')[0]
        host_info = self.afe.get_hosts(hostname=mod_host)
        if not host_info:
          raise AFEAccessError('Unable to get information about %s from main'
                               ' autotest server.' % m)
      else:
        if not self.local_afe:
          # A non-lab machine can only live on the local server.
          raise NoAFEServer('Error: %s is not a lab machine and no local AFE '
                            'server is in use.' % m)
        host_info = self.local_afe.get_hosts(hostname=m)
        if not host_info and cmd != 'add':
          raise AFEAccessError('Unable to get information about %s from '
                               'local autotest server.' % m)

      if not host_info:
        # Only reachable for 'add': the machine is not on the server yet.
        machine_list[m] = {}
        continue

      host = host_info[0]
      machine_list[host.hostname] = {
          'board': host.platform if host.platform else '??',
          'locked': host.locked,
          'locked_by': host.locked_by if host.locked else '',
          'lock_time': host.lock_time if host.locked else '',
      }
    return machine_list
529
530
def Main(argv):
  """Parse the options, initialize lock manager and dispatch proper method.

  Exactly one operation (--list, --status, --lock, --unlock, --add_machine,
  --remove_machine) must be selected.  Every operation except listing status
  also needs --remote.  Invalid command lines are reported through
  parser.error, which exits the program.

  With --force the ownership/state checks are skipped, but the requested
  lock/unlock is still performed.

  Args:
    argv: The options with which this script was invoked.

  Returns:
    0 unless an exception is raised.
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--list',
                      dest='cmd',
                      action='store_const',
                      const='status',
                      help='List current status of all known machines.')
  parser.add_argument('--lock',
                      dest='cmd',
                      action='store_const',
                      const='lock',
                      help='Lock given machine(s).')
  parser.add_argument('--unlock',
                      dest='cmd',
                      action='store_const',
                      const='unlock',
                      help='Unlock given machine(s).')
  parser.add_argument('--status',
                      dest='cmd',
                      action='store_const',
                      const='status',
                      help='List current status of given machine(s).')
  parser.add_argument('--add_machine',
                      dest='cmd',
                      action='store_const',
                      const='add',
                      help='Add machine to local machine server.')
  parser.add_argument('--remove_machine',
                      dest='cmd',
                      action='store_const',
                      const='remove',
                      help='Remove machine from the local machine server.')
  parser.add_argument('--nolocal',
                      dest='local',
                      action='store_false',
                      default=True,
                      help='Do not try to use local machine server.')
  parser.add_argument('--remote',
                      dest='remote',
                      help='machines on which to operate')
  parser.add_argument('--chromeos_root',
                      dest='chromeos_root',
                      required=True,
                      help='ChromeOS root to use for autotest scripts.')
  parser.add_argument('--local_server',
                      dest='local_server',
                      default=None,
                      help='Alternate local autotest server to use.')
  parser.add_argument('--force',
                      dest='force',
                      action='store_true',
                      default=False,
                      help='Force lock/unlock of machines, even if not'
                      ' current lock owner.')

  options = parser.parse_args(argv)
  cmd = options.cmd

  # Check for a missing operation first; otherwise it would be reported as
  # the unrelated "No machines specified" error.
  if not cmd:
    parser.error('No operation selected (--list, --status, --lock, --unlock,'
                 ' --add_machine, --remove_machine).')

  if not options.remote and cmd != 'status':
    parser.error('No machines specified for operation.')

  if not os.path.isdir(options.chromeos_root):
    parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)

  # --remote is a whitespace-separated list of machine names.
  machine_list = options.remote.split() if options.remote else []

  lock_manager = AFELockManager(machine_list, options.force,
                                options.chromeos_root, options.local_server,
                                options.local)

  machine_states = lock_manager.GetMachineStates(cmd=cmd)

  if cmd == 'status':
    lock_manager.ListMachineStates(machine_states)

  elif cmd in ('lock', 'unlock'):
    # --force only skips the checks.  The update itself must always run;
    # it used to be nested under the check and was skipped with --force.
    if not lock_manager.force:
      lock_manager.CheckMachineLocks(machine_states, cmd)
    lock_manager.UpdateMachines(cmd == 'lock')

  elif cmd == 'add':
    lock_manager.AddMachinesToLocalServer()

  elif cmd == 'remove':
    lock_manager.RemoveMachinesFromLocalServer()

  return 0
638
639
if __name__ == '__main__':
  # Run the command-line entry point and use its return value as the
  # process exit status.
  exit_status = Main(sys.argv[1:])
  sys.exit(exit_status)
642