afe_lock_machine.py revision 25c94f1849f617166781903637099d962673c60a
1#!/usr/bin/python
2#
# Copyright 2015 Google Inc.  All Rights Reserved.
4
5import argparse
6import getpass
7import os
8import sys
9import traceback
10
11from utils import logger
12from utils import machines
13from utils import misc
14
15
class AFELockException(Exception):
  """Common ancestor of every exception defined in this module."""
18
19
class MachineNotPingable(AFELockException):
  """Signals that a machine failed to answer a ping."""
22
23
class MissingHostInfo(AFELockException):
  """Signals that the machine servers hold no info about a machine."""
26
27
class UpdateNonLocalMachine(AFELockException):
  """Signals a request to add/remove a ChromeOS HW Lab machine."""
30
31
class DuplicateAdd(AFELockException):
  """Signals a request to add a machine the server already has."""
34
35
class UpdateServerError(AFELockException):
  """Signals a failed add/remove of a machine on the local server."""
38
39
class LockingError(AFELockException):
  """Signals that the server could not lock/unlock a machine as asked."""
42
43
class DuplicateLock(AFELockException):
  """Signals an attempt to lock a machine that is already locked."""
46
47
class DuplicateUnlock(AFELockException):
  """Signals an attempt to unlock a machine that is already unlocked."""
50
51
class DontOwnLock(AFELockException):
  """Signals an attempt to unlock a machine that someone else locked."""
  # Not expected to be raised when the user passed '--force'.
55
56
class NoAFEServer(AFELockException):
  """Signals that the autotest server cannot be found or accessed."""
59
60
class AFEAccessError(AFELockException):
  """Signals that the lab server returned no info about a lab machine."""
63
64
class AFELockManager(object):
  """Class for locking/unlocking machines via Autotest Front End servers.

  This class contains methods for checking the locked status of machines
  on both the ChromeOS HW Lab AFE server and a local AFE server.  It also
  has methods for adding/removing machines from the local server, and for
  changing the lock status of machines on either server.  For the ChromeOS
  HW Lab, it only allows access to the toolchain team lab machines, as
  defined in toolchain-utils/crosperf/default_remotes.  By default it will
  look for a local server on chrotomation2.mtv.corp.google.com, but an
  alternative local AFE server can be supplied, if desired.

  !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
  thread/process of a program.  If you launch threads and try to call it
  from a thread, you will get an error.  This has to do with restrictions
  in the Python virtual machine (and signal handling) and cannot be changed.
  """

  # Local AFE server used when the caller does not supply one.
  LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com'

  def __init__(self, remotes, force_option, chromeos_root, local_server,
               local=True, log=None):
    """Initializes an AFELockManager object.

    Args:
      remotes: A list of machine names or ip addresses to be managed.  Names
        and ip addresses should be represented as strings.  If the list is
        empty (or None), the lock manager will get all known machines.
      force_option:  A Boolean indicating whether or not to force an unlock of
        a machine that was locked by someone else.
      chromeos_root: The ChromeOS chroot to use for the autotest scripts.
      local_server:  A string containing the name or ip address of the machine
        that is running an AFE server, which is to be used for managing
        machines that are not in the ChromeOS HW lab.
      local: A Boolean indicating whether or not to use/allow a local AFE
        server to be used (see local_server argument).
      log: If not None, this is the logger object to be used for writing out
        informational output messages.  It is expected to be an instance of
        Logger class from utils/logger.py.

    Raises:
      MachineNotPingable:  If a local server is requested but does not
        respond to ping.
    """
    self.chromeos_root = chromeos_root
    self.user = getpass.getuser()
    self.logger = log or logger.GetLogger()
    autotest_path = os.path.join(chromeos_root,
                                 'src/third_party/autotest/files')

    # The autotest imports below only resolve once these directories are on
    # sys.path.  Skip entries already present so that building several
    # managers does not keep growing sys.path.
    for path in (chromeos_root, autotest_path,
                 os.path.join(autotest_path, 'server', 'cros')):
      if path not in sys.path:
        sys.path.append(path)

    # We have to wait to do these imports until the paths above have
    # been fixed.
    from client import setup_modules
    setup_modules.setup(base_path=autotest_path,
                        root_module_name='autotest_lib')

    from dynamic_suite import frontend_wrappers

    self.afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                             delay_sec=10,
                                             debug=False,
                                             server='cautotest')
    if not local:
      self.local_afe = None
    else:
      server = local_server or AFELockManager.LOCAL_SERVER
      # Make sure local server is pingable.
      error_msg = ('Local autotest server machine %s not responding to ping.'
                   % server)
      self.CheckMachine(server, error_msg)
      self.local_afe = frontend_wrappers.RetryingAFE(timeout_min=30,
                                                     delay_sec=10,
                                                     debug=False,
                                                     server=server)
    self.local = local
    # De-duplicate the requested machines; tolerate remotes=None.
    self.machines = list(set(remotes or []))
    self.force = force_option
    self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
    if not self.machines:
      self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines()

  def CheckMachine(self, machine, error_msg):
    """Verifies that machine is responding to ping.

    Args:
      machine: String containing the name or ip address of machine to check.
      error_msg: Message to attach to the exception if ping fails.

    Raises:
      MachineNotPingable:  If machine is not responding to 'ping'
    """
    if not machines.MachineIsPingable(machine, logging_level='none'):
      raise MachineNotPingable(error_msg)

  def MachineIsKnown(self, machine):
    """Checks to see if either AFE server knows the given machine.

    Args:
      machine: String containing name or ip address of machine to check.

    Returns:
      Boolean indicating if the machine is in the list of known machines for
        either AFE server.
    """
    if machine in self.toolchain_lab_machines:
      return True
    # GetAllNonlabMachines() yields an empty list when there is no local
    # server, so no separate check of self.local_afe is needed here.
    return machine in self.GetAllNonlabMachines()

  @staticmethod
  def _ParseDefaultRemotes(lines):
    """Extracts machine names from the lines of a default_remotes file.

    Each meaningful line has the form '<board>: <machine> <machine> ...'.
    Blank lines and lines starting with '#' are skipped, and a line without
    a ':' separator contributes no machines.

    Args:
      lines: An iterable of strings (e.g. an open file object).

    Returns:
      A list of machine names, in file order.
    """
    machine_list = []
    for line in lines:
      line = line.strip()
      if not line or line.startswith('#'):
        continue
      # Only the text after the first ':' names machines; the board name
      # before it is not needed here.
      remotes = line.partition(':')[2]
      machine_list.extend(remotes.split())
    return machine_list

  def GetAllToolchainLabMachines(self):
    """Gets a list of all the toolchain machines in the ChromeOS HW lab.

    The list is read from crosperf/default_remotes, located relative to the
    directory containing this file.

    Returns:
      A list of names of the toolchain machines in the ChromeOS HW lab.
    """
    machines_file = os.path.join(os.path.dirname(__file__),
                                 'crosperf', 'default_remotes')
    with open(machines_file, 'r') as input_file:
      return self._ParseDefaultRemotes(input_file)

  def GetAllNonlabMachines(self):
    """Gets a list of all known machines on the local AFE server.

    Returns:
      A list of the names of the machines on the local AFE server; an empty
      list when no local AFE server is in use.
    """
    if self.local_afe:
      return self.local_afe.get_hostnames()
    return []

  def PrintStatusHeader(self, is_lab_machine):
    """Prints the status header lines for machines.

    Args:
      is_lab_machine: Boolean indicating whether to print HW Lab header or
        local machine header (different spacing).
    """
    if is_lab_machine:
      print('\nMachine (Board)\t\t\t\tStatus')
      print('---------------\t\t\t\t------\n')
    else:
      print('\nMachine (Board)\t\tStatus')
      print('---------------\t\t------\n')

  def _RequireLocalAFE(self, machine):
    """Returns the local AFE server, or raises if there is none.

    Args:
      machine: The non-lab machine the local server is needed for (only used
        in the error message).

    Returns:
      The local AFE server object.

    Raises:
      NoAFEServer:  No local AFE server is in use.
    """
    if not self.local_afe:
      raise NoAFEServer('Machine %s is not a ChromeOS HW Lab machine and no '
                        'local AFE server is available to manage it.'
                        % machine)
    return self.local_afe

  def RemoveLocalMachine(self, m):
    """Removes a machine from the local AFE server.

    Does nothing when no local AFE server is in use.

    Args:
      m: The machine to remove.

    Raises:
      MissingHostInfo:  Can't find machine to be removed.
    """
    if not self.local_afe:
      return
    host_info = self.local_afe.get_hosts(hostname=m)
    if not host_info:
      raise MissingHostInfo('Cannot find/delete machine %s.' % m)
    host_info[0].delete()

  def AddLocalMachine(self, m):
    """Adds a machine to the local AFE server.

    Does nothing when no local AFE server is in use.

    Args:
      m: The machine to be added.

    Raises:
      MachineNotPingable:  The machine does not respond to ping.
    """
    if self.local_afe:
      error_msg = 'Machine %s is not responding to ping.' % m
      self.CheckMachine(m, error_msg)
      self.local_afe.create_host(m)

  def AddMachinesToLocalServer(self):
    """Adds one or more machines to the local AFE server.

    Verify that the requested machines are legal to add to the local server,
    i.e. that they are not ChromeOS HW lab machines, and they are not already
    on the local server.  Call AddLocalMachine for each valid machine.

    Raises:
      DuplicateAdd: Attempt to add a machine that is already on the server.
      UpdateNonLocalMachine:  Attempt to add a ChromeOS HW lab machine.
      NoAFEServer:  There is no local AFE server to add the machine to.
      UpdateServerError:  Something went wrong while attempting to add a
        machine.
    """
    for m in self.machines:
      if m in self.toolchain_lab_machines:
        raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS HW '
                                    'Lab.  Cannot add it to local server.' % m)
      host_info = self._RequireLocalAFE(m).get_hosts(hostname=m)
      if host_info:
        raise DuplicateAdd('Machine %s is already on the local server.' % m)
      try:
        self.AddLocalMachine(m)
        self.logger.LogOutput('Successfully added %s to local server.' % m)
      except Exception as e:
        traceback.print_exc()
        raise UpdateServerError('Error occurred while attempting to add %s. %s'
                                % (m, str(e)))

  def RemoveMachinesFromLocalServer(self):
    """Removes one or more machines from the local AFE server.

    Verify that the requested machines are legal to remove from the local
    server, i.e. that they are not ChromeOS HW lab machines.  Call
    RemoveLocalMachine for each valid machine.

    Raises:
      UpdateNonLocalMachine:  Attempt to remove a ChromeOS HW lab machine.
      NoAFEServer:  There is no local AFE server to remove the machine from.
      UpdateServerError:  Something went wrong while attempting to remove a
        machine.
    """
    for m in self.machines:
      if m in self.toolchain_lab_machines:
        raise UpdateNonLocalMachine('Machine %s is in the ChromeOS HW Lab. '
                                    'This script cannot remove lab machines.'
                                    % m)
      # Without this check RemoveLocalMachine would silently do nothing and
      # a bogus success message would be logged below.
      self._RequireLocalAFE(m)
      try:
        self.RemoveLocalMachine(m)
        self.logger.LogOutput('Successfully removed %s from local server.' % m)
      except Exception as e:
        traceback.print_exc()
        raise UpdateServerError('Error occurred while attempting to remove %s '
                                '(%s).' % (m, str(e)))

  @staticmethod
  def _FormatMachineState(machine, state):
    """Builds the one-line status report for a machine.

    Args:
      machine: Name of the machine.
      state: The machine's state dictionary, with keys 'board', 'locked',
        'locked_by' and 'lock_time'.

    Returns:
      The status line (without trailing newline).
    """
    if state['locked']:
      return ('%s (%s)\tlocked by %s since %s' %
              (machine, state['board'], state['locked_by'],
               state['lock_time']))
    return '%s (%s)\tunlocked' % (machine, state['board'])

  def ListMachineStates(self, machine_states):
    """Gets and prints the current status for a list of machines.

    Prints out the current status for all of the machines in the current
    AFELockManager's list of machines (set when the object is initialized).
    HW Lab machines are printed first, under their own header, followed by
    the local machines.

    Args:
      machine_states: A dictionary of the current state of every machine in
        the current AFELockManager's list of machines.  Normally obtained by
        calling AFELockManager::GetMachineStates.
    """
    local_machines = []
    printed_hdr = False
    for m, state in machine_states.items():
      # machine_states may be keyed by the short host name, while the lab
      # list may hold the '.cros'-suffixed name, so check both spellings.
      is_lab_machine = (m in self.toolchain_lab_machines or
                        m + '.cros' in self.toolchain_lab_machines)
      if not is_lab_machine:
        local_machines.append(m)
        continue
      if not printed_hdr:
        self.PrintStatusHeader(True)
        printed_hdr = True
      print(self._FormatMachineState(m, state))

    if local_machines:
      self.PrintStatusHeader(False)
      for m in local_machines:
        print(self._FormatMachineState(m, machine_states[m]))

  def UpdateLockInAFE(self, should_lock_machine, machine):
    """Calls an AFE server to lock/unlock a machine.

    Args:
      should_lock_machine: Boolean indicating whether to lock the machine (True)
        or unlock the machine (False).
      machine: The machine to update.

    Raises:
      LockingError:  An error occurred while attempting to update the machine
        state.
    """
    action = 'lock' if should_lock_machine else 'unlock'
    kwargs = {'locked': should_lock_machine,
              'lock_reason': 'toolchain user request (%s)' % self.user}

    if machine in self.toolchain_lab_machines:
      # The HW Lab server is addressed with the name up to the first '.'.
      m = machine.split('.')[0]
      afe_server = self.afe
    else:
      m = machine
      afe_server = self.local_afe

    if afe_server is None:
      raise LockingError('Unable to %s machine %s. No AFE server is available '
                         'for this machine.' % (action, m))

    try:
      afe_server.run('modify_hosts',
                     host_filter_data={'hostname__in': [m]},
                     update_data=kwargs)
    except Exception as e:
      traceback.print_exc()
      raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e)))

  def UpdateMachines(self, lock_machines):
    """Sets the locked state of the machines to the requested value.

    The machines updated are the ones in self.machines (specified when the
    class object was initialized).

    Args:
      lock_machines:  Boolean indicating whether to lock the machines (True) or
        unlock the machines (False).

    Raises:
      LockingError:  Propagated from UpdateLockInAFE when an update fails.
    """
    for m in self.machines:
      self.UpdateLockInAFE(lock_machines, m)

      # Since we returned from self.UpdateLockInAFE we assume the request
      # succeeded.
      if lock_machines:
        self.logger.LogOutput('Locked machine(s) %s.' % m)
      else:
        self.logger.LogOutput('Unlocked machine(s) %s.' % m)

  def CheckMachineLocks(self, machine_states, cmd):
    """Check that every machine in requested list is in the proper state.

    If the cmd is 'unlock' verify that every machine is locked by requestor.
    If the cmd is 'lock' verify that every machine is currently unlocked.
    Any other cmd is accepted without checks.

    Args:
      machine_states: A dictionary of the current state of every machine in
        the current AFELockManager's list of machines.  Normally obtained by
        calling AFELockManager::GetMachineStates.
      cmd:  'lock' or 'unlock'.  The user-requested action for the machines.

    Raises:
      DuplicateLock: A machine requested to be locked is already locked.
      DuplicateUnlock: A machine requested to be unlocked is already unlocked.
      DontOwnLock: The lock on a requested machine is owned by someone else.
    """
    for k, state in machine_states.items():
      if cmd == 'unlock':
        if not state['locked']:
          raise DuplicateUnlock('Attempt to unlock already unlocked machine '
                                '(%s).' % k)

        if state['locked_by'] != self.user:
          raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
                            'else (%s).' % (k, state['locked_by']))
      elif cmd == 'lock':
        if state['locked']:
          raise DuplicateLock('Attempt to lock already locked machine (%s)' % k)

  def HasAFEServer(self, local):
    """Verifies that the AFELockManager has appropriate AFE server.

    Args:
      local: Boolean indicating whether we are checking for the local server
        (True) or for the global server (False).

    Returns:
      A boolean indicating if the AFELockManager has the requested AFE server.
    """
    server = self.local_afe if local else self.afe
    return server is not None

  def GetMachineStates(self, cmd=''):
    """Gets the current state of all the requested machines.

    Gets the current state of all the requested machines, both from the HW lab
    server and from the local server.  Stores the data in a dictionary keyed
    by the host name reported by the server (or by the requested name, with
    an empty state, for a machine that is about to be added).

    Args:
      cmd: The command for which we are getting the machine states. This is
        important because if one of the requested machines is missing we raise
        an exception, unless the requested command is 'add'.

    Returns:
      A dictionary of machine states for all the machines in the AFELockManager
      object.

    Raises:
      NoAFEServer:  Cannot find the HW Lab or local AFE server.
      AFEAccessError:  An error occurred when querying the server about a
        machine.
    """
    if not self.HasAFEServer(False):
      raise NoAFEServer('Error: Cannot connect to main AFE server.')

    if self.local and not self.HasAFEServer(True):
      raise NoAFEServer('Error: Cannot connect to local AFE server.')

    # Deliberately not named 'machines': that would shadow the imported
    # utils.machines module inside this method.
    machine_states = {}
    for m in self.machines:
      if m in self.toolchain_lab_machines:
        # The HW Lab server is queried with the name up to the first '.'.
        mod_host = m.split('.')[0]
        host_info = self.afe.get_hosts(hostname=mod_host)
        if not host_info:
          raise AFEAccessError('Unable to get information about %s from main'
                               ' autotest server.' % m)
      else:
        host_info = self._RequireLocalAFE(m).get_hosts(hostname=m)
        if not host_info and cmd != 'add':
          raise AFEAccessError('Unable to get information about %s from '
                               'local autotest server.' % m)

      if not host_info:
        # Only reachable for cmd == 'add': the machine is not known yet.
        machine_states[m] = {}
        continue

      host = host_info[0]
      is_locked = host.locked
      machine_states[host.hostname] = {
          'board': host.platform or '??',
          'locked': is_locked,
          'locked_by': host.locked_by if is_locked else '',
          'lock_time': host.lock_time if is_locked else '',
      }
    return machine_states
494
495
def Main(argv):
  """Parses the options, initializes the lock manager and runs the command.

  Exactly one operation (--list/--status, --lock, --unlock, --add_machine,
  --remove_machine) is carried out.  For --lock and --unlock the current
  lock state of every machine is verified first, unless --force was given,
  in which case the machines are updated unconditionally.

  Args:
    argv:  The options with which this script was invoked.

  Returns:
    0 unless an exception is raised.  Invalid options are reported through
    the argument parser's error(), which exits the program.
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--list', dest='cmd', action='store_const',
                      const='status',
                      help='List current status of all known machines.')
  parser.add_argument('--lock', dest='cmd', action='store_const',
                      const='lock', help='Lock given machine(s).')
  parser.add_argument('--unlock', dest='cmd', action='store_const',
                      const='unlock', help='Unlock given machine(s).')
  parser.add_argument('--status', dest='cmd', action='store_const',
                      const='status',
                      help='List current status of given machine(s).')
  parser.add_argument('--add_machine', dest='cmd', action='store_const',
                      const='add',
                      help='Add machine to local machine server.')
  parser.add_argument('--remove_machine', dest='cmd',
                      action='store_const', const='remove',
                      help='Remove machine from the local machine server.')
  parser.add_argument('--nolocal', dest='local',
                      action='store_false', default=True,
                      help='Do not try to use local machine server.')
  parser.add_argument('--remote', dest='remote',
                      help='machines on which to operate')
  parser.add_argument('--chromeos_root', dest='chromeos_root', required=True,
                      help='ChromeOS root to use for autotest scripts.')
  parser.add_argument('--local_server', dest='local_server', default=None,
                      help='Alternate local autotest server to use.')
  parser.add_argument('--force', dest='force', action='store_true',
                      default=False,
                      help='Force lock/unlock of machines, even if not'
                      ' current lock owner.')

  options = parser.parse_args(argv)

  # Check for a missing operation first, so that running with no flags
  # reports the real problem rather than "No machines specified".
  if not options.cmd:
    parser.error('No operation selected (--list, --status, --lock, --unlock,'
                 ' --add_machine, --remove_machine).')

  if not options.remote and options.cmd != 'status':
    parser.error('No machines specified for operation.')

  if not os.path.isdir(options.chromeos_root):
    parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)

  machine_list = options.remote.split() if options.remote else []

  lock_manager = AFELockManager(machine_list, options.force,
                                options.chromeos_root, options.local_server,
                                options.local)

  cmd = options.cmd
  machine_states = lock_manager.GetMachineStates(cmd=cmd)

  if cmd == 'status':
    lock_manager.ListMachineStates(machine_states)

  elif cmd in ('lock', 'unlock'):
    # --force only skips the ownership/state checks; the lock state is
    # updated either way.
    if not lock_manager.force:
      lock_manager.CheckMachineLocks(machine_states, cmd)
    lock_manager.UpdateMachines(cmd == 'lock')

  elif cmd == 'add':
    lock_manager.AddMachinesToLocalServer()

  elif cmd == 'remove':
    lock_manager.RemoveMachinesFromLocalServer()

  return 0
581
582
if __name__ == '__main__':
  # Run the tool on the command-line arguments (program name dropped) and
  # use Main's return value as the process exit status.
  exit_status = Main(sys.argv[1:])
  sys.exit(exit_status)
585