afe_lock_machine.py revision f3eb80354a8d7dd386c2880c282a767abad6a18a
1#!/usr/bin/python
2#
3# Copyright 2015 Google INc.  All Rights Reserved.
4
5import argparse
6import getpass
7import os
8import sys
9import traceback
10
11from utils import logger
12from utils import machines
13from utils import misc
14
15
class AFELockException(Exception):
  """Base class for exceptions in this module.

  The lock-manager exceptions defined in this file derive from this class, so
  a caller can catch this one type to handle any of them.
  """
18
19
class MachineNotPingable(AFELockException):
  """Raised when a machine does not respond to ping.

  AFELockManager.CheckMachine raises this with a caller-supplied message.
  """
22
23
class MissingHostInfo(AFELockException):
  """Raised when no host record can be found for a machine.

  AFELockManager.RemoveLocalMachine raises this when the local AFE server
  returns no host entry for the machine that was to be removed.
  """
26
27
class UpdateNonLocalMachine(AFELockException):
  """Raised when user requests to add/remove a ChromeOS HW Lab machine.

  Only machines on the local AFE server may be added or removed; machines
  listed as toolchain lab machines are rejected with this exception.
  """
30
31
class DuplicateAdd(AFELockException):
  """Raised when user requests to add a machine that's already on the server.

  "The server" here is the local AFE server: the add is rejected when that
  server already returns a host entry for the machine.
  """
34
35
class UpdateServerError(AFELockException):
  """Raised when an attempt to add/remove a machine on the local server fails.

  The add/remove loops wrap whatever exception the underlying operation raised
  and include its text in this exception's message.
  """
38
39
class LockingError(AFELockException):
  """Raised when the server fails to lock/unlock a machine as requested.

  AFELockManager.UpdateLockInAFE raises this when the 'modify_hosts' call on
  the selected AFE server raises any exception.
  """
42
43
class DontOwnLock(AFELockException):
  """Raised when user attempts to unlock a machine locked by someone else."""
  # This should not be raised if the user specified '--force': Main only runs
  # AFELockManager.CheckMachineLocks (the one place that raises this) when
  # the force option is off.
47
48
class NoAFEServer(AFELockException):
  """Raised when the required autotest (AFE) server object is unavailable.

  AFELockManager.GetMachineStates raises this when the main AFE handle is
  missing, or when local-server use is enabled but no local AFE handle exists.
  """
51
52
class AFEAccessError(AFELockException):
  """Raised when an AFE server returns no information about a machine.

  AFELockManager.GetMachineStates raises this for a lab machine unknown to the
  main server, and for any other machine unknown to the local server (unless
  the command being run is 'add').
  """
55
56
57class AFELockManager(object):
  """Class for locking/unlocking machines via Autotest Front End servers.
59
60  This class contains methods for checking the locked status of machines
61  on both the ChromeOS HW Lab AFE server and a local AFE server.  It also
62  has methods for adding/removing machines from the local server, and for
63  changing the lock status of machines on either server.  For the ChromeOS
64  HW Lab, it only allows access to the toolchain team lab machines, as
65  defined in toolchain-utils/crosperf/default_remotes.  By default it will
66  look for a local server on chrotomation2.mtv.corp.google.com, but an
67  alternative local AFE server can be supplied, if desired.
68
69  !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
70  thread/process of a program.  If you launch threads and try to call it
71  from a thread, you will get an error.  This has to do with restrictions
72  in the Python virtual machine (and signal handling) and cannot be changed.
73  """
74
75  LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com'
76
77  def __init__(self, remotes, force_option, chromeos_root, local_server,
78               local=True, log=None):
79    """Initializes an AFELockManager object.
80
81    Args:
82      remotes: A list of machine names or ip addresses to be managed.  Names
83        and ip addresses should be represented as strings.  If the list is empty,
84        the lock manager will get all known machines.
85      force_option:  A Boolean indicating whether or not to force an unlock of
86        a machine that was locked by someone else.
87      chromeos_root: The ChromeOS chroot to use for the autotest scripts.
88      local_server:  A string containing the name or ip address of the machine
89        that is running an AFE server, which is to be used for managing
90        machines that are not in the ChromeOS HW lab.
91      local: A Boolean indicating whether or not to use/allow a local AFE
92        server to be used (see local_server argument).
93      log: If not None, this is the logger object to be used for writing out
94        informational output messages.  It is expected to be an instance of
95        Logger class from utils/logger.py.
96    """
97    self.chromeos_root = chromeos_root
98    self.user = getpass.getuser()
99    self.logger = log or logger.GetLogger()
100    autotest_path = os.path.join(chromeos_root,
101                                 'src/third_party/autotest/files')
102
103    sys.path.append(chromeos_root)
104    sys.path.append(autotest_path)
105    sys.path.append(os.path.join(autotest_path, 'server', 'cros'))
106
107    # We have to wait to do these imports until the paths above have
108    # been fixed.
109    from client import setup_modules
110    setup_modules.setup(base_path=autotest_path,
111                        root_module_name='autotest_lib')
112
113    from dynamic_suite import frontend_wrappers
114
115    self.afe = frontend_wrappers.RetryingAFE(timeout_min=30,
116                                             delay_sec=10,
117                                             debug=False,
118                                             server='cautotest')
119    if not local:
120      self.local_afe = None
121    else:
122      dargs = {}
123      dargs['server'] = local_server or AFELockManager.LOCAL_SERVER
124      # Make sure local server is pingable.
125      error_msg = ('Local autotest server machine %s not responding to ping.'
126                 % dargs['server'])
127      self.CheckMachine(dargs['server'], error_msg)
128      self.local_afe = frontend_wrappers.RetryingAFE(timeout_min=30,
129                                                     delay_sec=10,
130                                                     debug=False,
131                                                     **dargs)
132    self.local = local
133    self.machines = list(set(remotes)) or []
134    self.force = force_option
135    self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
136    if not self.machines:
137      self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines()
138
139  def CheckMachine(self, machine, error_msg):
140    """Verifies that machine is responding to ping.
141
142    Args:
143      machine: String containing the name or ip address of machine to check.
144      error_msg: Message to print if ping fails.
145
146    Raises:
147      MachineNotPingable:  If machine is not responding to 'ping'
148    """
149    if not machines.MachineIsPingable(machine, logging_level='none'):
150        raise MachineNotPingable(error_msg)
151
152  def MachineIsKnown(self, machine):
153    """Checks to see if either AFE server knows the given machine.
154
155    Args:
156      machine: String containing name or ip address of machine to check.
157
158    Returns:
159      Boolean indicating if the machine is in the list of known machines for
160        either AFE server.
161    """
162    if machine in self.toolchain_lab_machines:
163      return True
164    elif self.local_afe and machine in self.GetAllNonlabMachines():
165      return True
166
167    return False
168
169  def GetAllToolchainLabMachines(self):
170    """Gets a list of all the toolchain machines in the ChromeOS HW lab.
171
172    Returns:
173      A list of names of the toolchain machines in the ChromeOS HW lab.
174    """
175    machines_file = os.path.join(os.path.dirname(__file__),
176                                 'crosperf', 'default_remotes')
177    machine_list = []
178    with open(machines_file, 'r') as input_file:
179      lines = input_file.readlines()
180      for line in lines:
181        board, remotes = line.split(':')
182        remotes = remotes.strip()
183        for r in remotes.split():
184          machine_list.append(r.strip())
185    return machine_list
186
187  def GetAllNonlabMachines(self):
188    """Gets a list of all known machines on the local AFE server.
189
190    Returns:
191      A list of the names of the machines on the local AFE server.
192    """
193    non_lab_machines = []
194    if self.local_afe:
195      non_lab_machines = self.local_afe.get_hostnames()
196    return non_lab_machines
197
198  def PrintStatusHeader(self, is_lab_machine):
199    """Prints the status header lines for machines.
200
201    Args: Boolean indicating whether to print HW Lab header or local
202      machine header (different spacing).
203    """
204    if is_lab_machine:
205      print '\nMachine (Board)\t\t\t\tStatus'
206      print '---------------\t\t\t\t------\n'
207    else:
208      print '\nMachine (Board)\t\tStatus'
209      print '---------------\t\t------\n'
210
211  def RemoveLocalMachine(self, m):
212    """Removes a machine from the local AFE server.
213
214    Args:
215      m: The machine to remove.
216
217    Raises:
218      MissingHostInfo:  Can't find machine to be removed.
219    """
220    if self.local_afe:
221      host_info = self.local_afe.get_hosts(hostname=m)
222      if host_info:
223        host_info = host_info[0]
224        host_info.delete()
225      else:
226        raise MissingHostInfo('Cannot find/delete machine %s.' % m)
227
228  def AddLocalMachine(self, m):
229    """Adds a machine to the local AFE server.
230
231    Args:
232      m: The machine to be added.
233    """
234    if self.local_afe:
235      error_msg = 'Machine %s is not responding to ping.' % m
236      self.CheckMachine(m, error_msg)
237      host = self.local_afe.create_host(m)
238
239  def AddMachinesToLocalServer(self):
240    """Adds one or more machines to the local AFE server.
241
242    Verify that the requested machines are legal to add to the local server,
243    i.e. that they are not ChromeOS HW lab machines, and they are not already
244    on the local server.  Call AddLocalMachine for each valid machine.
245
246    Raises:
247      DuplicateAdd: Attempt to add a machine that is already on the server.
248      UpdateNonLocalMachine:  Attempt to add a ChromeOS HW lab machine.
249      UpdateServerError:  Something went wrong while attempting to add a
250        machine.
251    """
252    for m in self.machines:
253      if m in self.toolchain_lab_machines:
254        raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS HW '
255                                    'Lab.  Cannot add it to local server.' % m)
256      host_info = self.local_afe.get_hosts(hostname=m)
257      if host_info:
258        raise DuplicateAdd('Machine %s is already on the local server.' % m)
259      try:
260        self.AddLocalMachine(m)
261        self.logger.LogOutput('Successfully added %s to local server.' % m)
262      except Exception as e:
263        traceback.print_exc()
264        raise UpdateServerError('Error occurred while attempting to add %s. %s'
265                                % (m, str(e)))
266
267  def RemoveMachinesFromLocalServer(self):
268    """Removes one or more machines from the local AFE server.
269
270    Verify that the requested machines are legal to remove from the local
271    server, i.e. that they are not ChromeOS HW lab machines.  Call
272    RemoveLocalMachine for each valid machine.
273
274    Raises:
275      UpdateServerError:  Something went wrong while attempting to remove a
276        machine.
277    """
278    for m in self.machines:
279      if m in self.toolchain_lab_machines:
280        raise UpdateNonLocalMachine('Machine %s is in the ChromeOS HW Lab. '
281                                    'This script cannot remove lab machines.'
282                                    % m)
283      try:
284        self.RemoveLocalMachine(m)
285        self.logger.LogOutput('Successfully removed %s from local server.' % m)
286      except Exception as e:
287        traceback.print_exc()
288        raise UpdateServerError('Error occurred while attempting to remove %s '
289                                '(%s).' % (m, str(e)))
290
291  def ListMachineStates(self, machine_states):
292    """Gets and prints the current status for a list of machines.
293
294    Prints out the current status for all of the machines in the current
295    AFELockManager's list of machines (set when the object is initialized).
296
297    Args:
298      machine_states: A dictionary of the current state of every machine in
299        the current AFELockManager's list of machines.  Normally obtained by
300        calling AFELockManager::GetMachineStates.
301    """
302    local_machines = []
303    printed_hdr = False
304    for m in machine_states:
305      cros_name = m + '.cros'
306      if (m in self.toolchain_lab_machines or
307          cros_name in self.toolchain_lab_machines):
308        if not printed_hdr:
309          self.PrintStatusHeader(True)
310          printed_hdr = True
311        state = machine_states[m]
312        if state['locked']:
313          print ('%s (%s)\tlocked by %s since %s' %
314                 (m, state['board'], state['locked_by'], state['lock_time']))
315        else:
316          print '%s (%s)\tunlocked' % (m, state['board'])
317      else:
318        local_machines.append(m)
319
320    if local_machines:
321      self.PrintStatusHeader(False)
322      for m in local_machines:
323        state = machine_states[m]
324        if state['locked']:
325          print ('%s (%s)\tlocked by %s since %s' %
326                 (m, state['board'], state['locked_by'], state['lock_time']))
327        else:
328          print '%s (%s)\tunlocked' % (m, state['board'])
329
330
331  def UpdateLockInAFE(self, should_lock_machine, machine):
332    """Calls an AFE server to lock/unlock a machine.
333
334    Args:
335      should_lock_machine: Boolean indicating whether to lock the machine (True)
336        or unlock the machine (False).
337      machine: The machine to update.
338
339    Raises:
340      LockingError:  An error occurred while attempting to update the machine
341        state.
342    """
343    action = 'lock'
344    if not should_lock_machine:
345      action = 'unlock'
346    kwargs = {'locked': should_lock_machine}
347    kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user
348
349    if machine in self.toolchain_lab_machines:
350      m = machine.split('.')[0]
351      afe_server = self.afe
352    else:
353      m = machine
354      afe_server = self.local_afe
355
356    try:
357      afe_server.run('modify_hosts',
358                      host_filter_data={'hostname__in': [m]},
359                      update_data=kwargs)
360    except Exception as e:
361      traceback.print_exc()
362      raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e)))
363
364  def UpdateMachines(self, lock_machines):
365    """Sets the locked state of the machines to the requested value.
366
367    The machines updated are the ones in self.machines (specified when the
368    class object was intialized).
369
370    Args:
371      lock_machines:  Boolean indicating whether to lock the machines (True) or
372        unlock the machines (False).
373
374    Returns:
375      A list of the machines whose state was successfully updated.
376    """
377    updated_machines = []
378    for m in self.machines:
379      self.UpdateLockInAFE(lock_machines, m)
380
381      # Since we returned from self.UpdateLockInAFE we assume the request
382      # succeeded.
383      if lock_machines:
384        self.logger.LogOutput('Locked machine(s) %s.' % m)
385      else:
386        self.logger.LogOutput('Unlocked machine(s) %s.' % m)
387      updated_machines.append(m)
388
389    return updated_machines
390
391  def _InternalRemoveMachine(self, machine):
392    """Remove machine from internal list of machines.
393
394    Args:
395      machine: Name of machine to be removed from internal list.
396    """
397    # Check to see if machine is lab machine and if so, make sure it has
398    # ".cros" on the end.
399    cros_machine = machine
400    if machine.find('rack') > 0 and machine.find('row') > 0:
401      if machine.find('.cros') == -1:
402        cros_machine = cros_machine + '.cros'
403
404    self.machines = [m for m in self.machines if m != cros_machine and
405                     m != machine]
406
407  def CheckMachineLocks(self, machine_states, cmd):
408    """Check that every machine in requested list is in the proper state.
409
410    If the cmd is 'unlock' verify that every machine is locked by requestor.
411    If the cmd is 'lock' verify that every machine is currently unlocked.
412
413    Args:
414      machine_states: A dictionary of the current state of every machine in
415        the current AFELockManager's list of machines.  Normally obtained by
416        calling AFELockManager::GetMachineStates.
417      cmd:  'lock' or 'unlock'.  The user-requested action for the machines.
418
419    Raises:
420      DontOwnLock: The lock on a requested machine is owned by someone else.
421    """
422    for k, state in machine_states.iteritems():
423      if cmd == 'unlock':
424        if not state['locked']:
425          self.logger.LogWarning('Attempt to unlock already unlocked machine '
426                                 '(%s).' % k)
427          self._InternalRemoveMachine(k)
428
429        if state['locked'] and state['locked_by'] != self.user:
430          raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
431                            'else (%s).' % (k, state['locked_by']))
432      elif cmd == 'lock':
433        if state['locked']:
434          self.logger.LogWarning('Attempt to lock already locked machine (%s)' % k)
435          self._InternalRemoveMachine(k)
436
437  def HasAFEServer(self, local):
438    """Verifies that the AFELockManager has appropriate AFE server.
439
440    Args:
441      local: Boolean indicating whether we are checking for the local server
442        (True) or for the global server (False).
443
444    Returns:
445      A boolean indicating if the AFELockManager has the requested AFE server.
446    """
447    if local:
448      return self.local_afe is not None
449    else:
450      return self.afe is not None
451
452  def GetMachineStates(self, cmd=''):
453    """Gets the current state of all the requested machines.
454
455    Gets the current state of all the requested machines, both from the HW lab
456    sever and from the local server.  Stores the data in a dictionary keyed
457    by machine name.
458
459    Args:
460      cmd: The command for which we are getting the machine states. This is
461        important because if one of the requested machines is missing we raise
462        an exception, unless the requested command is 'add'.
463
464    Returns:
465      A dictionary of machine states for all the machines in the AFELockManager
466      object.
467
468    Raises:
469      NoAFEServer:  Cannot find the HW Lab or local AFE server.
470      AFEAccessError:  An error occurred when querying the server about a
471        machine.
472    """
473    if not self.HasAFEServer(False):
474      raise NoAFEServer('Error: Cannot connect to main AFE server.')
475
476    if self.local and not self.HasAFEServer(True):
477      raise NoAFEServer('Error: Cannot connect to local AFE server.')
478
479    machines = {}
480    for m in self.machines:
481      host_info = None
482      if m in self.toolchain_lab_machines:
483        mod_host = m.split('.')[0]
484        host_info = self.afe.get_hosts(hostname=mod_host)
485        if not host_info:
486          raise AFEAccessError('Unable to get information about %s from main'
487                               ' autotest server.' % m)
488      else:
489        host_info = self.local_afe.get_hosts(hostname=m)
490        if not host_info and cmd != 'add':
491          raise AFEAccessError('Unable to get information about %s from '
492                               'local autotest server.' % m)
493      if host_info:
494        host_info = host_info[0]
495        name = host_info.hostname
496        values = {}
497        values['board'] = host_info.platform if host_info.platform else '??'
498        values['locked'] = host_info.locked
499        if host_info.locked:
500            values['locked_by'] = host_info.locked_by
501            values['lock_time'] = host_info.lock_time
502        else:
503            values['locked_by'] = ''
504            values['lock_time'] = ''
505        machines[name] = values
506      else:
507        machines[m] = {}
508    return machines
509
510
def Main(argv):
  """Parse the options, initialize lock manager and dispatch proper method.

  For 'lock' and 'unlock', the current lock state of each machine is checked
  first unless --force was given; the machines are then updated in either
  case.

  Args:
    argv:  The options with which this script was invoked.

  Returns:
    0 unless an exception is raised.  Invalid options terminate the program
    through the argument parser's error handling (SystemExit).
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--list', dest='cmd', action='store_const',
                      const='status',
                      help='List current status of all known machines.')
  parser.add_argument('--lock', dest='cmd', action='store_const',
                      const='lock', help='Lock given machine(s).')
  parser.add_argument('--unlock', dest='cmd', action='store_const',
                      const='unlock', help='Unlock given machine(s).')
  parser.add_argument('--status', dest='cmd', action='store_const',
                      const='status',
                      help='List current status of given machine(s).')
  parser.add_argument('--add_machine', dest='cmd', action='store_const',
                      const='add',
                      help='Add machine to local machine server.')
  parser.add_argument('--remove_machine', dest='cmd',
                      action='store_const', const='remove',
                      help='Remove machine from the local machine server.')
  parser.add_argument('--nolocal', dest='local',
                      action='store_false', default=True,
                      help='Do not try to use local machine server.')
  parser.add_argument('--remote', dest='remote',
                      help='machines on which to operate')
  parser.add_argument('--chromeos_root', dest='chromeos_root', required=True,
                      help='ChromeOS root to use for autotest scripts.')
  parser.add_argument('--local_server', dest='local_server', default=None,
                      help='Alternate local autotest server to use.')
  parser.add_argument('--force', dest='force', action='store_true',
                      default=False,
                      help='Force lock/unlock of machines, even if not'
                      ' current lock owner.')

  options = parser.parse_args(argv)

  # Check for a missing command first so that the user is told about it
  # rather than about missing machines.
  if not options.cmd:
    parser.error('No operation selected (--list, --status, --lock, --unlock,'
                 ' --add_machine, --remove_machine).')

  if not options.remote and options.cmd != 'status':
    parser.error('No machines specified for operation.')

  if not os.path.isdir(options.chromeos_root):
    parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)

  machine_list = []
  if options.remote:
    machine_list = options.remote.split()

  lock_manager = AFELockManager(machine_list, options.force,
                                options.chromeos_root, options.local_server,
                                options.local)

  machine_states = lock_manager.GetMachineStates(cmd=options.cmd)
  cmd = options.cmd

  if cmd == 'status':
    lock_manager.ListMachineStates(machine_states)

  elif cmd in ('lock', 'unlock'):
    # --force only skips the ownership/state check; the update itself must
    # always run, otherwise a forced lock/unlock silently does nothing.
    if not lock_manager.force:
      lock_manager.CheckMachineLocks(machine_states, cmd)
    lock_manager.UpdateMachines(cmd == 'lock')

  elif cmd == 'add':
    lock_manager.AddMachinesToLocalServer()

  elif cmd == 'remove':
    lock_manager.RemoveMachinesFromLocalServer()

  return 0
596
597
if __name__ == '__main__':
  # Run only when executed as a script; Main's return value becomes the
  # process exit status.
  exit_status = Main(sys.argv[1:])
  sys.exit(exit_status)
600