lab_inventory.py revision e39c827129e2a906eecbab6a6872da10ef8262bc
1#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
9board and pool, and determines whether each DUT is working or
10broken.  Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
13usage:  lab_inventory.py [ options ] [ board ... ]
14
15Options:
16--duration / -d <hours>
17    How far back in time to search job history to determine DUT
18    status.
19
20--board-notify <address>[,<address>]
21    Send the "board status" e-mail to all the specified e-mail
22    addresses.
23
24--pool-notify <address>[,<address>]
25    Send the "pool status" e-mail to all the specified e-mail
26    addresses.
27
28--recommend <number>
29    When generating the "board status" e-mail, include a list of
30    <number> specific DUTs to be recommended for repair.
31
32--logdir <directory>
33    Log progress and actions in a file under this directory.  Text
34    of any e-mail sent will also be logged in a timestamped file in
35    this directory.
36
37--debug
38    Suppress all logging and sending e-mail.  Instead, write the
39    output that would be generated onto stdout.
40
41<board> arguments:
42    With no arguments, gathers the status for all boards in the lab.
43    With one or more named boards on the command line, restricts
44    reporting to just those boards.
45
46"""
47
48
49import argparse
50import logging
51import logging.handlers
52import os
53import re
54import sys
55import time
56
57import common
58from autotest_lib.client.bin import utils
59from autotest_lib.client.common_lib import time_utils
60from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
61from autotest_lib.server.hosts import servo_host
62from autotest_lib.site_utils import gmail_lib
63from autotest_lib.site_utils import status_history
64from autotest_lib.site_utils.suite_scheduler import constants
65
66
# The pools in the Lab that are actually of interest.
#
# These are general purpose pools of DUTs that are considered
# identical for purposes of testing.  That is, a device in one of
# these pools can be shifted to another pool at will for purposes
# of supplying test demand.
#
# Devices in these pools are not allowed to have special-purpose
# attachments, or to be part of any kind of custom fixture.
# Devices in these pools are also required to reside in areas
# managed by the Platforms team (i.e. at the time of this writing,
# only in "Atlantis" or "Destiny").
#
# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
#     to guarantee timely completion of tests from builders.
# _SPARE_POOL - A low priority pool that is allowed to provide
#     spares to replace broken devices in the critical pools.
# _MANAGED_POOLS - The list of all the general purpose pools
#     monitored by this script:  the critical pools followed by
#     the spare pool.

_CRITICAL_POOLS = ['bvt', 'cq']
_SPARE_POOL = 'suites'
_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
90
# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time, in hours, to search in order
#     to determine DUT status.

_DEFAULT_DURATION = 24
97
# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option.  The full path is relative to the
#     root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages:  timestamp, level name (padded
#     to 10 columns), and message text, separated by ' | '.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'
112
# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
#
# The pattern captures four groups, in order:  the lab name
# ("chromeos" plus digits), the row number, the rack number, and
# the host number.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
123
124
125class _PoolCounts(object):
126    """Maintains a set of `HostJobHistory` objects for a pool.
127
128    The collected history objects are nominally all part of a single
129    scheduling pool of DUTs.  The collection maintains a list of
130    working DUTs, a list of broken DUTs, and a list of all DUTs.
131
132    Performance note:  Certain methods in this class are potentially
133    expensive:
134      * `get_working()`
135      * `get_working_list()`
136      * `get_broken()`
137      * `get_broken_list()`
138    The first time any one of these methods is called, it causes
139    multiple RPC calls with a relatively expensive set of database
140    queries.  However, the results of the queries are cached in the
141    individual `HostJobHistory` objects, so only the first call
142    actually pays the full cost.
143
144    Additionally, `get_working_list()` and `get_broken_list()` both
145    cache their return values to avoid recalculating lists at every
146    call; this caching is separate from the caching of RPC results
147    described above.
148
149    This class is deliberately constructed to delay the RPC cost
150    until the accessor methods are called (rather than to query in
151    `record_host()`) so that it's possible to construct a complete
152    `_LabInventory` without making the expensive queries at creation
153    time.  `_populate_board_counts()`, below, assumes this behavior.
154
155    """
156
157    def __init__(self):
158        self._histories = []
159        self._working_list = None
160        self._broken_list = None
161
162
163    def record_host(self, host_history):
164        """Add one `HostJobHistory` object to the collection.
165
166        @param host_history The `HostJobHistory` object to be
167                            remembered.
168
169        """
170        self._working_list = None
171        self._broken_list = None
172        self._histories.append(host_history)
173
174
175    def get_working_list(self):
176        """Return a list of all working DUTs in the pool.
177
178        Filter `self._histories` for histories where the last
179        diagnosis is `WORKING`.
180
181        Cache the result so that we only cacluate it once.
182
183        @return A list of HostJobHistory objects.
184
185        """
186        if self._working_list is None:
187            self._working_list = [h for h in self._histories
188                    if h.last_diagnosis()[0] == status_history.WORKING]
189        return self._working_list
190
191
192    def get_working(self):
193        """Return the number of working DUTs in the pool."""
194        return len(self.get_working_list())
195
196
197    def get_broken_list(self):
198        """Return a list of all broken DUTs in the pool.
199
200        Filter `self._histories` for histories where the last
201        diagnosis is not `WORKING`.
202
203        Cache the result so that we only cacluate it once.
204
205        @return A list of HostJobHistory objects.
206
207        """
208        if self._broken_list is None:
209            self._broken_list = [h for h in self._histories
210                    if h.last_diagnosis()[0] != status_history.WORKING]
211        return self._broken_list
212
213
214    def get_broken(self):
215        """Return the number of broken DUTs in the pool."""
216        return len(self.get_broken_list())
217
218
219    def get_total(self):
220        """Return the total number of DUTs in the pool."""
221        return len(self._histories)
222
223
224class _BoardCounts(object):
225    """Maintains a set of `HostJobHistory` objects for a board.
226
227    The collected history objects are nominally all of the same
228    board.  The collection maintains a count of working DUTs, a
229    count of broken DUTs, and a total count.  The counts can be
230    obtained either for a single pool, or as a total across all
231    pools.
232
233    DUTs in the collection must be assigned to one of the pools
234    in `_MANAGED_POOLS`.
235
236    The `get_working()` and `get_broken()` methods rely on the
237    methods of the same name in _PoolCounts, so the performance
238    note in _PoolCounts applies here as well.
239
240    """
241
242    def __init__(self):
243        self._pools = {
244            pool: _PoolCounts() for pool in _MANAGED_POOLS
245        }
246
247    def record_host(self, host_history):
248        """Add one `HostJobHistory` object to the collection.
249
250        @param host_history The `HostJobHistory` object to be
251                            remembered.
252
253        """
254        pool = host_history.host_pool
255        self._pools[pool].record_host(host_history)
256
257
258    def _count_pool(self, get_pool_count, pool=None):
259        """Internal helper to count hosts in a given pool.
260
261        The `get_pool_count` parameter is a function to calculate
262        the exact count of interest for the pool.
263
264        @param get_pool_count  Function to return a count from a
265                               _PoolCount object.
266        @param pool            The pool to be counted.  If `None`,
267                               return the total across all pools.
268
269        """
270        if pool is None:
271            return sum([get_pool_count(counts)
272                            for counts in self._pools.values()])
273        else:
274            return get_pool_count(self._pools[pool])
275
276
277    def get_working_list(self):
278        """Return a list of all working DUTs for the board.
279
280        Go through all HostJobHistory objects in the board's pools,
281        selecting the ones where the last diagnosis is `WORKING`.
282
283        @return A list of HostJobHistory objects.
284
285        """
286        l = []
287        for p in self._pools.values():
288            l.extend(p.get_working_list())
289        return l
290
291
292    def get_working(self, pool=None):
293        """Return the number of working DUTs in a pool.
294
295        @param pool  The pool to be counted.  If `None`, return the
296                     total across all pools.
297
298        @return The total number of working DUTs in the selected
299                pool(s).
300        """
301        return self._count_pool(_PoolCounts.get_working, pool)
302
303
304    def get_broken_list(self):
305        """Return a list of all broken DUTs for the board.
306
307        Go through all HostJobHistory objects in the board's pools,
308        selecting the ones where the last diagnosis is not
309        `WORKING`.
310
311        @return A list of HostJobHistory objects.
312
313        """
314        l = []
315        for p in self._pools.values():
316            l.extend(p.get_broken_list())
317        return l
318
319
320    def get_broken(self, pool=None):
321        """Return the number of broken DUTs in a pool.
322
323        @param pool  The pool to be counted.  If `None`, return the
324                     total across all pools.
325
326        @return The total number of broken DUTs in the selected pool(s).
327        """
328        return self._count_pool(_PoolCounts.get_broken, pool)
329
330
331    def get_spares_buffer(self):
332        """Return the the nominal number of working spares.
333
334        Calculates and returns how many working spares there would
335        be in the spares pool if all broken DUTs were in the spares
336        pool.  This number may be negative, indicating a shortfall
337        in the critical pools.
338
339        @return The total number DUTs in the spares pool, less the total
340                number of broken DUTs in all pools.
341        """
342        return self.get_total(_SPARE_POOL) - self.get_broken()
343
344
345    def get_total(self, pool=None):
346        """Return the total number of DUTs in a pool.
347
348        @param pool  The pool to be counted.  If `None`, return the
349                     total across all pools.
350
351        @return The total number of DUTs in the selected pool(s).
352        """
353        return self._count_pool(_PoolCounts.get_total, pool)
354
355
356class _LabInventory(dict):
357    """Collection of `HostJobHistory` objects for the Lab's inventory.
358
359    The collection is indexed by board.  Indexing returns the
360    _BoardCounts object associated with the board.
361
362    The collection is also iterable.  The iterator returns all the
363    boards in the inventory, in unspecified order.
364
365    """
366
367    @classmethod
368    def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
369        """Return a Lab inventory with specified parameters.
370
371        By default, gathers inventory from `HostJobHistory` objects
372        for all DUTs in the `_MANAGED_POOLS` list.  If `boardlist`
373        is supplied, the inventory will be restricted to only the
374        given boards.
375
376        @param afe         AFE object for constructing the
377                           `HostJobHistory` objects.
378        @param start_time  Start time for the `HostJobHistory`
379                           objects.
380        @param end_time    End time for the `HostJobHistory`
381                           objects.
382        @param boardlist   List of boards to include.  If empty,
383                           include all available boards.
384        @return A `_LabInventory` object for the specified boards.
385
386        """
387        label_list = [constants.Labels.POOL_PREFIX + l
388                          for l in _MANAGED_POOLS]
389        afehosts = afe.get_hosts(labels__name__in=label_list)
390        if boardlist:
391            boardhosts = []
392            for board in boardlist:
393                board_label = constants.Labels.BOARD_PREFIX + board
394                host_list = [h for h in afehosts
395                                  if board_label in h.labels]
396                boardhosts.extend(host_list)
397            afehosts = boardhosts
398        create = lambda host: (
399                status_history.HostJobHistory(afe, host,
400                                              start_time, end_time))
401        return cls([create(host) for host in afehosts])
402
403
404    def __init__(self, histories):
405        # N.B. The query that finds our hosts is restricted to those
406        # with a valid pool: label, but doesn't check for a valid
407        # board: label.  In some (insufficiently) rare cases, the
408        # AFE hosts table has been known to (incorrectly) have DUTs
409        # with a pool: but no board: label.  We explicitly exclude
410        # those here.
411        histories = [h for h in histories
412                     if h.host_board is not None]
413        boards = set([h.host_board for h in histories])
414        initval = { board: _BoardCounts() for board in boards }
415        super(_LabInventory, self).__init__(initval)
416        self._dut_count = len(histories)
417        self._managed_boards = None
418        for h in histories:
419            self[h.host_board].record_host(h)
420
421
422    def get_managed_boards(self):
423        """Return the set of "managed" boards.
424
425        Operationally, saying a board is "managed" means that the
426        board will be included in the "board" and "repair
427        recommendations" reports.  That is, if there are failures in
428        the board's inventory then lab techs will be asked to fix
429        them without a separate ticket.
430
431        For purposes of implementation, a board is "managed" if it
432        has DUTs in both the spare and a non-spare (i.e. critical)
433        pool.
434
435        @return A set of all the boards that have both spare and
436                non-spare pools.
437        """
438        if self._managed_boards is None:
439            self._managed_boards = set()
440            for board, counts in self.items():
441                spares = counts.get_total(_SPARE_POOL)
442                total = counts.get_total()
443                if spares != 0 and spares != total:
444                    self._managed_boards.add(board)
445        return self._managed_boards
446
447
448    def get_num_duts(self):
449        """Return the total number of DUTs in the inventory."""
450        return self._dut_count
451
452
453    def get_num_boards(self):
454        """Return the total number of boards in the inventory."""
455        return len(self)
456
457
458def _sort_by_location(inventory_list):
459    """Return a list of DUTs, organized by location.
460
461    Take the given list of `HostJobHistory` objects, separate it
462    into a list per lab, and sort each lab's list by location.  The
463    order of sorting within a lab is
464      * By row number within the lab,
465      * then by rack number within the row,
466      * then by host shelf number within the rack.
467
468    Return a list of the sorted lists.
469
470    Implementation note: host locations are sorted by converting
471    each location into a base 100 number.  If row, rack or
472    host numbers exceed the range [0..99], then sorting will
473    break down.
474
475    @return A list of sorted lists of DUTs.
476
477    """
478    BASE = 100
479    lab_lists = {}
480    for history in inventory_list:
481        location = _HOSTNAME_PATTERN.match(history.host.hostname)
482        if location:
483            lab = location.group(1)
484            key = 0
485            for idx in location.group(2, 3, 4):
486                key = BASE * key + int(idx)
487            lab_lists.setdefault(lab, []).append((key, history))
488    return_list = []
489    for dut_list in lab_lists.values():
490        dut_list.sort(key=lambda t: t[0])
491        return_list.append([t[1] for t in dut_list])
492    return return_list
493
494
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping board names to the
    size of the board's spares buffer.

    `repair_list` is a list of DUTs to be repaired.

    The score is based on the buffer counts that would result if
    the proposed repairs were made.  It combines two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts A dictionary mapping boards to buffer counts.
    @param repair_list   A list of `HostJobHistory` objects for the
                         DUTs to be repaired.
    @return A numeric score.

    """
    _NBOARDS = 1000
    repair_inventory = _LabInventory(repair_list)
    # Work out each board's buffer count as it would stand after
    # the repairs.  Board names are dropped, since they don't
    # contribute to the score.
    post_repair = []
    for board, buffer_size in buffer_counts.items():
        if board in repair_inventory:
            repaired = repair_inventory[board].get_total()
        else:
            repaired = 0
        post_repair.append(buffer_size + repaired)
    # After sorting, the worst (smallest) count is at the front;
    # count how many boards share it.
    post_repair.sort()
    worst_count = post_repair[0]
    num_worst = post_repair.count(worst_count)
    return _NBOARDS * worst_count - num_worst
545
546
547def _generate_repair_recommendation(inventory, num_recommend):
548    """Return a summary of selected DUTs needing repair.
549
550    Returns a message recommending a list of broken DUTs to be
551    repaired.  The list of DUTs is selected based on these
552    criteria:
553      * No more than `num_recommend` DUTs will be listed.
554      * All DUTs must be in the same lab.
555      * DUTs should be selected for some degree of physical
556        proximity.
557      * DUTs for boards with a low spares buffer are more important
558        than DUTs with larger buffers.
559
560    The algorithm used will guarantee that at least one DUT from a
561    board with the smallest spares buffer will be recommended.  If
562    the worst spares buffer number is shared by more than one board,
563    the algorithm will tend to prefer repair sets that include more
564    of those boards over sets that cover fewer boards.
565
566    @param inventory      Inventory for generating recommendations.
567    @param num_recommend  Number of DUTs to recommend for repair.
568
569    """
570    logging.debug('Creating DUT repair recommendations')
571    board_buffer_counts = {}
572    broken_list = []
573    for board in inventory.get_managed_boards():
574        logging.debug('Listing failed DUTs for %s', board)
575        counts = inventory[board]
576        if counts.get_broken() != 0:
577            board_buffer_counts[board] = counts.get_spares_buffer()
578            broken_list.extend(counts.get_broken_list())
579    # N.B. The logic inside this loop may seem complicated, but
580    # simplification is hard:
581    #   * Calculating an initial recommendation outside of
582    #     the loop likely would make things more complicated,
583    #     not less.
584    #   * It's necessary to calculate an initial lab slice once per
585    #     lab _before_ the while loop, in case the number of broken
586    #     DUTs in a lab is less than `num_recommend`.
587    recommendation = None
588    best_score = None
589    for lab_duts in _sort_by_location(broken_list):
590        start = 0
591        end = num_recommend
592        lab_slice = lab_duts[start : end]
593        lab_score = _score_repair_set(board_buffer_counts,
594                                      lab_slice)
595        while end < len(lab_duts):
596            start += 1
597            end += 1
598            new_slice = lab_duts[start : end]
599            new_score = _score_repair_set(board_buffer_counts,
600                                          new_slice)
601            if new_score > lab_score:
602                lab_slice = new_slice
603                lab_score = new_score
604        if recommendation is None or lab_score > best_score:
605            recommendation = lab_slice
606            best_score = lab_score
607    message = ['Repair recommendations:\n',
608               '%-30s %-16s %s' % (
609                       'Hostname', 'Board', 'Servo instructions')]
610    for h in recommendation:
611        servo_name = servo_host.make_servo_hostname(h.host.hostname)
612        if utils.host_is_in_lab_zone(servo_name):
613            servo_message = 'Repair servo first'
614        else:
615            servo_message = 'No servo present'
616        line = '%-30s %-16s %s' % (
617                h.host.hostname, h.host_board, servo_message)
618        message.append(line)
619    return '\n'.join(message)
620
621
622def _generate_board_inventory_message(inventory):
623    """Generate the "board inventory" e-mail message.
624
625    The board inventory is a list by board summarizing the number
626    of working and broken DUTs, and the total shortfall or surplus
627    of working devices relative to the minimum critical pool
628    requirement.
629
630    The report omits boards with no DUTs in the spare pool or with
631    no DUTs in a critical pool.
632
633    N.B. For sample output text formattted as users can expect to
634    see it in e-mail and log files, refer to the unit tests.
635
636    @param inventory  _LabInventory object with the inventory to
637                      be reported on.
638    @return String with the inventory message to be sent.
639
640    """
641    logging.debug('Creating board inventory')
642    nworking = 0
643    nbroken = 0
644    nbroken_boards = 0
645    summaries = []
646    for board in inventory.get_managed_boards():
647        logging.debug('Counting board inventory for %s', board)
648        counts = inventory[board]
649        # Summary elements laid out in the same order as the text
650        # headers:
651        #     Board Avail   Bad  Good Spare Total
652        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]
653        element = (board,
654                   counts.get_spares_buffer(),
655                   counts.get_broken(),
656                   counts.get_working(),
657                   counts.get_total(_SPARE_POOL),
658                   counts.get_total())
659        summaries.append(element)
660        nbroken += element[2]
661        nworking += element[3]
662        if element[2]:
663            nbroken_boards += 1
664    ntotal = nworking + nbroken
665    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
666    broken_percent = int(round(100.0 * nbroken / ntotal))
667    working_percent = 100 - broken_percent
668    message = ['Summary of DUTs in inventory:',
669               '%10s %10s %6s' % ('Bad', 'Good', 'Total'),
670               '%5d %3d%% %5d %3d%% %6d' % (
671                   nbroken, broken_percent,
672                   nworking, working_percent,
673                   ntotal),
674               '',
675               'Boards with failures: %d' % nbroken_boards,
676               'Boards in inventory:  %d' % len(summaries),
677               '', '',
678               'Full board inventory:\n',
679               '%-22s %5s %5s %5s %5s %5s' % (
680                   'Board', 'Avail', 'Bad', 'Good',
681                   'Spare', 'Total')]
682    message.extend(
683            ['%-22s %5d %5d %5d %5d %5d' % e for e in summaries])
684    return '\n'.join(message)
685
686
# Introductory text for the "pool inventory" e-mail message.  The
# leading backslash keeps the text from starting with a blank line.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies:  All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`.  Detailed
instructions can be found here:
    http://go/cros-manage-duts
'''
695
696
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory lists, for each critical pool, the boards
    that have at least one broken DUT in that pool, along with the
    number of broken and working DUTs and the pool total.  Within
    a pool, boards with the most broken DUTs are listed first.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(_CRITICAL_POOLS):
        # Every pool section after the first is set off from the
        # preceding one by an extra blank line.
        separator = '\n' if index else ''
        message.append(
            '%sStatus for pool:%s, by board:' % (separator, pool))
        message.append(
            '%-20s   %5s %5s %5s' % (
                'Board', 'Bad', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            broken = counts.get_broken(pool)
            if broken != 0:
                rows.append((board,
                             broken,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            message.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        message.extend('%-20s   %5d %5d %5d' % row for row in rows)
    return '\n'.join(message)
739
740
741def _send_email(arguments, tag, subject, recipients, body):
742    """Send an inventory e-mail message.
743
744    The message is logged in the selected log directory using `tag`
745    for the file name.
746
747    If the --print option was requested, the message is neither
748    logged nor sent, but merely printed on stdout.
749
750    @param arguments   Parsed command-line options.
751    @param tag         Tag identifying the inventory for logging
752                       purposes.
753    @param subject     E-mail Subject: header line.
754    @param recipients  E-mail addresses for the To: header line.
755    @param body        E-mail message body.
756
757    """
758    logging.debug('Generating email: "%s"', subject)
759    all_recipients = ', '.join(recipients)
760    report_body = '\n'.join([
761            'To: %s' % all_recipients,
762            'Subject: %s' % subject,
763            '', body, ''])
764    if arguments.debug:
765        print report_body
766    else:
767        filename = os.path.join(arguments.logdir, tag)
768        try:
769            report_file = open(filename, 'w')
770            report_file.write(report_body)
771            report_file.close()
772        except EnvironmentError as e:
773            logging.error('Failed to write %s:  %s', filename, e)
774        try:
775            gmail_lib.send_email(all_recipients, subject, body)
776        except Exception as e:
777            logging.error('Failed to send e-mail to %s:  %s',
778                          all_recipients, e)
779
780
781def _separate_email_addresses(address_list):
782    """Parse a list of comma-separated lists of e-mail addresses.
783
784    @param address_list  A list of strings containing comma
785                         separate e-mail addresses.
786    @return A list of the individual e-mail addresses.
787
788    """
789    newlist = []
790    for arg in address_list:
791        newlist.extend([email.strip() for email in arg.split(',')])
792    return newlist
793
794
def _verify_arguments(arguments):
    """Validate command-line arguments.

    Combine the comma separated e-mail addresses given in separate
    `--board-notify` and `--pool-notify` option arguments into a
    single list per option.

    For non-debug uses, require that notification be requested for
    at least one report.  For debug, if notification isn't specified,
    treat it as "run all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False
830
831
def _get_logdir(script):
    """Compute the default value for the `--logdir` option.

    The result is the `_LOGDIR` subdirectory of the parent of the
    directory that holds the script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_path = os.path.abspath(script)
    # Two dirname() calls: first to the script's own directory,
    # then up to that directory's parent.
    parent_dir = os.path.dirname(os.path.dirname(script_path))
    return os.path.join(parent_dir, _LOGDIR)
845
846
def _parse_command(argv):
    """Build the argument parser and parse the command line.

    The parsed arguments are also passed through
    `_verify_arguments()`; if that check fails, no arguments are
    returned.

    @param argv Standard command line argument vector; argv[0] is
                assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if the arguments failed verification.

    """
    command_name = argv[0]
    parser = argparse.ArgumentParser(
            prog=command_name,
            description='Gather and report lab inventory statistics')

    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument(
            '-d', '--duration', type=int, default=_DEFAULT_DURATION,
            metavar='HOURS', help=duration_help)
    parser.add_argument(
            '--board-notify', action='append', default=[],
            metavar='ADDRESS',
            help=('Generate board inventory message, '
                  'and send it to the given e-mail address(es)'))
    parser.add_argument(
            '--pool-notify', action='append', default=[],
            metavar='ADDRESS',
            help=('Generate pool inventory message, '
                  'and send it to the given address(es)'))
    parser.add_argument(
            '-r', '--recommend', type=int, default=None,
            help=('Specify how many DUTs should be '
                  'recommended for repair (default: no '
                  'recommendation)'))
    parser.add_argument(
            '--debug', action='store_true',
            help=('Print e-mail messages on stdout '
                  'without sending them.'))
    parser.add_argument(
            '--logdir', default=_get_logdir(command_name),
            help='Directory where logs will be written.')
    parser.add_argument(
            'boardnames', nargs='*', metavar='BOARD',
            help=('names of boards to report on '
                  '(default: all boards)'))

    arguments = parser.parse_args(argv[1:])
    return arguments if _verify_arguments(arguments) else None
891
892
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was
    provided on the command line.  Without the option, we log all
    messages at DEBUG level or above, and write them to a file in
    the directory specified by the `--logdir` option.  With the
    option, we write log messages to stdout; messages below INFO
    level are discarded.

    The log file is configured to rotate once a week (`when='W4'`,
    i.e. on Friday), keeping 13 backups, or ~3 months worth of
    history.

    Every handler previously installed on the root logger is
    removed, so that on return the handler created here is the
    only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates
    # `root_logger.handlers`, and removing entries from a list while
    # iterating over it skips every other entry.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
931
932
def _populate_board_counts(inventory):
    """Gather board counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, a human being may be watching the
    progress.  So, we force the first (expensive) queries to happen
    up front, and provide a small ASCII progress bar to give an
    indicator of how many boards have been processed.

    One character is written to stdout per board:  '+' for every
    board whose count ends in 5, the tens digit for every tenth
    board, and '.' otherwise.  A final line reports the total
    number of broken DUTs found.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    total_broken = 0
    for n, counts in enumerate(inventory.values(), 1):
        if n % 10 == 5:
            c = '+'
        elif n % 10 == 0:
            # Floor division keeps this an integer tens digit even
            # where `/` means true division.
            c = '%d' % ((n // 10) % 10)
        else:
            c = '.'
        sys.stdout.write(c)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a board's HostJobHistory objects to query the database
        # and cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)
967
968
def main(argv):
    """Run the inventory and send the requested reports.

    Exits with status 1 if the command line can't be used.
    Otherwise, configures logging, builds the inventory for the
    time window ending now and reaching back `--duration` hours,
    and sends the board and/or pool report for each non-empty
    notify list.  A keyboard interrupt ends the run quietly; any
    other exception is logged rather than propagated.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        end_time = int(time.time())
        seconds_back = arguments.duration * 60 * 60
        start_time = end_time - seconds_back
        # The timestamp has one-hour granularity; it labels both
        # the e-mail subjects and the saved message files.
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.debug:
            _populate_board_counts(inventory)

        if arguments.board_notify:
            # The optional repair recommendation goes ahead of the
            # board inventory in the same message.
            recommendation = ''
            if arguments.recommend:
                recommendation = _generate_repair_recommendation(
                        inventory, arguments.recommend) + '\n\n\n'
            board_inventory = _generate_board_inventory_message(
                    inventory)
            _send_email(arguments,
                        'boards-%s.txt' % timestamp,
                        'DUT board inventory %s' % timestamp,
                        arguments.board_notify,
                        recommendation + board_inventory)

        if arguments.pool_notify:
            pool_inventory = _generate_pool_inventory_message(
                    inventory)
            _send_email(arguments,
                        'pools-%s.txt' % timestamp,
                        'DUT pool inventory %s' % timestamp,
                        arguments.pool_notify,
                        pool_inventory)
    except KeyboardInterrupt:
        # Interrupted from the keyboard: stop without logging.
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
1025
1026
# Script entry point: when run directly (not imported), hand the
# full argument vector, including the command name, to main().
if __name__ == '__main__':
    main(sys.argv)
1029