# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class.  This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history.  At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa.  The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

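As a rough usage sketch (hedged:  `frontend_wrappers.RetryingAFE` and
the hostname below are illustrative assumptions, not part of this
module), a caller might report on a single DUT like this:

    # Hypothetical example; callers may construct the AFE differently.
    afe = frontend_wrappers.RetryingAFE()
    # start_time/end_time are epoch timestamps bounding the window.
    history = HostJobHistory.get_host_history(
            afe, 'chromeos1-row1-rack1-host1', start_time, end_time)
    diagnosis, task = history.last_diagnosis()
    if diagnosis == BROKEN:
        print 'DUT is broken; see %s' % task.job_url
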
35"""
36
37import common
38import os
39from autotest_lib.frontend import setup_django_environment
40from django.db import models as django_models
41
42from autotest_lib.client.common_lib import global_config
43from autotest_lib.client.common_lib import site_utils
44from autotest_lib.client.common_lib import time_utils
45from autotest_lib.frontend.afe import models as afe_models
46from autotest_lib.site_utils.suite_scheduler import constants
47
48
49# Values used to describe the diagnosis of a DUT.  These values are
50# used to indicate both DUT status after a job or task, and also
51# diagnosis of whether the DUT was working at the end of a given
52# time interval.
53#
54# UNUSED:  Used when there are no events recorded in a given
55#     time interval.
56# UNKNOWN:  For an individual event, indicates that the DUT status
57#     is unchanged from the previous event.  For a time interval,
58#     indicates that the DUT's status can't be determined from the
59#     DUT's history.
60# WORKING:  Indicates that the DUT was working normally after the
61#     event, or at the end of the time interval.
62# BROKEN:  Indicates that the DUT needed manual repair after the
63#     event, or at the end of the time interval.
64#
65UNUSED = 0
66UNKNOWN = 1
67WORKING = 2
68BROKEN = 3
69
70
71def parse_time(time_string):
72    """Parse time according to a canonical form.
73
74    The "canonical" form is the form in which date/time
75    values are stored in the database.
76
77    @param time_string Time to be parsed.
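
    For illustration (a hedged example; the exact canonical format is
    whatever `time_utils.TIME_FMT` specifies):

        parse_time('2015-01-28 22:09:15')  # -> epoch seconds, as int
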
78    """
79    return int(time_utils.to_epoch_time(time_string))
80
81
82class _JobEvent(object):
83    """Information about an event in host history.
84
    This remembers the relevant data from a single event in host
    history.  An event is any change in DUT state caused by a job
    or special task.  The data captured are the start and end times
    of the event, the URL of the logs for the job or task causing
    the event, and a diagnosis of whether the DUT was working or
    failed afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks.  This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    @property start_time  Time the job or task began execution.
    @property end_time    Time the job or task finished execution.
    @property id          id of the event in the AFE database.
    @property name        Name of the event, derived from the AFE database.
    @property job_status  Short string describing the event's final status.
    @property logdir      Relative path to the logs for the event's job.
    @property job_url     URL to the logs for the event's job.
    @property gs_url      GS URL to the logs for the event's job.
    @property job_id      id of the AFE job for HQEs.  None otherwise.
    @property diagnosis   Working status of the DUT after the event.
    @property is_special  Boolean indicating if the event is a special task.

    """

    get_config_value = global_config.global_config.get_config_value
    _LOG_URL_PATTERN = get_config_value('CROS', 'log_url_pattern')

    @classmethod
    def get_log_url(cls, afe_hostname, logdir):
        """Return a URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param afe_hostname Hostname for autotest frontend
        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

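        A hedged illustration (the pattern shown is an assumption;
        the real value comes from the `CROS/log_url_pattern` entry
        in the global config):

            # With a pattern of
            #   'http://%s/tko/retrieve_logs.cgi?job=/results/%s',
            # get_log_url('afe', '118-debug_user') would return
            #   'http://afe/tko/retrieve_logs.cgi?job=/results/118-debug_user'
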
127        """
128        return cls._LOG_URL_PATTERN % (afe_hostname, logdir)
129
130
131    @classmethod
132    def get_gs_url(cls, logdir):
133        """Return a GS URL to job results.
134
135        The URL is constructed from a base URL determined by the
136        global config, plus the relative path of the job's log
137        directory.
138
139        @param logdir Relative path of the results log directory.
140
141        @return A URL to the requested results log.
142
143        """
144        return os.path.join(site_utils.get_offload_gsuri(), logdir)
145
146
147    def __init__(self, start_time, end_time):
148        self.start_time = parse_time(start_time)
149        self.end_time = parse_time(end_time)
150
151
152    def __cmp__(self, other):
153        """Compare two jobs by their start time.
154
155        This is a standard Python `__cmp__` method to allow sorting
156        `_JobEvent` objects by their times.
157
158        @param other The `_JobEvent` object to compare to `self`.
159
160        """
161        return self.start_time - other.start_time
162
163
164    @property
165    def id(self):
166        """Return the id of the event in the AFE database."""
167        raise NotImplementedError()
168
169
170    @property
171    def name(self):
172        """Return the name of the event."""
173        raise NotImplementedError()
174
175
176    @property
177    def job_status(self):
178        """Return a short string describing the event's final status."""
179        raise NotImplementedError()
180
181
182    @property
183    def logdir(self):
184        """Return the relative path for this event's job logs."""
185        raise NotImplementedError()
186
187
188    @property
189    def job_url(self):
190        """Return the URL for this event's job logs."""
191        raise NotImplementedError()
192
193
194    @property
195    def gs_url(self):
196        """Return the GS URL for this event's job logs."""
197        raise NotImplementedError()
198
199
200    @property
201    def job_id(self):
202        """Return the id of the AFE job for HQEs.  None otherwise."""
203        raise NotImplementedError()
204
205
206    @property
207    def diagnosis(self):
208        """Return the status of the DUT after this event.
209
210        The diagnosis is interpreted as follows:
211          UNKNOWN - The DUT status was the same before and after
212              the event.
213          WORKING - The DUT appeared to be working after the event.
214          BROKEN - The DUT likely required manual intervention
215              after the event.
216
217        @return A valid diagnosis value.
218
219        """
220        raise NotImplementedError()
221
222
223    @property
224    def is_special(self):
225        """Return if the event is for a special task."""
226        raise NotImplementedError()
227
228
229class _SpecialTaskEvent(_JobEvent):
230    """`_JobEvent` adapter for special tasks.
231
232    This class wraps the standard `_JobEvent` interface around a row
233    in the `afe_special_tasks` table.
234
235    """
236
237    @classmethod
238    def get_tasks(cls, afe, host_id, start_time, end_time):
239        """Return special tasks for a host in a given time range.
240
241        Return a list of `_SpecialTaskEvent` objects representing all
242        special tasks that ran on the given host in the given time
243        range.  The list is ordered as it was returned by the query
244        (i.e. unordered).
245
246        @param afe         Autotest frontend
247        @param host_id     Database host id of the desired host.
248        @param start_time  Start time of the range of interest.
249        @param end_time    End time of the range of interest.
250
251        @return A list of `_SpecialTaskEvent` objects.
252
253        """
254        query_start = time_utils.epoch_time_to_date_string(start_time)
255        query_end = time_utils.epoch_time_to_date_string(end_time)
256        tasks = afe.get_host_special_tasks(
257                host_id,
258                time_started__gte=query_start,
259                time_finished__lte=query_end,
260                is_complete=1)
261        return [cls(afe.server, t) for t in tasks]
262
263
264    @classmethod
265    def get_status_task(cls, afe, host_id, end_time):
266        """Return the task indicating a host's status at a given time.
267
        The returned task is the one that determines the DUT's
        status; its `diagnosis` property gives the DUT's status as
        of the given `end_time`.

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param end_time    Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        query_end = time_utils.epoch_time_to_date_string(end_time)
        task = afe.get_host_status_task(host_id, query_end)
        return cls(afe.server, task) if task else None


    def __init__(self, afe_hostname, afetask):
        self._afe_hostname = afe_hostname
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def id(self):
        return self._afetask.id


    @property
    def name(self):
        return self._afetask.task


    @property
    def job_status(self):
        if self._afetask.is_aborted:
            return 'ABORTED'
        elif self._afetask.success:
            return 'PASS'
        else:
            return 'FAIL'


    @property
    def logdir(self):
        return ('hosts/%s/%s-%s' %
                (self._afetask.host.hostname, self._afetask.id,
                 self._afetask.task.lower()))


    @property
    def job_url(self):
        return _SpecialTaskEvent.get_log_url(self._afe_hostname, self.logdir)


    @property
    def gs_url(self):
        return _SpecialTaskEvent.get_gs_url(self.logdir)


    @property
    def job_id(self):
        return None


    @property
    def diagnosis(self):
        if self._afetask.success:
            return WORKING
        elif self._afetask.task == 'Repair':
            return BROKEN
        else:
            return UNKNOWN


    @property
    def is_special(self):
        return True


class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Return a list of `_TestJobEvent` objects representing all the
        HQEs of all the jobs that ran on the given host in the given
        time range.  The list is ordered as it was returned by the
        query (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        hqelist = afe.get_host_queue_entries(
                host_id=host_id,
                start_time=query_start,
                end_time=query_end,
                complete=1)
        return [cls(afe.server, hqe) for hqe in hqelist]


    def __init__(self, afe_hostname, hqe):
        self._afe_hostname = afe_hostname
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def id(self):
        return self._hqe.id


    @property
    def name(self):
        return self._hqe.job.name


    @property
    def job_status(self):
        return self._hqe.status


    @property
    def logdir(self):
        return _get_job_logdir(self._hqe.job)


    @property
    def job_url(self):
        return _TestJobEvent.get_log_url(self._afe_hostname, self.logdir)


    @property
    def gs_url(self):
        return _TestJobEvent.get_gs_url(self.logdir)


    @property
    def job_id(self):
        return self._hqe.job.id


    @property
    def diagnosis(self):
        return UNKNOWN


    @property
    def is_special(self):
        return False


class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval, as a unix
                          timestamp (epoch time).
                          This field may be `None`.
    @property end_time    End of the requested time interval, as a unix
                          timestamp (epoch time).
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

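    Iterating over a `HostJobHistory` yields its `_JobEvent` entries,
    ordered from most recent to least recent.  A hedged sketch
    (attribute names as documented for `_JobEvent` above):

        for event in history:
            print event.start_time, event.job_status, event.job_url
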
465    """
466
467    @classmethod
468    def get_host_history(cls, afe, hostname, start_time, end_time):
469        """Create a `HostJobHistory` instance for a single host.
470
        This factory method constructs a host's history from its
        hostname:  it looks up the host in the AFE database and
        passes the result to the class constructor.

        @param afe         Autotest frontend
        @param hostname    Name of the host.
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        afehost = afe.get_hosts(hostname=hostname)[0]
        return cls(afe, afehost, start_time, end_time)


    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time,
                               board=None, pool=None):
        """Create `HostJobHistory` instances for a set of hosts.

        The set of hosts can be specified as "all hosts of a given
        board type", "all hosts in a given pool", or "all hosts
        of a given board and pool".

        @param afe         Autotest frontend
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.
        @param board       All hosts must have this board type; if
                           `None`, all boards are allowed.
        @param pool        All hosts must be in this pool; if
                           `None`, all pools are allowed.

        @return A list of new `HostJobHistory` instances.

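        A hedged usage sketch (the board and pool names below are
        purely illustrative):

            histories = HostJobHistory.get_multiple_histories(
                    afe, start_time, end_time,
                    board='peppy', pool='bvt')
            for history in histories:
                diagnosis, task = history.last_diagnosis()
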
508        """
509        # If `board` or `pool` are both `None`, we could search the
510        # entire database, which is more expensive than we want.
511        # Our caller currently won't (can't) do this, but assert to
512        # be safe.
        assert board is not None or pool is not None
        labels = []
        if board is not None:
            labels.append(constants.Labels.BOARD_PREFIX + board)
        if pool is not None:
            labels.append(constants.Labels.POOL_PREFIX + pool)
        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]


    def __init__(self, afe, afehost, start_time, end_time):
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval."""
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return self._history.__iter__()


    def _extract_prefixed_label(self, prefix):
        """Return the part after `prefix` of the first matching label."""
        labels = [l for l in self._host.labels
                    if l.startswith(prefix)]
        return labels[0][len(prefix):] if labels else None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT.  Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
              `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
              the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

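        A hedged sketch of interpreting the result (the reporting
        shown is illustrative only):

            diagnosis, task = history.last_diagnosis()
            if diagnosis in (WORKING, UNUSED):
                print '%s is working' % history.hostname
            elif diagnosis == BROKEN:
                print '%s needs repair; see %s' % (
                        history.hostname, task.job_url)
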
656        """
657        self._init_status_task()
658        diagnosis = self._status_diagnosis
659        if (self.start_time is not None and
660                self._status_task is not None and
661                self._status_task.end_time < self.start_time and
662                diagnosis == WORKING):
663            diagnosis = UNUSED
664        return diagnosis, self._status_task
665
666
667def get_diagnosis_interval(host_id, end_time, success):
668    """Return the last diagnosis interval for a given host and time.
669
670    This routine queries the database for the special tasks on a
671    given host before a given time.  From those tasks it selects the
672    last status task before a change in status, and the first status
673    task after the change.  When `success` is true, the change must
674    be from "working" to "broken".  When false, the search is for a
675    change in the opposite direction.
676
677    A "successful status task" is any successful special task.  A
678    "failed status task" is a failed Repair task.  These criteria
679    are based on the definition of "status task" in the module-level
680    docstring, above.
681
682    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.
683
684    @param host_id     Database host id of the desired host.
685    @param end_time    Find the last eligible interval before this time.
686    @param success     Whether the eligible interval should start with a
687                       success or a failure.
688
689    @return A list containing the start time of the earliest job
690            selected, and the end time of the latest job.
691
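    For example (a purely hypothetical timeline):  if the host's
    status tasks before `end_time` were Verify (pass), Verify (pass),
    then Repair (fail), and `success` is true, the returned interval
    runs from the start of the second Verify through the end of the
    failed Repair.
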
692    """
693    base_query = afe_models.SpecialTask.objects.filter(
694            host_id=host_id, is_complete=True)
695    success_query = base_query.filter(success=True)
696    failure_query = base_query.filter(success=False, task='Repair')
697    if success:
698        query0 = success_query
699        query1 = failure_query
700    else:
701        query0 = failure_query
702        query1 = success_query
703    query0 = query0.filter(time_finished__lte=end_time)
704    query0 = query0.order_by('time_started').reverse()
705    if not query0:
706        return []
707    task0 = query0[0]
    query1 = query1.filter(time_finished__gt=task0.time_finished)
    query1 = query1.order_by('time_started')
    # Guard against the case where no status change followed `task0`.
    if not query1:
        return []
    task1 = query1[0]
    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]


def get_status_task(host_id, end_time):
    """Get the last status task for a host before a given time.

    This routine returns a Django query for the AFE database to find
    the last task that finished on the given host before the given
    time that was either a successful task, or a Repair task.  The
    query criteria are based on the definition of "status task" in
    the module-level docstring, above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id     Database host id of the desired host.
    @param end_time    End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # Selects status tasks:  any Repair task, or any successful task.
    status_tasks = (django_models.Q(task='Repair') |
                    django_models.Q(success=True))
    # Our caller needs a Django query set in order to serialize the
    # result, so we don't resolve the query here; we just return a
    # slice with at most one element.
    return afe_models.SpecialTask.objects.filter(
            status_tasks,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True).order_by('time_started').reverse()[0:1]


def _get_job_logdir(job):
    """Gets the logdir for an AFE job.

    @param job Job object which has id and owner properties.

    @return Relative path of the results log directory.
    """
    return '%s-%s' % (job.id, job.owner)


def get_job_gs_url(job):
    """Gets the GS URL for an AFE job.

    @param job Job object which has id and owner properties.

    @return Absolute GS URL to the results log directory.
    """
    return _JobEvent.get_gs_url(_get_job_logdir(job))