1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import datetime
6import logging
7import time
8
9import common
10
11from autotest_lib.client.common_lib import base_job
12from autotest_lib.client.common_lib import error
13from autotest_lib.client.common_lib import priorities
14from autotest_lib.client.common_lib import time_utils
15from autotest_lib.client.common_lib import utils
16from autotest_lib.client.common_lib.cros import dev_server
17from autotest_lib.server.cros import provision
18from autotest_lib.server.cros.dynamic_suite import constants
19from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
20from autotest_lib.server.cros.dynamic_suite import tools
21from autotest_lib.server.cros.dynamic_suite.suite import Suite
22from autotest_lib.tko import utils as tko_utils
23
24
25"""CrOS dynamic test suite generation and execution module.
26
27This module implements runtime-generated test suites for CrOS.
28Design doc: http://goto.google.com/suitesv2
29
30Individual tests can declare themselves as a part of one or more
31suites, and the code here enables control files to be written
32that can refer to these "dynamic suites" by name.  We also provide
33support for reimaging devices with a given build and running a
34dynamic suite across all reimaged devices.
35
36The public API for defining a suite includes one method: reimage_and_run().
37A suite control file can be written by importing this module and making
38an appropriate call to this single method.  In normal usage, this control
39file will be run in a 'hostless' server-side autotest job, scheduling
40sub-jobs to do the needed reimaging and test running.
41
42Example control file:
43
44import common
45from autotest_lib.server.cros import provision
46from autotest_lib.server.cros.dynamic_suite import dynamic_suite
47
48dynamic_suite.reimage_and_run(
49    builds={provision.CROS_VERSION_PREFIX: build}, board=board, name='bvt',
50    job=job, pool=pool, check_hosts=check_hosts, add_experimental=True, num=num,
51    devserver_url=devserver_url)
52
53This will -- at runtime -- find all control files that contain "bvt" in their
54"SUITE=" clause, schedule jobs to reimage |num| or less devices in the
55specified pool of the specified board with the specified build and, upon
56completion of those jobs, schedule and wait for jobs that run all the tests it
57discovered.
58
59Suites can be run by using the atest command-line tool:
60  atest suite create -b <board> -i <build/name> <suite>
61e.g.
62  atest suite create -b x86-mario -i x86-mario/R20-2203.0.0 bvt
63
64-------------------------------------------------------------------------
65Implementation details
66
67A Suite instance represents a single test suite, defined by some predicate
68run over all known control files.  The simplest example is creating a Suite
69by 'name'.
70
71create_suite_job() takes the parameters needed to define a suite run (board,
72build to test, machine pool, and which suite to run), ensures important
preconditions are met, finds the appropriate suite control file, and then
74schedules the hostless job that will do the rest of the work.
75
76Note that we have more than one Dev server in our test lab architecture.
77We currently load balance per-build being tested, so one and only one dev
78server is used by any given run through the reimaging/testing flow.
79
80- create_suite_job()
81The primary role of create_suite_job() is to ensure that the required
82artifacts for the build to be tested are staged on the dev server.  This
83includes payloads required to autoupdate machines to the desired build, as
84well as the autotest control files appropriate for that build.  Then, the
85RPC pulls the control file for the suite to be run from the dev server and
86uses it to create the suite job with the autotest frontend.
87
88     +----------------+
89     | Google Storage |                                Client
90     +----------------+                                   |
91               | ^                                        | create_suite_job()
92 payloads/     | |                                        |
93 control files | | request                                |
94               V |                                        V
95       +-------------+   download request    +--------------------------+
96       |             |<----------------------|                          |
97       | Dev Server  |                       | Autotest Frontend (AFE)  |
98       |             |---------------------->|                          |
99       +-------------+  suite control file   +--------------------------+
100                                                          |
101                                                          V
102                                                      Suite Job (hostless)
103
104- Reimage and Run
105The overall process is to schedule all the tests, and then wait for the tests
106to complete.
107
108- The Reimaging Process
109
110As an artifact of an old implementation, the number of machines to use
111is called the 'sharding_factor', and the default is defined in the [CROS]
112section of global_config.ini.  This can be overridden by passing a 'num=N'
113parameter to create_suite_job(), which is piped through to reimage_and_run()
just like the 'build' and 'board' parameters are.  With provisioning, however,
this machine accounting has been neither implemented nor removed, so 'num' is
still passed around in case it is used one day.
117
118A test control file can specify a list of DEPENDENCIES, which are really just
119the set of labels a host needs to have in order for that test to be scheduled
120on it.  In the case of a dynamic_suite, many tests in the suite may have
121DEPENDENCIES specified.  All tests are scheduled with the DEPENDENCIES that
122they specify, along with any suite dependencies that were specified, and the
123scheduler will find and provision a host capable of running the test.
124
125- Scheduling Suites
126A Suite instance uses the labels specified in the suite dependencies to
127schedule tests across all the hosts in the pool.  It then waits for all these
128jobs.  As an optimization, the Dev server stages the payloads necessary to
129run a suite in the background _after_ it has completed all the things
130necessary for reimaging.  Before running a suite, reimage_and_run() calls out
131to the Dev server and blocks until it's completed staging all build artifacts
132needed to run test suites.
133
134Step by step:
1350) At instantiation time, find all appropriate control files for this suite
136   that were included in the build to be tested.  To do this, we consult the
137   Dev Server, where all these control files are staged.
138
139          +------------+    control files?     +--------------------------+
140          |            |<----------------------|                          |
141          | Dev Server |                       | Autotest Frontend (AFE)  |
142          |            |---------------------->|       [Suite Job]        |
143          +------------+    control files!     +--------------------------+
144
1451) Now that the Suite instance exists, it schedules jobs for every control
146   file it deemed appropriate, to be run on the hosts that were labeled
147   by the provisioning.  We stuff keyvals into these jobs, indicating what
148   build they were testing and which suite they were for.
149
150   +--------------------------+ Job for VersLabel       +--------+
151   |                          |------------------------>| Host 1 | VersLabel
152   | Autotest Frontend (AFE)  |            +--------+   +--------+
153   |       [Suite Job]        |----------->| Host 2 |
154   +--------------------------+ Job for    +--------+
155       |                ^       VersLabel        VersLabel
156       |                |
157       +----------------+
158        One job per test
159        {'build': build/name,
160         'suite': suite_name}
161
1622) Now that all jobs are scheduled, they'll be doled out as labeled hosts
163   finish their assigned work and become available again.
164
165- Waiting on Suites
1660) As we clean up each test job, we check to see if any crashes occurred.  If
167   they did, we look at the 'build' keyval in the job to see which build's debug
168   symbols we'll need to symbolicate the crash dump we just found.
169
1701) Using this info, we tell a special Crash Server to stage the required debug
171   symbols. Once that's done, we ask the Crash Server to use those symbols to
172   symbolicate the crash dump in question.
173
174     +----------------+
175     | Google Storage |
176     +----------------+
177          |     ^
178 symbols! |     | symbols?
179          V     |
180      +------------+  stage symbols for build  +--------------------------+
181      |            |<--------------------------|                          |
182      |   Crash    |                           |                          |
183      |   Server   |   dump to symbolicate     | Autotest Frontend (AFE)  |
184      |            |<--------------------------|       [Suite Job]        |
185      |            |-------------------------->|                          |
186      +------------+    symbolicated dump      +--------------------------+
187
1882) As jobs finish, we record their success or failure in the status of the suite
189   job.  We also record a 'job keyval' in the suite job for each test, noting
190   the job ID and job owner.  This can be used to refer to test logs later.
1913) Once all jobs are complete, status is recorded for the suite job, and the
192   job_repo_url host attribute is removed from all hosts used by the suite.
193
194"""
195
196
# Default try-job timeout (in minutes, going by the name).  The value is
# computed once, at import time, by calling tools.try_job_timeout_mins().
# NOTE(review): nothing in the code visible here reads this constant; it is
# presumably consumed by modules that import this one -- verify before removing.
DEFAULT_TRY_JOB_TIMEOUT_MINS = tools.try_job_timeout_mins()
198
199# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.
200
class SuiteSpec(object):
    """
    This class contains the info that defines a suite run.

    Currently required:
    @var builds: dict mapping a provision version prefix to the build to
                 install, e.g.
                 {'cros-version:': 'x86-alex-release/R18-1655.0.0-a1-b1584'}.
                 The dict passed to the constructor is stored as-is, and the
                 builds under the prefixes in _VERSION_PREFIXES are replaced
                 in place by their devserver translation.
    @var board: which kind of devices to reimage, stored as 'board:<board>'.
    @var devserver: An instance of the devserver to use with this suite.
    @var name: a value of the SUITE control file variable to search for.
    @var job: an instance of client.common_lib.base_job representing the
                currently running suite job.

    Currently supported optional fields:
    @var pool: specify the pool of machines to use for scheduling purposes,
               stored as 'pool:<pool>' when a pool was given.
               Default: None
    @var num: the maximum number of devices to reimage.
              Default in global_config
    @var check_hosts: require appropriate hosts to be available now.
    @var add_experimental: schedule experimental tests as well, or not.
                           Default: True
    @var dependencies: map of test names to dependency lists.
                       Initially {'': []}.
    @var suite_dependencies: A list of suite level dependencies, which act
                             just like test dependencies and are appended to
                             each test's set of dependencies at job creation
                             time.  One provision label per entry of builds
                             is appended to it.
    @var predicate: A function mapping ControlData objects to True if they
                    should be included in suite.  When none is supplied, one
                    based on the SUITE field of control files is used.
    @var test_args: A dict of args passed all the way to each individual test
                    that will be actually ran.
    @var skip_reimage: Deprecated.  False unless a legacy caller explicitly
                       passed skip_reimage=<value>.
    """

    # Keyword arguments that must be supplied, non-empty, with this type.
    _REQUIRED_KEYWORDS = {
            'board': str,
            'builds': dict,
            'name': str,
            'job': base_job.base_job,
            'devserver_url': str,
    }

    # Prefixes in |builds| whose build names get translated by the devserver.
    _VERSION_PREFIXES = frozenset((
            provision.CROS_VERSION_PREFIX,
            provision.ANDROID_BUILD_VERSION_PREFIX,
    ))

    def __init__(
            self,
            builds=None,
            board=None,
            name=None,
            job=None,
            pool=None,
            num=None,
            check_hosts=True,
            add_experimental=True,
            file_bugs=False,
            file_experimental_bugs=False,
            max_runtime_mins=24*60,
            timeout=24,
            timeout_mins=None,
            suite_dependencies=None,
            bug_template=None,
            devserver_url=None,
            priority=priorities.Priority.DEFAULT,
            predicate=None,
            wait_for_results=True,
            job_retry=False,
            max_retries=None,
            offload_failures_only=False,
            test_source_build=None,
            run_prod_code=False,
            delay_minutes=0,
            job_keyvals=None,
            test_args=None,
            **dargs):
        """
        Vets arguments for reimage_and_run() and populates self with supplied
        values.

        Currently required args:
        @param board: which kind of devices to reimage.
        @param name: a value of the SUITE control file variable to search for.
        @param job: an instance of client.common_lib.base_job representing the
                    currently running suite job.
        @param devserver_url: url to the selected devserver.
        @param builds: the builds to install e.g.
                       {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                        'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}

        Currently supported optional args:
        @param test_source_build: Build that contains the server-side test code,
                e.g., it can be the value of builds['cros-version:'] or
                builds['fw-version:']. Default is None, that is, use
                the server-side test code from builds['cros-version:']
        @param pool: specify the pool of machines to use for scheduling purposes
                     Default: None
        @param num: the maximum number of devices to reimage.
                    Default in global_config
        @param check_hosts: require appropriate hosts to be available now.
        @param add_experimental: schedule experimental tests as well, or not.
                                 Default: True
        @param file_bugs: File bugs when tests in this suite fail.
                          Default: False
        @param file_experimental_bugs: File bugs when experimental tests in
                                       this suite fail.
                                       Default: False
        @param max_runtime_mins: Max runtime in mins for each of the sub-jobs
                                 this suite will run.
        @param timeout: Max lifetime in hours for each of the sub-jobs that
                        this suite run.
        @param timeout_mins: Max lifetime in minutes for each of the sub-jobs.
                             When falsy (None or 0), timeout * 60 is used.
        @param suite_dependencies: A list of strings of suite level
                                   dependencies, which act just like test
                                   dependencies and are appended to each test's
                                   set of dependencies at job creation time.
                                   A string of comma separated labels is
                                   accepted for backwards compatibility.
                                   The argument itself is never modified.
        @param bug_template: A template dictionary specifying the default bug
                             filing options for failures in this suite.
        @param priority: Integer priority level.  Higher is more important.
        @param predicate: Optional argument. If present, should be a function
                          mapping ControlData objects to True if they should be
                          included in suite. If argument is absent, suite
                          behavior will default to creating a suite of based
                          on the SUITE field of control files.
        @param wait_for_results: Set to False to run the suite job without
                                 waiting for test jobs to finish. Default is
                                 True.
        @param job_retry: Set to True to enable job-level retry. Default is
                          False.
        @param max_retries: Maximum retry limit at suite level.
                            Regardless how many times each individual test
                            has been retried, the total number of retries
                            happening in the suite can't exceed _max_retries.
                            Default to None, no max.
        @param offload_failures_only: Only enable gs_offloading for failed
                                      jobs.
        @param run_prod_code: If true, the suite will run the test code that
                              lives in prod aka the test code currently on the
                              lab servers.
        @param delay_minutes: Delay the creation of test jobs for a given number
                              of minutes.
        @param job_keyvals: General job keyvals to be inserted into keyval file
        @param test_args: A dict of args passed all the way to each individual
                          test that will be actually ran.
        @param **dargs: these arguments will be ignored (apart from a legacy
                        'skip_reimage' value, which is recorded on the spec).
                        This allows us to deprecate and remove arguments in
                        ToT while not breaking branch builds.

        @raises SuiteArgumentException: if a required argument is missing,
                                        empty or of the wrong type.
        """
        self._check_init_params(
                board=board,
                builds=builds,
                name=name,
                job=job,
                devserver_url=devserver_url)

        self.board = 'board:%s' % board
        self.builds = builds
        self.name = name
        self.job = job
        self.pool = ('pool:%s' % pool) if pool else pool
        self.num = num
        self.check_hosts = check_hosts
        # Deprecated flag.  It is no longer a named parameter, so the bare
        # name |skip_reimage| used to resolve to the module-level function of
        # the same name and left a (truthy) function object on the spec.
        # Honor an explicitly passed legacy value, otherwise default to False.
        self.skip_reimage = dargs.get('skip_reimage', False)
        self.add_experimental = add_experimental
        self.file_bugs = file_bugs
        self.file_experimental_bugs = file_experimental_bugs
        self.dependencies = {'': []}
        self.max_runtime_mins = max_runtime_mins
        self.timeout = timeout
        self.timeout_mins = timeout_mins or timeout * 60
        self.bug_template = {} if bug_template is None else bug_template
        self.priority = priority
        self.wait_for_results = wait_for_results
        self.job_retry = job_retry
        self.max_retries = max_retries
        self.offload_failures_only = offload_failures_only
        self.run_prod_code = run_prod_code
        self.delay_minutes = delay_minutes
        self.job_keyvals = job_keyvals
        self.test_args = test_args

        # Order matters: the devserver must exist before anything is
        # translated, and builds are translated before they are turned into
        # suite dependencies.
        self._init_predicate(predicate)
        self._init_suite_dependencies(suite_dependencies)
        self._init_devserver(devserver_url)
        self._init_test_source_build(test_source_build)
        self._translate_builds()
        self._add_builds_to_suite_deps()

    def _check_init_params(self, **kwargs):
        """Verify that every required keyword has a usable value.

        @param kwargs: the candidate values, keyed like _REQUIRED_KEYWORDS.

        @raises SuiteArgumentException: if a value is missing, falsy or not
                                        an instance of the expected type.
        """
        for key, expected_type in self._REQUIRED_KEYWORDS.items():
            value = kwargs.get(key)
            # TODO(ayatane): `not value` includes both the cases where value is
            # None and where value is the correct type, but empty (e.g., empty
            # dict).  It looks like this is NOT the intended behavior, but I'm
            # hesitant to remove it in case something is actually relying on
            # this behavior.
            if not value or not isinstance(value, expected_type):
                raise error.SuiteArgumentException(
                        'reimage_and_run() needs %s=<%r>'
                        % (key, expected_type))

    def _init_predicate(self, predicate):
        """Initialize predicate attribute.

        @param predicate: the caller's predicate, or None to select control
                          files by suite name.
        """
        if predicate is None:
            self.predicate = Suite.name_in_tag_predicate(self.name)
        else:
            self.predicate = predicate

    def _init_suite_dependencies(self, suite_dependencies):
        """Initialize suite dependencies attribute.

        The attribute always ends up as a list owned by this object, so that
        extending it later cannot leak into the caller's data.

        @param suite_dependencies: None, a comma separated string of labels,
                                   or an iterable of labels.
        """
        if suite_dependencies is None:
            self.suite_dependencies = []
        elif isinstance(suite_dependencies, str):
            stripped = (dep.strip(' ') for dep in suite_dependencies.split(','))
            # ''.split(',') is [''], and 'a,' ends with an empty piece; an
            # empty label is never a meaningful dependency, so drop those.
            self.suite_dependencies = [dep for dep in stripped if dep]
        else:
            # Copy: _add_builds_to_suite_deps() extends this list in place.
            self.suite_dependencies = list(suite_dependencies)

    def _init_devserver(self, devserver_url):
        """Initialize devserver attribute.

        An AndroidBuildServer is used when builds contains an Android build
        version, an ImageServer otherwise.

        @param devserver_url: url to the selected devserver.
        """
        if provision.ANDROID_BUILD_VERSION_PREFIX in self.builds:
            self.devserver = dev_server.AndroidBuildServer(devserver_url)
        else:
            self.devserver = dev_server.ImageServer(devserver_url)

    def _init_test_source_build(self, test_source_build):
        """Initialize test_source_build attribute.

        @param test_source_build: explicitly requested build, or a falsy value
                                  to let Suite.get_test_source_build() pick
                                  one from builds.
        """
        if test_source_build:
            test_source_build = self.devserver.translate(test_source_build)

        self.test_source_build = Suite.get_test_source_build(
                self.builds, test_source_build=test_source_build)

    def _translate_builds(self):
        """Translate build names if they are in LATEST format.

        Only the prefixes in _VERSION_PREFIXES are translated.  Note that
        self.builds is the dict the caller passed in, so it is updated in
        place.
        """
        for prefix in self._VERSION_PREFIXES:
            if prefix in self.builds:
                translated_build = self.devserver.translate(
                        self.builds[prefix])
                self.builds[prefix] = translated_build

    def _add_builds_to_suite_deps(self):
        """Add builds to suite_dependencies.

        To support provision both CrOS and firmware, option builds are added to
        SuiteSpec, e.g.,

        builds = {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                  'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}

        version_prefix+build should make it into each test as a DEPENDENCY.
        The easiest way to do this is to tack it onto the suite_dependencies.
        """
        self.suite_dependencies.extend(
                provision.join(version_prefix, build)
                for version_prefix, build in self.builds.items()
        )
463
464
def skip_reimage(g):
    """
    Report whether reimaging should be skipped; the answer is always False.

    The argument is accepted but never inspected, so no SKIP_IMAGE value is
    actually looked up.  (Presumably kept so that callers which still pass
    their globals() keep working -- confirm before removing.)

    @param g: The global variables dictionary.  Unused.
    @return:  False, unconditionally.
    """
    return False
472
473
def reimage_and_run(**dargs):
    """
    Backward-compatible API for dynamic_suite.

    Builds a SuiteSpec from the keyword arguments, creates the AFE and TKO
    frontends on behalf of the suite job's user, works out this suite job's
    own id from the job tag, and hands everything to
    _perform_reimage_and_run().

    @param dargs: Dictionary containing the arguments listed below; it is
                  passed unchanged to SuiteSpec().

    Currently required args (enforced by SuiteSpec):
    @param board: which kind of devices to reimage.
    @param name: a value of the SUITE control file variable to search for.
    @param job: an instance of client.common_lib.base_job representing the
                currently running suite job.
    @param builds: the builds to install e.g.
                   {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                    'fw-version:':  'x86-alex-firmware/R36-5771.50.0'}
    @param devserver_url: url to the selected devserver.

    Currently supported optional args:
    @param pool: specify the pool of machines to use for scheduling purposes.
                 Default: None
    @param num: the maximum number of devices to reimage.
                Default in global_config
    @param check_hosts: require appropriate hosts to be available now.
    @param add_experimental: schedule experimental tests as well, or not.
                             Default: True
    @param file_bugs: automatically file bugs on test failures.
                      Default: False
    @param suite_dependencies: A string with a comma separated list of suite
                               level dependencies, which act just like test
                               dependencies and are appended to each test's
                               set of dependencies at job creation time.
    @param predicate: Optional argument. If present, should be a function
                      mapping ControlData objects to True if they should be
                      included in suite. If argument is absent, suite
                      behavior will default to creating a suite of based
                      on the SUITE field of control files.
    @param job_retry: A bool value indicating whether jobs should be retried
                      on failure. If True, the field 'JOB_RETRIES' in control
                      files will be respected. If False, do not retry.
    @param max_retries: Maximum retry limit at suite level.
                        Regardless how many times each individual test
                        has been retried, the total number of retries
                        happening in the suite can't exceed _max_retries.
                        Default to None, no max.
    @param offload_failures_only: Only enable gs_offloading for failed jobs.
    @param test_args: A dict of args passed all the way to each individual test
                      that will be actually ran.
    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    suite_spec = SuiteSpec(**dargs)

    # Both frontends are created with the same settings, as the job's user.
    owner = suite_spec.job.user
    afe = frontend_wrappers.RetryingAFE(
            timeout_min=30, delay_sec=10, user=owner, debug=False)
    tko = frontend_wrappers.RetryingTKO(
            timeout_min=30, delay_sec=10, user=owner, debug=False)

    # The suite job's own id is what gets passed on as suite_job_id; a tag
    # that does not yield an integer id simply leaves it unset.
    try:
        my_job_id = int(tko_utils.get_afe_job_id(dargs['job'].tag))
    except ValueError:
        my_job_id = None
        logging.warning('Could not determine own job id.')
    else:
        logging.debug('Determined own job id: %d', my_job_id)

    _perform_reimage_and_run(suite_spec, afe, tko, suite_job_id=my_job_id)

    logging.debug('Returning from dynamic_suite.reimage_and_run.')
546
547
def _perform_reimage_and_run(spec, afe, tko, suite_job_id=None):
    """
    Do the work of reimaging hosts and running tests.

    In order: stage the artifacts (unless prod code is used), record the time
    staging finished as a job keyval, build the Suite, optionally sleep for
    spec.delay_minutes, schedule the tests and, if the suite asks for it,
    wait for their results.

    @param spec: a populated SuiteSpec object.
    @param afe: an instance of AFE as defined in server/frontend.py.
    @param tko: an instance of TKO as defined in server/frontend.py.
    @param suite_job_id: Job id that will act as parent id to all sub jobs.
                         Default: None
    """
    # Nothing can be scheduled until the devserver has finished downloading
    # the control_files and test_suites packages, since that is where the
    # control files to schedule come from.
    if not spec.run_prod_code:
        _stage_artifacts(spec)

    finished_at = datetime.datetime.now().strftime(time_utils.TIME_FMT)
    artifact_keyval = {constants.ARTIFACT_FINISHED_TIME: finished_at}
    utils.write_keyval(spec.job.resultdir, artifact_keyval)

    suite_options = {
        'predicates': [spec.predicate],
        'name': spec.name,
        'builds': spec.builds,
        'board': spec.board,
        'devserver': spec.devserver,
        'afe': afe,
        'tko': tko,
        'pool': spec.pool,
        'results_dir': spec.job.resultdir,
        'max_runtime_mins': spec.max_runtime_mins,
        'timeout_mins': spec.timeout_mins,
        'file_bugs': spec.file_bugs,
        'file_experimental_bugs': spec.file_experimental_bugs,
        'suite_job_id': suite_job_id,
        'extra_deps': spec.suite_dependencies,
        'priority': spec.priority,
        'wait_for_results': spec.wait_for_results,
        'job_retry': spec.job_retry,
        'max_retries': spec.max_retries,
        'offload_failures_only': spec.offload_failures_only,
        'test_source_build': spec.test_source_build,
        'run_prod_code': spec.run_prod_code,
        'job_keyvals': spec.job_keyvals,
        'test_args': spec.test_args,
    }
    suite = Suite.create_from_predicates(**suite_options)

    if spec.delay_minutes:
        logging.debug(
                'delay_minutes is set. Sleeping %d minutes before creating '
                'test jobs.', spec.delay_minutes)
        time.sleep(spec.delay_minutes*60)
        logging.debug(
                'Finished waiting for %d minutes before creating test jobs.',
                spec.delay_minutes)

    # Now we get to asynchronously schedule tests.
    suite.schedule(spec.job.record_entry, spec.add_experimental)

    if not suite.wait_for_results:
        logging.info('wait_for_results is set to False, suite job will exit '
                     'without waiting for test jobs to finish.')
        return

    logging.debug('Waiting on suite.')
    suite.wait(spec.job.record_entry, spec.bug_template)
    logging.debug('Finished waiting on suite. '
                  'Returning from _perform_reimage_and_run.')
613
614
def _stage_artifacts(suite_spec):
    """Ask the suite's devserver to stage what is needed to find tests.

    Requests the 'control_files' and 'test_suites' artifacts for
    suite_spec.test_source_build from suite_spec.devserver.

    @param suite_spec: a populated SuiteSpec object.
    @raises AsynchronousBuildFailure: if the devserver raises a
                                      DevServerException while staging.
    """
    wanted_artifacts = ['control_files', 'test_suites']
    try:
        suite_spec.devserver.stage_artifacts(
                image=suite_spec.test_source_build,
                artifacts=wanted_artifacts)
    except dev_server.DevServerException as staging_error:
        # Without the control files there is nothing to schedule.
        raise error.AsynchronousBuildFailure(staging_error)
627