1# Copyright 2017 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
"""This throttler tries to remove repeated files sharing the same prefix, for
example, screenshots or dumps in the same folder. The dedupe logic does not
compare the file content; instead, it sorts the files with the same prefix and
removes the files in the middle part.
"""
10
11import os
12import re
13
14import result_info_lib
15import throttler_lib
16import utils_lib
17
18
# Number of files to keep for the oldest files.
OLDEST_FILES_TO_KEEP_COUNT = 2
# Number of files to keep for the newest files.
NEWEST_FILES_TO_KEEP_COUNT = 1

# Files with path matching the following patterns should not be deduped.
# The patterns are regular expressions, so they are written as raw strings to
# keep escapes such as `\d` from being interpreted as string escapes.
NO_DEDUPE_FILE_PATTERNS = [
        r'debug/.*',
        r'.*perf.data$',       # Performance test data.
        r'.*/debug/.*',
        r'.*dir_summary_\d+.json',
        ]

# Regex pattern to get the prefix of a file: the first group captures the
# leading run of letters, underscores and hyphens.
PREFIX_PATTERN = r'([a-zA-Z_-]*).*'
34
def _group_by(file_infos, keys):
    """Bucket the file infos by the values of the given attributes.

    The key of each bucket is built by reading every attribute named in `keys`
    from a file info, in the given order, and joining the values with os.sep.
    The attribute values therefore have to be strings.

    @param file_infos: A list of ResultInfo objects.
    @param keys: A list of names of the attribute to group the file infos by.
    @return: A dictionary of grouped_key: [ResultInfo], where each list keeps
            the order the file infos were given in.
    """
    buckets = {}
    for file_info in file_infos:
        bucket_key = os.sep.join([getattr(file_info, name) for name in keys])
        buckets.setdefault(bucket_key, []).append(file_info)
    return buckets
52
53
def _dedupe_files(summary, file_infos, max_result_size_KB):
    """De-duplicate the given files by deleting the ones in the middle.

    The files are sorted by last modification time. The oldest
    OLDEST_FILES_TO_KEEP_COUNT files and the newest NEWEST_FILES_TO_KEEP_COUNT
    files are kept; the files in between are deleted from disk one by one.
    Deletion stops early as soon as throttler_lib.check_throttle_limit reports
    a true value for the summary.

    @param summary: A ResultInfo object containing result summary.
    @param file_infos: A list of ResultInfo objects to be de-duplicated. The
            list is sorted in place.
    @param max_result_size_KB: Maximum test result size in KB.
    """
    # Sort file infos based on the modify date of the file.
    file_infos.sort(
            key=lambda f: result_info_lib.get_last_modification_time(f.path))

    # Compute the end of the deletable range explicitly instead of slicing with
    # -NEWEST_FILES_TO_KEEP_COUNT: a count of 0 would make the slice end
    # -0 == 0, which silently selects no file at all.
    newest_start = max(len(file_infos) - NEWEST_FILES_TO_KEEP_COUNT, 0)
    file_infos_to_delete = file_infos[OLDEST_FILES_TO_KEEP_COUNT:newest_start]

    for file_info in file_infos_to_delete:
        if throttler_lib.try_delete_file_on_disk(file_info.path):
            # The file is gone from disk, record that in its result info.
            file_info.trimmed_size = 0

            if throttler_lib.check_throttle_limit(summary, max_result_size_KB):
                return
73
74
def throttle(summary, max_result_size_KB):
    """De-duplicate the files listed in the summary to reduce the result size.

    Files are handled one priority group at a time, in the order given by
    throttler_lib.RESULT_THROTTLE_PRIORITY. Within a group, files sharing the
    same parent directory and the same name prefix are de-duplicated together.
    Throttling stops when all files are processed or as soon as
    throttler_lib.check_throttle_limit reports a true value for the summary.

    @param summary: A ResultInfo object containing result summary.
    @param max_result_size_KB: Maximum test result size in KB.
    """
    _, files_by_priority = throttler_lib.sort_result_files(summary)
    # A bucket with no more files than what is always kept has nothing to
    # dedupe.
    keep_count = OLDEST_FILES_TO_KEEP_COUNT + NEWEST_FILES_TO_KEEP_COUNT

    for priority in throttler_lib.RESULT_THROTTLE_PRIORITY:
        candidates = list(throttler_lib.get_throttleable_files(
                files_by_priority[priority], NO_DEDUPE_FILE_PATTERNS))

        # Tag each candidate with the attributes it is grouped by.
        for candidate in candidates:
            candidate.parent_dir = os.path.dirname(candidate.path)
            prefix_match = re.match(PREFIX_PATTERN, candidate.name)
            candidate.prefix = prefix_match.group(1)

        # One bucket per (parent directory, prefix) pair.
        buckets = _group_by(candidates, ['parent_dir', 'prefix'])

        for bucket in buckets.values():
            if len(bucket) > keep_count:
                first = bucket[0]
                utils_lib.LOG(
                        'De-duplicating files in %s with the same prefix of '
                        '"%s"' % (first.parent_dir, first.prefix))
                _dedupe_files(summary, bucket, max_result_size_KB)

                within_limit = throttler_lib.check_throttle_limit(
                        summary, max_result_size_KB)
                if within_limit:
                    return
110