#!/usr/bin/python2

# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Selects the optimal set of benchmarks.

For each benchmark, there is a file with the common functions, as extracted by
the process_hot_functions module.

The script receives as input the CSV file with the CWP inclusive count values,
the file with the Chrome OS groups and the directory containing a common
functions file for every benchmark.

For every benchmark, and for the CWP data, it extracts all the functions that
match the given Chrome OS groups.

It generates all possible benchmark sets of a given size, computes a metric
for every set and outputs the sets with the best metric value.
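
Example usage (all the file and directory names below are only placeholders):

  select_optimal_benchmark_set.py
      --benchmark_set_common_functions_path=common_functions/
      --cwp_inclusive_count_file=cwp_inclusive_count.csv
      --cwp_function_groups_file=cwp_function_groups.txt
      --benchmark_set_size=3
      --benchmark_set_output_file=optimal_sets.json
      --metric=score_fraction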

Three different metrics can be used: function count, distance variation and
score.

For the function count metric, we count the unique functions covered by a set
of benchmarks. Besides the number of unique functions, we also compute the
fraction of unique functions out of the total number of CWP functions from
the given groups. The benchmark set that covers the highest number of unique
functions belonging to the given groups is considered better.

For the distance variation metric, we compute the sum of the distance
variations of the functions covered by a set of benchmarks. We define the
distance variation as the difference between the distance value of a function
and the ideal distance value (1.0). If a function appears in multiple common
functions files, we consider only the minimum value. We also compute the
distance variation per function. The set with the smallest distance variation
per function is considered better.

For the score metric, we compute the sum of the scores of the functions from a
set of benchmarks. If a function appears in multiple common functions files,
we consider only the maximum value. We also compute this sum as a fraction of
the ideal total score of the CWP functions covering the given groups (the
ideal score of a function is 1.0).
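
As a small, purely hypothetical example: suppose a function appears in two
common functions files with distance values 1.3 and 1.1 and score values 0.6
and 0.8. For the distance variation metric it contributes
min(1.3, 1.1) - 1.0 = 0.1 to the total, and for the score metric it
contributes max(0.6, 0.8) = 0.8. If the CWP data contains 100 functions in the
given groups, the ideal total score is 100.0, so a total score of 80.0
corresponds to a score fraction of 0.8.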

We compute the metrics in the same manner for individual Chrome OS groups.
"""

from collections import defaultdict

import argparse
import csv
import itertools
import json
import operator
import os
import sys

import benchmark_metrics
import utils


class BenchmarkSet(object):
  """Selects the optimal set of benchmarks of given size."""

  # Constants that specify the metric type.
  FUNCTION_COUNT_METRIC = 'function_count'
  DISTANCE_METRIC = 'distance_variation'
  SCORE_METRIC = 'score_fraction'

  def __init__(self, benchmark_set_size, benchmark_set_output_file,
               benchmark_set_common_functions_path, cwp_inclusive_count_file,
               cwp_function_groups_file, metric):
    """Initializes the BenchmarkSet.

    Args:
      benchmark_set_size: The size of a benchmark set (number of benchmarks).
      benchmark_set_output_file: The output file that will contain the set of
        optimal benchmarks with the metric values.
      benchmark_set_common_functions_path: The directory containing the files
        with the common functions for the list of benchmarks.
      cwp_inclusive_count_file: The CSV file containing the CWP functions with
        their inclusive count values.
      cwp_function_groups_file: The file that contains the CWP function groups.
      metric: The type of metric used for the analysis.
    """
    self._benchmark_set_size = int(benchmark_set_size)
    self._benchmark_set_output_file = benchmark_set_output_file
    self._benchmark_set_common_functions_path = \
        benchmark_set_common_functions_path
    self._cwp_inclusive_count_file = cwp_inclusive_count_file
    self._cwp_function_groups_file = cwp_function_groups_file
    self._metric = metric

  @staticmethod
  def OrganizeCWPFunctionsInGroups(cwp_inclusive_count_statistics,
                                   cwp_function_groups):
    """Selects the CWP functions that match the given Chrome OS groups.

    Args:
      cwp_inclusive_count_statistics: A dict with the CWP functions.
      cwp_function_groups: A list with the CWP function groups.

    Returns:
      A dict having as key the name of a group and as value the list of CWP
      functions that match that group.
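
    For example (with hypothetical data), given a group
    ('ash', '/chromeos/ash/') in cwp_function_groups, the function key
    'Frame::Paint,/chromeos/ash/frame.cc' is appended to the 'ash' group
    because the group path is a substring of the file name.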
106    """
107    cwp_functions_grouped = defaultdict(list)
108    for function_key in cwp_inclusive_count_statistics:
109      _, file_name = function_key.split(',')
110      for group_name, file_path in cwp_function_groups:
111        if file_path not in file_name:
112          continue
113        cwp_functions_grouped[group_name].append(function_key)
114        break
115    return cwp_functions_grouped
116
  @staticmethod
  def OrganizeBenchmarkSetFunctionsInGroups(benchmark_set_files,
                                            benchmark_set_common_functions_path,
                                            cwp_function_groups):
    """Selects the benchmark functions that match the given Chrome OS groups.

    Args:
      benchmark_set_files: The list of common functions files, one for every
        benchmark in the set.
      benchmark_set_common_functions_path: The directory containing the files
        with the common functions for the list of benchmarks.
      cwp_function_groups: A list with the CWP function groups.

    Returns:
      A dict having as key the name of a common functions file. The value is a
      dict keyed by group name, whose values map a function key to its
      (distance, score) pair for the functions that match the given group.
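
    For example (with hypothetical data), the returned dict could look like
    {'speedometer.csv':
        {'ash': {'Frame::Paint,/chromeos/ash/frame.cc': (1.2, 0.8)}}}
    where (1.2, 0.8) is the (distance, score) pair of the function.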
134    """
135
136    benchmark_set_functions_grouped = {}
137    for benchmark_file_name in benchmark_set_files:
138      benchmark_full_file_path = \
139          os.path.join(benchmark_set_common_functions_path,
140                       benchmark_file_name)
141      with open(benchmark_full_file_path) as input_file:
142        statistics_reader = \
143            csv.DictReader(input_file, delimiter=',')
144        benchmark_functions_grouped = defaultdict(dict)
145        for statistic in statistics_reader:
146          function_name = statistic['function']
147          file_name = statistic['file']
148          for group_name, file_path in cwp_function_groups:
149            if file_path not in file_name:
150              continue
151            function_key = ','.join([function_name, file_name])
152            distance = float(statistic['distance'])
153            score = float(statistic['score'])
154            benchmark_functions_grouped[group_name][function_key] = \
155                (distance, score)
156            break
157          benchmark_set_functions_grouped[benchmark_file_name] = \
158              benchmark_functions_grouped
159    return benchmark_set_functions_grouped
160
  @staticmethod
  def SelectOptimalBenchmarkSetBasedOnMetric(all_benchmark_combinations_sets,
                                             benchmark_set_functions_grouped,
                                             cwp_functions_grouped,
                                             metric_function_for_set,
                                             metric_comparison_operator,
                                             metric_default_value,
                                             metric_string):
    """Generic method that selects the optimal benchmark set based on a metric.

    A generic method is used to avoid duplicating the selection logic for the
    three different metrics.

    Args:
      all_benchmark_combinations_sets: All the benchmark combinations (sets) of
        the given size, as generated by itertools.combinations.
      benchmark_set_functions_grouped: A dict with benchmark functions as
        returned by OrganizeBenchmarkSetFunctionsInGroups.
      cwp_functions_grouped: A dict with the CWP functions as returned by
        OrganizeCWPFunctionsInGroups.
      metric_function_for_set: The method used to compute the metric for a
        given benchmark set.
      metric_comparison_operator: A comparison operator used to compare two
        values of the same metric (e.g. operator.lt or operator.gt).
      metric_default_value: The default value for the metric.
      metric_string: A tuple of strings used in the JSON output for the pair of
        metric values.

    Returns:
      A list of tuples, one for each optimal benchmark set. Each tuple contains
      the list of benchmarks from the set, the pair of metric values and a
      dictionary with the metrics for each group.
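
    For example (with hypothetical data), for the score metric an entry of the
    returned list could look like
    (('bench_a.csv', 'bench_b.csv'), (0.8, 80.0), {'ash': (0.9, 45.0)}).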
193    """
194    optimal_sets = [([], metric_default_value, {})]
195
196    for benchmark_combination_set in all_benchmark_combinations_sets:
197      function_metrics = [benchmark_set_functions_grouped[benchmark]
198                          for benchmark in benchmark_combination_set]
199      set_metrics, set_groups_metrics = \
200          metric_function_for_set(function_metrics, cwp_functions_grouped,
201                                  metric_string)
202      optimal_value = optimal_sets[0][1][0]
203      if metric_comparison_operator(set_metrics[0], optimal_value):
204        optimal_sets = \
205            [(benchmark_combination_set, set_metrics, set_groups_metrics)]
206      elif set_metrics[0] == optimal_sets[0][1][0]:
207        optimal_sets.append(
208            (benchmark_combination_set, set_metrics, set_groups_metrics))
209
210    return optimal_sets

  def SelectOptimalBenchmarkSet(self):
    """Selects the optimal benchmark sets and writes them in JSON format.

    Parses the CWP inclusive count statistics and benchmark common functions
    files. Organizes the functions into groups. For every optimal benchmark
    set, the method writes to self._benchmark_set_output_file the list of
    benchmarks, the pair of metrics and a dictionary with the pair of
    metrics for each group covered by the benchmark set.
    """

    benchmark_set_files = os.listdir(self._benchmark_set_common_functions_path)
    all_benchmark_combinations_sets = \
        itertools.combinations(benchmark_set_files, self._benchmark_set_size)

    with open(self._cwp_function_groups_file) as input_file:
      cwp_function_groups = utils.ParseFunctionGroups(input_file.readlines())

    cwp_inclusive_count_statistics = \
        utils.ParseCWPInclusiveCountFile(self._cwp_inclusive_count_file)
    cwp_functions_grouped = self.OrganizeCWPFunctionsInGroups(
        cwp_inclusive_count_statistics, cwp_function_groups)
    benchmark_set_functions_grouped = \
        self.OrganizeBenchmarkSetFunctionsInGroups(
            benchmark_set_files, self._benchmark_set_common_functions_path,
            cwp_function_groups)

    if self._metric == self.FUNCTION_COUNT_METRIC:
      metric_function_for_benchmark_set = \
          benchmark_metrics.ComputeFunctionCountForBenchmarkSet
      metric_comparison_operator = operator.gt
      metric_default_value = (0, 0.0)
      metric_string = ('function_count', 'function_count_fraction')
    elif self._metric == self.DISTANCE_METRIC:
      metric_function_for_benchmark_set = \
          benchmark_metrics.ComputeDistanceForBenchmarkSet
      metric_comparison_operator = operator.lt
      metric_default_value = (float('inf'), float('inf'))
      metric_string = \
          ('distance_variation_per_function', 'total_distance_variation')
    elif self._metric == self.SCORE_METRIC:
      metric_function_for_benchmark_set = \
          benchmark_metrics.ComputeScoreForBenchmarkSet
      metric_comparison_operator = operator.gt
      metric_default_value = (0.0, 0.0)
      metric_string = ('score_fraction', 'total_score')
    else:
      raise ValueError('Invalid metric: %s' % self._metric)

    optimal_benchmark_sets = \
        self.SelectOptimalBenchmarkSetBasedOnMetric(
            all_benchmark_combinations_sets, benchmark_set_functions_grouped,
            cwp_functions_grouped, metric_function_for_benchmark_set,
            metric_comparison_operator, metric_default_value, metric_string)

    json_output = []

    for benchmark_set in optimal_benchmark_sets:
      json_entry = {
          'benchmark_set':
              list(benchmark_set[0]),
          'metrics': {
              metric_string[0]: benchmark_set[1][0],
              metric_string[1]: benchmark_set[1][1]
          },
          'groups':
              dict(benchmark_set[2])
      }
      json_output.append(json_entry)

    with open(self._benchmark_set_output_file, 'w') as output_file:
      json.dump(json_output, output_file)


def ParseArguments(arguments):
  parser = argparse.ArgumentParser()

  parser.add_argument(
      '--benchmark_set_common_functions_path',
      required=True,
      help='The directory containing the CSV files with the common functions '
      'of the benchmark profiles and CWP data. A file will contain all the hot '
      'functions from a pprof top output file that are also included in the '
      'file containing the CWP inclusive count values. The CSV fields are: the '
      'function name, the file and the object where the function is declared, '
      'the CWP inclusive count and inclusive count fraction values, the '
      'cumulative and average distance, the cumulative and average score. The '
      'files with the common functions will have the same names as the '
      'corresponding pprof output files.')
  parser.add_argument(
      '--cwp_inclusive_count_file',
      required=True,
      help='The CSV file containing the CWP hot functions with their '
      'inclusive_count values. The CSV fields include the name of the '
      'function, the file and the object with the definition, the inclusive '
      'count value and the inclusive count fraction out of the total amount of '
      'inclusive count values.')
  parser.add_argument(
      '--benchmark_set_size',
      required=True,
      help='The size of the benchmark sets.')
  parser.add_argument(
      '--benchmark_set_output_file',
      required=True,
      help='The JSON output file containing optimal benchmark sets with their '
      'metrics. For every optimal benchmark set, the file contains the list of '
      'benchmarks, the pair of metrics and a dictionary with the pair of '
      'metrics for each group covered by the benchmark set.')
  parser.add_argument(
      '--metric',
      required=True,
      help='The metric used to select the optimal benchmark set. The possible '
      'values are: distance_variation, function_count and score_fraction.')
  parser.add_argument(
      '--cwp_function_groups_file',
      required=True,
      help='The file that contains the CWP function groups. A line consists of '
      'the group name and a file path describing the group. A group must '
      'represent a Chrome OS component.')

  options = parser.parse_args(arguments)

  return options


def Main(argv):
  options = ParseArguments(argv)
  benchmark_set = BenchmarkSet(options.benchmark_set_size,
                               options.benchmark_set_output_file,
                               options.benchmark_set_common_functions_path,
                               options.cwp_inclusive_count_file,
                               options.cwp_function_groups_file, options.metric)
  benchmark_set.SelectOptimalBenchmarkSet()


if __name__ == '__main__':
  Main(sys.argv[1:])