#!/usr/bin/python2

# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Selects the optimal set of benchmarks.

For each benchmark, there is a file with the common functions, as extracted by
the process_hot_functions module.

The script receives as input the CSV file with the CWP inclusive count values,
the file with the Chrome OS groups and the directory containing a common
functions file for every benchmark.

For every benchmark, and for the CWP data, it extracts all the functions that
match the given Chrome OS groups.

It generates all possible benchmark sets of a given size, computes a metric
for every set and outputs the sets with the best metric value.
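
Example usage (all the file and directory names below are only placeholders):

  select_optimal_benchmark_set.py
      --benchmark_set_common_functions_path=common_functions/
      --cwp_inclusive_count_file=cwp_inclusive_count.csv
      --cwp_function_groups_file=cwp_function_groups.txt
      --benchmark_set_size=3
      --benchmark_set_output_file=optimal_sets.json
      --metric=score_fraction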

Three different metrics can be used: function count, distance variation and
score.

For the function count metric, we count the unique functions covered by a set
of benchmarks. Besides the number of unique functions, we also compute the
fraction of unique functions out of the total number of CWP functions from
the given groups. The benchmark set that covers the highest number of unique
functions belonging to the given groups is considered better.

For the distance variation metric, we compute the sum of the distance
variations of the functions covered by a set of benchmarks. We define the
distance variation as the difference between the distance value of a function
and the ideal distance value (1.0). If a function appears in multiple common
functions files, we consider only the minimum value. We also compute the
distance variation per function. The set with the smallest distance variation
per function is considered better.

For the score metric, we compute the sum of the scores of the functions from a
set of benchmarks. If a function appears in multiple common functions files,
we consider only the maximum value. We also compute this sum as a fraction of
the ideal total score of the CWP functions covering the given groups (the
ideal score of a function is 1.0).
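
As a small, purely hypothetical example: suppose a function appears in two
common functions files with distance values 1.3 and 1.1 and score values 0.6
and 0.8. For the distance variation metric it contributes
min(1.3, 1.1) - 1.0 = 0.1 to the total, and for the score metric it
contributes max(0.6, 0.8) = 0.8. If the CWP data contains 100 functions in the
given groups, the ideal total score is 100.0, so a total score of 80.0
corresponds to a score fraction of 0.8.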

We compute the metrics in the same manner for individual Chrome OS groups.
"""

from collections import defaultdict

import argparse
import csv
import itertools
import json
import operator
import os
import sys

import benchmark_metrics
import utils


class BenchmarkSet(object):
  """Selects the optimal set of benchmarks of given size."""

  # Constants that specify the metric type.
  FUNCTION_COUNT_METRIC = 'function_count'
  DISTANCE_METRIC = 'distance_variation'
  SCORE_METRIC = 'score_fraction'

  def __init__(self, benchmark_set_size, benchmark_set_output_file,
               benchmark_set_common_functions_path, cwp_inclusive_count_file,
               cwp_function_groups_file, metric):
    """Initializes the BenchmarkSet.

    Args:
      benchmark_set_size: The size of a benchmark set (number of benchmarks).
      benchmark_set_output_file: The output file that will contain the set of
        optimal benchmarks with the metric values.
      benchmark_set_common_functions_path: The directory containing the files
        with the common functions for the list of benchmarks.
      cwp_inclusive_count_file: The CSV file containing the CWP functions with
        their inclusive count values.
      cwp_function_groups_file: The file that contains the CWP function groups.
      metric: The type of metric used for the analysis.
    """
    self._benchmark_set_size = int(benchmark_set_size)
    self._benchmark_set_output_file = benchmark_set_output_file
    self._benchmark_set_common_functions_path = \
        benchmark_set_common_functions_path
    self._cwp_inclusive_count_file = cwp_inclusive_count_file
    self._cwp_function_groups_file = cwp_function_groups_file
    self._metric = metric

  @staticmethod
  def OrganizeCWPFunctionsInGroups(cwp_inclusive_count_statistics,
                                   cwp_function_groups):
    """Selects the CWP functions that match the given Chrome OS groups.

    Args:
      cwp_inclusive_count_statistics: A dict with the CWP functions.
      cwp_function_groups: A list with the CWP function groups.

    Returns:
      A dict having as key the name of a group and as value the list of CWP
      functions that match that group.
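
    For example (with hypothetical data), given a group
    ('ash', '/chromeos/ash/') in cwp_function_groups, the function key
    'Frame::Paint,/chromeos/ash/frame.cc' is appended to the 'ash' group
    because the group path is a substring of the file name.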
106    """
107    cwp_functions_grouped = defaultdict(list)
108    for function_key in cwp_inclusive_count_statistics:
109      _, file_name = function_key.split(',')
110      for group_name, file_path in cwp_function_groups:
111        if file_path not in file_name:
112          continue
113        cwp_functions_grouped[group_name].append(function_key)
114        break
115    return cwp_functions_grouped
116
  @staticmethod
  def OrganizeBenchmarkSetFunctionsInGroups(benchmark_set_files,
                                            benchmark_set_common_functions_path,
                                            cwp_function_groups):
    """Selects the benchmark functions that match the given Chrome OS groups.

    Args:
      benchmark_set_files: The list of common functions files, one for every
        benchmark in the set.
      benchmark_set_common_functions_path: The directory containing the files
        with the common functions for the list of benchmarks.
      cwp_function_groups: A list with the CWP function groups.

    Returns:
      A dict having as key the name of a common functions file. The value is a
      dict keyed by group name, whose values map a function key to its
      (distance, score) pair for the functions that match the given group.
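
    For example (with hypothetical data), the returned dict could look like
    {'speedometer.csv':
        {'ash': {'Frame::Paint,/chromeos/ash/frame.cc': (1.2, 0.8)}}}
    where (1.2, 0.8) is the (distance, score) pair of the function.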
134    """
135
136    benchmark_set_functions_grouped = {}
137    for benchmark_file_name in benchmark_set_files:
138      benchmark_full_file_path = \
139          os.path.join(benchmark_set_common_functions_path,
140                       benchmark_file_name)
141      with open(benchmark_full_file_path) as input_file:
142        statistics_reader = \
143            csv.DictReader(input_file, delimiter=',')
144        benchmark_functions_grouped = defaultdict(dict)
145        for statistic in statistics_reader:
146          function_name = statistic['function']
147          file_name = statistic['file']
148          for group_name, file_path in cwp_function_groups:
149            if file_path not in file_name:
150              continue
151            function_key = ','.join([function_name, file_name])
152            distance = float(statistic['distance'])
153            score = float(statistic['score'])
154            benchmark_functions_grouped[group_name][function_key] = \
155                (distance, score)
156            break
157          benchmark_set_functions_grouped[benchmark_file_name] = \
158              benchmark_functions_grouped
159    return benchmark_set_functions_grouped
160
  @staticmethod
  def SelectOptimalBenchmarkSetBasedOnMetric(all_benchmark_combinations_sets,
                                             benchmark_set_functions_grouped,
                                             cwp_functions_grouped,
                                             metric_function_for_set,
                                             metric_comparison_operator,
                                             metric_default_value,
                                             metric_string):
    """Generic method that selects the optimal benchmark set based on a metric.

    A generic method is used to avoid duplicating the selection logic for the
    three different metrics.

    Args:
      all_benchmark_combinations_sets: All the benchmark combinations (sets) of
        the given size, as generated by itertools.combinations.
      benchmark_set_functions_grouped: A dict with benchmark functions as
        returned by OrganizeBenchmarkSetFunctionsInGroups.
      cwp_functions_grouped: A dict with the CWP functions as returned by
        OrganizeCWPFunctionsInGroups.
      metric_function_for_set: The method used to compute the metric for a
        given benchmark set.
      metric_comparison_operator: A comparison operator used to compare two
        values of the same metric (e.g. operator.lt or operator.gt).
      metric_default_value: The default value for the metric.
      metric_string: A tuple of strings used in the JSON output for the pair of
        metric values.

    Returns:
      A list of tuples, one for each optimal benchmark set. Each tuple contains
      the list of benchmarks from the set, the pair of metric values and a
      dictionary with the metrics for each group.
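
    For example (with hypothetical data), for the score metric an entry of the
    returned list could look like
    (('bench_a.csv', 'bench_b.csv'), (0.8, 80.0), {'ash': (0.9, 45.0)}).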
193    """
194    optimal_sets = [([], metric_default_value, {})]
195
196    for benchmark_combination_set in all_benchmark_combinations_sets:
197      function_metrics = [benchmark_set_functions_grouped[benchmark]
198                          for benchmark in benchmark_combination_set]
199      set_metrics, set_groups_metrics = \
200          metric_function_for_set(function_metrics, cwp_functions_grouped,
201                                  metric_string)
202      optimal_value = optimal_sets[0][1][0]
203      if metric_comparison_operator(set_metrics[0], optimal_value):
204        optimal_sets = \
205            [(benchmark_combination_set, set_metrics, set_groups_metrics)]
206      elif set_metrics[0] == optimal_sets[0][1][0]:
207        optimal_sets.append(
208            (benchmark_combination_set, set_metrics, set_groups_metrics))
209
210    return optimal_sets

  def SelectOptimalBenchmarkSet(self):
    """Selects the optimal benchmark sets and writes them in JSON format.

    Parses the CWP inclusive count statistics and benchmark common functions
    files. Organizes the functions into groups. For every optimal benchmark
    set, the method writes to self._benchmark_set_output_file the list of
    benchmarks, the pair of metrics and a dictionary with the pair of
    metrics for each group covered by the benchmark set.
    """

    benchmark_set_files = os.listdir(self._benchmark_set_common_functions_path)
    all_benchmark_combinations_sets = \
        itertools.combinations(benchmark_set_files, self._benchmark_set_size)

    with open(self._cwp_function_groups_file) as input_file:
      cwp_function_groups = utils.ParseFunctionGroups(input_file.readlines())

    cwp_inclusive_count_statistics = \
        utils.ParseCWPInclusiveCountFile(self._cwp_inclusive_count_file)
    cwp_functions_grouped = self.OrganizeCWPFunctionsInGroups(
        cwp_inclusive_count_statistics, cwp_function_groups)
    benchmark_set_functions_grouped = \
        self.OrganizeBenchmarkSetFunctionsInGroups(
            benchmark_set_files, self._benchmark_set_common_functions_path,
            cwp_function_groups)

    if self._metric == self.FUNCTION_COUNT_METRIC:
      metric_function_for_benchmark_set = \
          benchmark_metrics.ComputeFunctionCountForBenchmarkSet
      metric_comparison_operator = operator.gt
      metric_default_value = (0, 0.0)
      metric_string = ('function_count', 'function_count_fraction')
    elif self._metric == self.DISTANCE_METRIC:
      metric_function_for_benchmark_set = \
          benchmark_metrics.ComputeDistanceForBenchmarkSet
      metric_comparison_operator = operator.lt
      metric_default_value = (float('inf'), float('inf'))
      metric_string = \
          ('distance_variation_per_function', 'total_distance_variation')
    elif self._metric == self.SCORE_METRIC:
      metric_function_for_benchmark_set = \
          benchmark_metrics.ComputeScoreForBenchmarkSet
      metric_comparison_operator = operator.gt
      metric_default_value = (0.0, 0.0)
      metric_string = ('score_fraction', 'total_score')
    else:
      raise ValueError('Invalid metric: %s' % self._metric)

    optimal_benchmark_sets = \
        self.SelectOptimalBenchmarkSetBasedOnMetric(
            all_benchmark_combinations_sets, benchmark_set_functions_grouped,
            cwp_functions_grouped, metric_function_for_benchmark_set,
            metric_comparison_operator, metric_default_value, metric_string)

    json_output = []

    for benchmark_set in optimal_benchmark_sets:
      json_entry = {
          'benchmark_set':
              list(benchmark_set[0]),
          'metrics': {
              metric_string[0]: benchmark_set[1][0],
              metric_string[1]: benchmark_set[1][1]
          },
          'groups':
              dict(benchmark_set[2])
      }
      json_output.append(json_entry)

    with open(self._benchmark_set_output_file, 'w') as output_file:
      json.dump(json_output, output_file)


def ParseArguments(arguments):
  parser = argparse.ArgumentParser()

  parser.add_argument(
      '--benchmark_set_common_functions_path',
      required=True,
      help='The directory containing the CSV files with the common functions '
      'of the benchmark profiles and CWP data. A file will contain all the hot '
      'functions from a pprof top output file that are also included in the '
      'file containing the CWP inclusive count values. The CSV fields are: the '
      'function name, the file and the object where the function is declared, '
      'the CWP inclusive count and inclusive count fraction values, the '
      'cumulative and average distance, the cumulative and average score. The '
      'files with the common functions will have the same names as the '
      'corresponding pprof output files.')
  parser.add_argument(
      '--cwp_inclusive_count_file',
      required=True,
      help='The CSV file containing the CWP hot functions with their '
      'inclusive_count values. The CSV fields include the name of the '
      'function, the file and the object with the definition, the inclusive '
      'count value and the inclusive count fraction out of the total amount of '
      'inclusive count values.')
  parser.add_argument(
      '--benchmark_set_size',
      required=True,
      help='The size of the benchmark sets.')
  parser.add_argument(
      '--benchmark_set_output_file',
      required=True,
      help='The JSON output file containing optimal benchmark sets with their '
      'metrics. For every optimal benchmark set, the file contains the list of '
      'benchmarks, the pair of metrics and a dictionary with the pair of '
      'metrics for each group covered by the benchmark set.')
  parser.add_argument(
      '--metric',
      required=True,
      help='The metric used to select the optimal benchmark set. The possible '
      'values are: distance_variation, function_count and score_fraction.')
  parser.add_argument(
      '--cwp_function_groups_file',
      required=True,
      help='The file that contains the CWP function groups. A line consists of '
      'the group name and a file path describing the group. A group must '
      'represent a Chrome OS component.')

  options = parser.parse_args(arguments)

  return options


def Main(argv):
  options = ParseArguments(argv)
  benchmark_set = BenchmarkSet(options.benchmark_set_size,
                               options.benchmark_set_output_file,
                               options.benchmark_set_common_functions_path,
                               options.cwp_inclusive_count_file,
                               options.cwp_function_groups_file, options.metric)
  benchmark_set.SelectOptimalBenchmarkSet()


if __name__ == '__main__':
  Main(sys.argv[1:])