utils.py revision c7faa09f456ca5c651ac373ad897aa4be6ad2717
1# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4"""Utility functions for parsing pprof, CWP data and Chrome OS groups files."""
5
6from collections import defaultdict
7
8import csv
9import os
10import re
11
# Regular expressions used to recognize the lines of the pprof text outputs.
SEPARATOR_REGEX = re.compile(r'-+\+-+')
FUNCTION_STATISTIC_REGEX = re.compile(
    r'(\S+)\s+(\S+)%\s+(\S+)%\s+(\S+)\s+(\S+)%')
CHILD_FUNCTION_PERCENTAGE_REGEX = re.compile(r'([0-9.]+)%')
FUNCTION_KEY_SEPARATOR_REGEX = re.compile(r'\|\s+')
# Markers telling whether a function is common to the pprof and CWP files.
COMMON_FUNCTION = 'common'
EXTRA_FUNCTION = 'extra'
PARENT_CHILD_FUNCTIONS_SEPARATOR = ';;'
# Ordered (old, new) substring pairs applied to file names so that the CWP
# and pprof data become consistent. The pairs are applied from first to last.
FILE_NAME_REPLACING_PAIR_STRINGS = [
    ('/build/gnawty', '/build/BOARD'),
    ('/build/amd64-generic', '/build/BOARD'),
    (' ../sysdeps', ',sysdeps'),
    (' ../nptl', ',nptl'),
    ('  aes-x86_64.s', ',aes-x86_64.s'),
    (' (inline)', ''),
    (' (partial-inline)', ''),
    (' ../', ','),
    ('../', ''),
]
# Text that delimits the function name from the file name in pprof output.
FUNCTION_FILE_SEPARATOR = ' /'
35
36
def MakeCWPAndPprofFileNamesConsistent(file_name):
  """Rewrites a file name so that CWP and pprof agree on its spelling.

  The path of the same file can differ slightly between the CWP data and the
  pprof output. Every ', ' is first turned into '; ', then each pair of
  FILE_NAME_REPLACING_PAIR_STRINGS is applied in order, replacing all the
  occurrences of its first element with its second element.

  Args:
    file_name: A string representing the name of the file.

  Returns:
    A string representing the modified name of the file.
  """
  consistent_name = file_name.replace(', ', '; ')
  for old_text, new_text in FILE_NAME_REPLACING_PAIR_STRINGS:
    consistent_name = consistent_name.replace(old_text, new_text)

  return consistent_name
56
def MakePprofFunctionKey(function_and_file_name):
  """Creates the function key from the function and file name.

  Parsing the pprof --top and --tree outputs is difficult because it is hard
  to tell the function name from the file name (i.e. the function names can
  contain unexpected characters such as spaces, operators etc.).
  For the moment, FUNCTION_FILE_SEPARATOR is used as the delimiter between
  the function and the file name. There are some cases where the file name
  does not start with / and these are handled only through the string
  substitutions (i.e. ../sysdeps, ../nptl, aes-x86_64.s).

  Args:
    function_and_file_name: A string representing the function and the file name
      as it appears in the pprof output.

  Returns:
    A string representing the function key. When the input contains
    FUNCTION_FILE_SEPARATOR, the key is the function name and the normalized
    file name, comma separated. Otherwise, it is the input after the string
    substitutions.
  """
  # TODO(evelinad): Use pprof --topproto instead of pprof --top to parse
  # protobuffers instead of text output. Investigate if there is an equivalent
  # for pprof --tree that gives protobuffer output.
  #
  # In the CWP output, we replace the , with ; as a workaround for parsing
  # csv files. We do the same for the pprof output.
  #
  # TODO(evelinad): Use dremel --csv_dialect=excel-tab in the queries for
  # replacing the , delimiter with tab.
  function_and_file_name = function_and_file_name.replace(', ', '; ')

  # Without the separator there is no path to normalize, so only the string
  # substitutions are applied.
  if FUNCTION_FILE_SEPARATOR not in function_and_file_name:
    return MakeCWPAndPprofFileNamesConsistent(function_and_file_name)

  # Split on the last separator only. A plain split() yields more than two
  # parts when the separator also occurs inside the function name, which made
  # the two-name unpacking raise ValueError.
  # NOTE(review): this assumes the file path itself never contains ' /' --
  # confirm against real pprof output.
  function_name, file_name = function_and_file_name.rsplit(
      FUNCTION_FILE_SEPARATOR, 1)
  # The split consumed the leading '/', so it is restored before the path is
  # normalized and made consistent with the CWP data.
  file_name = MakeCWPAndPprofFileNamesConsistent(
      os.path.normpath('/' + file_name))
  return ','.join([function_name, file_name])
100
def ParseFunctionGroups(cwp_function_groups_lines):
  """Parses the contents of the function groups file.

  Args:
    cwp_function_groups_lines: A list of the lines contained in the CWP
      function groups file. A line contains the group name and the file path
      that describes the group, separated by a space. Empty and
      whitespace-only lines are ignored.

  Returns:
    A list of tuples containing the group name and the file path, in the
    order in which they appear in the input.
  """
  # The order of the groups mentioned in the cwp_function_groups file
  # matters. A function declared in a file will belong to the first
  # mentioned group that matches its path to the one of the file.
  # It is possible to have multiple paths that belong to the same group.
  #
  # Blank lines (e.g. a trailing newline at the end of the file) are skipped,
  # otherwise they would produce empty tuples instead of (group, path) pairs.
  return [
      tuple(line.split())
      for line in cwp_function_groups_lines
      if line.strip()
  ]
117
118
def ParsePprofTopOutput(file_name):
  """Parses a file that contains the output of the pprof --top command.

  Args:
    file_name: The name of the file containing the pprof --top output.

  Returns:
    A dict having as a key the name of the function and the file containing
    the declaration of the function, separated by a comma, and as a value
    a tuple containing the flat, flat percentage, sum percentage, cumulative
    and cumulative percentage values. All the values are strings and the
    percentages are expressed as fractions (i.e. divided by 100).

  Raises:
    ValueError: If a non-empty line does not contain the function statistics.
  """

  pprof_top_statistics = {}

  # The first six lines of the file are skipped. The statistics of the
  # functions are expected on the lines that follow them.
  with open(file_name) as input_file:
    pprof_top_content = input_file.readlines()[6:]

  for line in pprof_top_content:
    # Only the line terminator is removed. Dropping the last character
    # unconditionally would truncate the file name when the last line of the
    # file does not end with a newline.
    line = line.rstrip('\n')

    # Blank lines carry no statistics.
    if not line.strip():
      continue

    function_statistic_match = FUNCTION_STATISTIC_REGEX.search(line)
    if function_statistic_match is None:
      raise ValueError('Unexpected line in the pprof top output: %r' % line)

    flat, flat_p, sum_p, cum, cum_p = function_statistic_match.groups()
    # The percentages are stored as fractions.
    flat_p = str(float(flat_p) / 100.0)
    sum_p = str(float(sum_p) / 100.0)
    cum_p = str(float(cum_p) / 100.0)
    # The function and file name start two characters after the statistics.
    lookup_index = function_statistic_match.end()
    function_and_file_name = line[lookup_index + 2:]
    key = MakePprofFunctionKey(function_and_file_name)
    pprof_top_statistics[key] = (flat, flat_p, sum_p, cum, cum_p)
  return pprof_top_statistics
150
151
def _ExtractPprofTreeFunctionKey(line, lookup_index):
  """Creates the key of the function named on a pprof --tree output line.

  Args:
    line: A line of the pprof --tree output, without the line terminator.
    lookup_index: The index in the line from which the marker matched by
      FUNCTION_KEY_SEPARATOR_REGEX is searched.

  Returns:
    The key created by MakePprofFunctionKey from the text following the
    marker.

  Raises:
    ValueError: If the marker is not found in the line after lookup_index.
  """
  function_key_match = \
      FUNCTION_KEY_SEPARATOR_REGEX.search(line, pos=lookup_index)
  if function_key_match is None:
    raise ValueError('Unexpected line in the pprof tree output: %r' % line)
  return MakePprofFunctionKey(line[function_key_match.end():])


def ParsePprofTreeOutput(file_name):
  """Parses a file that contains the output of the pprof --tree command.

  Args:
    file_name: The name of the file containing the pprof --tree output.

  Returns:
    A dict including the statistics for pairs of parent and child functions.
    The key is the name of the parent function and the file where the
    function is declared, separated by a comma. The value is a dict having as
    a key the name of the child function and the file where the function is
    declared, comma separated and as a value the fraction of time the
    parent function spends in the child function.

  Raises:
    ValueError: If a line describing a function or one of its child functions
      does not have the expected format.
  """

  # The first nine lines of the file are skipped. The statistics of the
  # functions are expected on the lines that follow them.
  with open(file_name) as input_file:
    pprof_tree_content = input_file.readlines()[9:]

  pprof_tree_statistics = defaultdict(lambda: defaultdict(float))
  track_child_functions = False
  parent_function_key = None

  # The statistics of a given function, its parent and child functions are
  # included between two separator marks.
  # All the parent function statistics are above the line containing the
  # statistics of the given function.
  # All the statistics of a child function are below the statistics of the
  # given function.
  # The statistics of a parent or a child function contain the calls, calls
  # percentage, the function name and the file where the function is declared.
  # The statistics of the given function contain the flat, flat percentage,
  # sum percentage, cumulative, cumulative percentage, function name and the
  # name of the file containing the declaration of the function.
  for line in pprof_tree_content:
    # Only the line terminator is removed. Dropping the last character
    # unconditionally would truncate the file name when the last line of the
    # file does not end with a newline.
    line = line.rstrip('\n')

    # Blank lines carry no statistics.
    if not line.strip():
      continue

    # A separator closes the section of the current function.
    if SEPARATOR_REGEX.search(line):
      track_child_functions = False
      continue

    parent_function_statistic_match = FUNCTION_STATISTIC_REGEX.search(line)

    # The line of the given function: the lines that follow it, up to the
    # next separator, describe its child functions.
    if parent_function_statistic_match:
      track_child_functions = True
      parent_function_key = _ExtractPprofTreeFunctionKey(
          line, parent_function_statistic_match.end())
      continue

    # The lines above the given function are not recorded.
    if not track_child_functions:
      continue

    child_function_statistic_match = \
        CHILD_FUNCTION_PERCENTAGE_REGEX.search(line)
    if child_function_statistic_match is None:
      raise ValueError('Unexpected line in the pprof tree output: %r' % line)

    child_function_percentage = \
        float(child_function_statistic_match.group(1))
    child_function_key = _ExtractPprofTreeFunctionKey(
        line, child_function_statistic_match.end())

    # The percentages of a repeated parent and child pair are added up and
    # stored as a fraction.
    pprof_tree_statistics[parent_function_key][child_function_key] += \
        child_function_percentage / 100.0

  return pprof_tree_statistics
221
222
def ParseCWPInclusiveCountFile(file_name):
  """Parses the CWP inclusive count files.

  A line should contain the name of the function, the file name with the
  declaration, the inclusive count and inclusive count fraction out of the
  total extracted inclusive count values.

  Args:
    file_name: The file containing the inclusive count values of the CWP
      functions. It is read as a comma separated file with a header naming
      the function, file, dso, inclusive_count and inclusive_count_fraction
      columns.

  Returns:
    A dict containing the inclusive count statistics. The key is the name of
    the function and the file name, comma separated. The value represents a
    tuple with the object name containing the function declaration, the
    inclusive count and inclusive count fraction values, and a marker to
    identify if the function is present in one of the benchmark profiles.
  """
  cwp_inclusive_count_statistics = defaultdict(lambda: ('', 0, 0.0, 0))

  with open(file_name) as input_file:
    statistics_reader = csv.DictReader(input_file, delimiter=',')
    for statistic in statistics_reader:
      function_name = statistic['function']
      raw_file_name = statistic['file']
      dso_name = statistic['dso']
      inclusive_count = statistic['inclusive_count']
      inclusive_count_fraction = statistic['inclusive_count_fraction']

      # We ignore the lines that have empty fields (i.e. they specify only the
      # addresses of the functions and the inclusive counts values).
      # The check is made on the raw file name: os.path.normpath turns an
      # empty path into '.', which would make an empty file field look valid.
      if not all((function_name, raw_file_name, dso_name, inclusive_count,
                  inclusive_count_fraction)):
        continue

      function_file_name = MakeCWPAndPprofFileNamesConsistent(
          os.path.normpath(raw_file_name))
      key = '%s,%s' % (function_name, function_file_name)

      # There might be situations where a function appears in multiple files
      # or objects. Such situations can occur when the Chrome OS version and
      # the name of the board are not specified in the Dremel queries (i.e.
      # the files can belong to different kernel or library versions).
      # The values of such a function are added up. The object name of the
      # last parsed line is kept.
      previous_statistic = cwp_inclusive_count_statistics[key]
      inclusive_count_sum = previous_statistic[1] + int(inclusive_count)
      inclusive_count_fraction_sum = \
          previous_statistic[2] + float(inclusive_count_fraction)

      # All the functions are initially marked as EXTRA_FUNCTION.
      cwp_inclusive_count_statistics[key] = \
          (dso_name, inclusive_count_sum, inclusive_count_fraction_sum,
           EXTRA_FUNCTION)

  return cwp_inclusive_count_statistics
278
279
def ParseCWPPairwiseInclusiveCountFile(file_name):
  """Reads the CWP pairwise inclusive count statistics from a csv file.

  Every row of the file should hold a parent and a child function joined by
  the PARENT_CHILD_FUNCTIONS_SEPARATOR, the name of the file where the child
  function is declared and the inclusive count of the pair of functions.

  Args:
    file_name: The file containing the pairwise inclusive count statistics
      of the CWP functions.

  Returns:
    A dict mapping the name of every parent function to a dict of its child
    functions. The key of the inner dict is the name of the child function
    and its file name, separated by a ','. The value is the sum of the
    inclusive count values of the parent-child function pair.
  """
  parent_to_children = defaultdict(lambda: defaultdict(float))

  with open(file_name) as csv_file:
    for row in csv.DictReader(csv_file, delimiter=','):
      parent_name, child_name = row['parent_child_functions'].split(
          PARENT_CHILD_FUNCTIONS_SEPARATOR)
      child_file = MakeCWPAndPprofFileNamesConsistent(
          os.path.normpath(row['child_function_file']))
      pair_count = row['inclusive_count']

      # The same child function can show up in several files or objects:
      # the Dremel queries may not pin the Chrome OS version and the board
      # (so the files can come from different kernel or library versions),
      # the child can be a template function declared in a header file, or
      # several executable objects can have colliding names.
      # The counts of a pair that appears more than once are added up.
      child_key = ','.join((child_name, child_file))
      parent_to_children[parent_name][child_key] += float(pair_count)

  return parent_to_children
328