# utils.py revision c7faa09f456ca5c651ac373ad897aa4be6ad2717
# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utility functions for parsing pprof, CWP data and Chrome OS groups files."""

from collections import defaultdict

import csv
import os
import re

# Matches the dashed separator line that delimits entries in pprof --tree.
SEPARATOR_REGEX = re.compile(r'-+\+-+')
# Matches the "flat flat% sum% cum cum%" columns of a pprof statistics line.
FUNCTION_STATISTIC_REGEX = \
    re.compile(r'(\S+)\s+(\S+)%\s+(\S+)%\s+(\S+)\s+(\S+)%')
# Matches the percentage column of a child function line in pprof --tree.
CHILD_FUNCTION_PERCENTAGE_REGEX = re.compile(r'([0-9.]+)%')
# Matches the "| " marker that precedes the function name in pprof --tree.
FUNCTION_KEY_SEPARATOR_REGEX = re.compile(r'\|\s+')
# Constants used to identify if a function is common in the pprof and CWP
# files.
COMMON_FUNCTION = 'common'
EXTRA_FUNCTION = 'extra'
PARENT_CHILD_FUNCTIONS_SEPARATOR = ';;'
# List of pairs of strings used to make substitutions in file names so that
# the CWP and pprof data are consistent. The pairs are applied in order.
FILE_NAME_REPLACING_PAIR_STRINGS = [('/build/gnawty', '/build/BOARD'),
                                    ('/build/amd64-generic', '/build/BOARD'),
                                    (' ../sysdeps', ',sysdeps'),
                                    (' ../nptl', ',nptl'),
                                    (' aes-x86_64.s', ',aes-x86_64.s'),
                                    (' (inline)', ''),
                                    (' (partial-inline)', ''),
                                    (' ../', ','),
                                    ('../', '')]
# Separator used to delimit the function from the file name.
FUNCTION_FILE_SEPARATOR = ' /'


def MakeCWPAndPprofFileNamesConsistent(file_name):
  """Makes the CWP and pprof file names consistent.

  For the same function, it may happen for some file paths to differ slightly
  in the CWP data compared to the pprof output. In a file name, for each tuple
  element of FILE_NAME_REPLACING_PAIR_STRINGS, we substitute the first element
  with the second one. Before that, every ', ' is replaced by '; ' so that the
  name never clashes with the comma used as a key and csv delimiter.

  Args:
    file_name: A string representing the name of the file.

  Returns:
    A string representing the modified name of the file.
  """
  file_name = file_name.replace(', ', '; ')
  for old_string, new_string in FILE_NAME_REPLACING_PAIR_STRINGS:
    file_name = file_name.replace(old_string, new_string)

  return file_name


def MakePprofFunctionKey(function_and_file_name):
  """Creates the function key from the function and file name.

  Parsing the pprof --top and --tree outputs is difficult due to the fact
  that it is hard to extract the function and file name (i.e the function
  names can have a lot of unexpected characters such as spaces, operators
  etc). For the moment, we use FUNCTION_FILE_SEPARATOR as delimiter between
  the function and the file name. However, there are some cases where the file
  name does not start with / and we treat these cases separately (i.e
  ../sysdeps, ../nptl, aes-x86_64.s).

  Args:
    function_and_file_name: A string representing the function and the file
      name as it appears in the pprof output.

  Returns:
    A string representing the function key, composed from the function and
    file name, comma separated.
  """
  # TODO(evelinad): Use pprof --topproto instead of pprof --top to parse
  # protobuffers instead of text output. Investigate if there is an equivalent
  # for pprof --tree that gives protobuffer output.
  #
  # In the CWP output, we replace the , with ; as a workaround for parsing
  # csv files. We do the same for the pprof output.
  #
  # TODO(evelinad): Use dremel --csv_dialect=excel-tab in the queries for
  # replacing the , delimiter with tab.
  function_and_file_name = function_and_file_name.replace(', ', '; ')

  # Without the FUNCTION_FILE_SEPARATOR we can only do the string
  # substitutions on the whole sequence.
  if FUNCTION_FILE_SEPARATOR not in function_and_file_name:
    return MakeCWPAndPprofFileNamesConsistent(function_and_file_name)

  # The file name is the trailing component, so we split on the last
  # separator only. A function name that itself contains the separator (e.g.
  # "operator /") then stays intact instead of breaking the unpacking.
  function_name, file_name = \
      function_and_file_name.rsplit(FUNCTION_FILE_SEPARATOR, 1)
  # The split consumed the leading / of the path, so we add it back before
  # normalizing the path and applying the substitutions.
  file_name = \
      MakeCWPAndPprofFileNamesConsistent(os.path.normpath('/' + file_name))
  return ','.join([function_name, file_name])


def ParseFunctionGroups(cwp_function_groups_lines):
  """Parses the contents of the function groups file.

  Args:
    cwp_function_groups_lines: A list of the lines contained in the CWP
      function groups file. A line contains the group name and the file path
      that describes the group, separated by a space. Blank lines are skipped.

  Returns:
    A list of tuples containing the group name and the file path.
  """
  # The order of the groups mentioned in the cwp_function_groups file
  # matters. A function declared in a file will belong to the first
  # mentioned group that matches its path to the one of the file.
  # It is possible to have multiple paths that belong to the same group.
  return [
      tuple(line.split()) for line in cwp_function_groups_lines
      if line.strip()
  ]


def ParsePprofTopOutput(file_name):
  """Parses a file that contains the output of the pprof --top command.

  Lines that do not contain function statistics (e.g. blank lines) are
  skipped.

  Args:
    file_name: The name of the file containing the pprof --top output.

  Returns:
    A dict having as a key the name of the function and the file containing
    the declaration of the function, separated by a comma, and as a value
    a tuple containing the flat, flat percentage, sum percentage, cumulative
    and cumulative percentage values. The percentages are converted to
    fractions and stored as strings.
  """
  pprof_top_statistics = {}

  # The first 6 lines of the pprof top output are a header, the statistics of
  # the functions come after them.
  with open(file_name) as input_file:
    pprof_top_content = input_file.readlines()[6:]

  for line in pprof_top_content:
    function_statistic_match = FUNCTION_STATISTIC_REGEX.search(line)
    if function_statistic_match is None:
      continue

    flat, flat_p, sum_p, cum, cum_p = function_statistic_match.groups()
    flat_p = str(float(flat_p) / 100.0)
    sum_p = str(float(sum_p) / 100.0)
    cum_p = str(float(cum_p) / 100.0)
    # The function and file name start two characters after the statistics.
    # We strip only a real newline, so that a last line without one does not
    # lose the final character of its file name.
    lookup_index = function_statistic_match.end() + 2
    function_and_file_name = line[lookup_index:].rstrip('\n')
    key = MakePprofFunctionKey(function_and_file_name)
    pprof_top_statistics[key] = (flat, flat_p, sum_p, cum, cum_p)

  return pprof_top_statistics


def ParsePprofTreeOutput(file_name):
  """Parses a file that contains the output of the pprof --tree command.

  Lines that match none of the expected formats (e.g. blank lines) are
  skipped.

  Args:
    file_name: The name of the file containing the pprof --tree output.

  Returns:
    A dict including the statistics for pairs of parent and child functions.
    The key is the name of the parent function and the file where the
    function is declared, separated by a comma. The value is a dict having as
    a key the name of the child function and the file where the function is
    declared, comma separated and as a value the fraction of time the
    parent function spends in the child function.
  """
  # The first 9 lines of the pprof tree output are a header, the statistics
  # of the functions come after them.
  with open(file_name) as input_file:
    pprof_tree_content = input_file.readlines()[9:]

  pprof_tree_statistics = defaultdict(lambda: defaultdict(float))
  track_child_functions = False
  parent_function_key = None

  # The statistics of a given function, its parent and child functions are
  # included between two separator marks.
  # All the parent function statistics are above the line containing the
  # statistics of the given function.
  # All the statistics of a child function are below the statistics of the
  # given function.
  # The statistics of a parent or a child function contain the calls, calls
  # percentage, the function name and the file where the function is declared.
  # The statistics of the given function contain the flat, flat percentage,
  # sum percentage, cumulative, cumulative percentage, function name and the
  # name of the file containing the declaration of the function.
  for line in pprof_tree_content:
    if SEPARATOR_REGEX.search(line):
      track_child_functions = False
      continue

    parent_function_statistic_match = FUNCTION_STATISTIC_REGEX.search(line)

    if parent_function_statistic_match:
      parent_function_key_match = FUNCTION_KEY_SEPARATOR_REGEX.search(
          line, pos=parent_function_statistic_match.end())
      if parent_function_key_match is None:
        # Without the "| " marker we cannot tell where the function name
        # starts, so we ignore this entry and its child functions.
        track_child_functions = False
        continue
      lookup_index = parent_function_key_match.end()
      parent_function_key = \
          MakePprofFunctionKey(line[lookup_index:].rstrip('\n'))
      track_child_functions = True
      continue

    # The lines above the given function describe its callers, which we do
    # not record.
    if not track_child_functions:
      continue

    child_function_statistic_match = \
        CHILD_FUNCTION_PERCENTAGE_REGEX.search(line)
    if child_function_statistic_match is None:
      continue
    child_function_key_match = FUNCTION_KEY_SEPARATOR_REGEX.search(
        line, pos=child_function_statistic_match.end())
    if child_function_key_match is None:
      continue

    child_function_percentage = float(child_function_statistic_match.group(1))
    lookup_index = child_function_key_match.end()
    child_function_key = \
        MakePprofFunctionKey(line[lookup_index:].rstrip('\n'))

    pprof_tree_statistics[parent_function_key][child_function_key] += \
        child_function_percentage / 100.0

  return pprof_tree_statistics


def ParseCWPInclusiveCountFile(file_name):
  """Parses the CWP inclusive count files.

  A line should contain the name of the function, the file name with the
  declaration, the inclusive count and inclusive count fraction out of the
  total extracted inclusive count values.

  Args:
    file_name: The file containing the inclusive count values of the CWP
      functions.

  Returns:
    A dict containing the inclusive count statistics. The key is the name of
    the function and the file name, comma separated. The value represents a
    tuple with the object name containing the function declaration, the
    inclusive count and inclusive count fraction values, and a marker to
    identify if the function is present in one of the benchmark profiles.
  """
  cwp_inclusive_count_statistics = defaultdict(lambda: ('', 0, 0.0, 0))

  with open(file_name) as input_file:
    statistics_reader = csv.DictReader(input_file, delimiter=',')
    for statistic in statistics_reader:
      function_name = statistic['function']
      raw_function_file_name = statistic['file']
      dso_name = statistic['dso']
      inclusive_count = statistic['inclusive_count']
      inclusive_count_fraction = statistic['inclusive_count_fraction']

      # We ignore the lines that have empty fields (i.e they specify only the
      # addresses of the functions and the inclusive counts values).
      # The file field has to be tested before it is normalized, because
      # os.path.normpath turns an empty path into '.'.
      if not raw_function_file_name:
        continue
      function_file_name = MakeCWPAndPprofFileNamesConsistent(
          os.path.normpath(raw_function_file_name))
      if not all([
          function_name, function_file_name, dso_name, inclusive_count,
          inclusive_count_fraction
      ]):
        continue

      key = '%s,%s' % (function_name, function_file_name)

      # There might be situations where a function appears in multiple files
      # or objects. Such situations can occur when in the Dremel queries there
      # are not specified the Chrome OS version and the name of the board (i.e
      # the files can belong to different kernel or library versions).
      # In that case we add up the values and keep the last object name.
      previous_statistics = cwp_inclusive_count_statistics[key]
      inclusive_count_sum = previous_statistics[1] + int(inclusive_count)
      inclusive_count_fraction_sum = \
          previous_statistics[2] + float(inclusive_count_fraction)

      # All the functions are initially marked as EXTRA_FUNCTION.
      cwp_inclusive_count_statistics[key] = \
          (dso_name, inclusive_count_sum, inclusive_count_fraction_sum,
           EXTRA_FUNCTION)

  return cwp_inclusive_count_statistics


def ParseCWPPairwiseInclusiveCountFile(file_name):
  """Parses the CWP pairwise inclusive count files.

  A line of the file should contain a pair of a parent and a child function,
  concatenated by the PARENT_CHILD_FUNCTIONS_SEPARATOR, the name of the file
  where the child function is declared and the inclusive count of the pair of
  functions.

  Args:
    file_name: The file containing the pairwise inclusive_count statistics of
      the CWP functions.

  Returns:
    A dict containing the statistics of the parent functions and each of
    their child functions. The key of the dict is the name of the parent
    function. The value is a dict having as a key the name of the child
    function with its file name separated by a ',' and as a value the
    inclusive count value of the parent-child function pair.
  """
  pairwise_inclusive_count_statistics = defaultdict(lambda: defaultdict(float))

  with open(file_name) as input_file:
    statistics_reader = csv.DictReader(input_file, delimiter=',')

    for statistic in statistics_reader:
      parent_function_name, child_function_name = \
          statistic['parent_child_functions'].split(
              PARENT_CHILD_FUNCTIONS_SEPARATOR)
      child_function_file_name = MakeCWPAndPprofFileNamesConsistent(
          os.path.normpath(statistic['child_function_file']))
      inclusive_count = float(statistic['inclusive_count'])

      # There might be situations where a child function appears in
      # multiple files or objects. Such situations can occur when in the
      # Dremel queries are not specified the Chrome OS version and the
      # name of the board (i.e the files can belong to different kernel or
      # library versions), when the child function is a template function
      # that is declared in a header file or there are name collisions
      # between multiple executable objects.
      # If a pair of child and parent functions appears multiple times, we
      # add their inclusive count values.
      child_function_key = ','.join(
          [child_function_name, child_function_file_name])
      child_statistics = \
          pairwise_inclusive_count_statistics[parent_function_name]
      child_statistics[child_function_key] += inclusive_count

  return pairwise_inclusive_count_statistics