# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utility functions for parsing pprof, CWP data and Chrome OS groups files."""

from collections import defaultdict

import csv
import os
import re

SEPARATOR_REGEX = re.compile(r'-+\+-+')
FUNCTION_STATISTIC_REGEX = \
    re.compile(r'(\S+)\s+(\S+)%\s+(\S+)%\s+(\S+)\s+(\S+)%')
CHILD_FUNCTION_PERCENTAGE_REGEX = re.compile(r'([0-9.]+)%')
FUNCTION_KEY_SEPARATOR_REGEX = re.compile(r'\|\s+')
# Constants used to identify if a function is common in the pprof and CWP
# files.
COMMON_FUNCTION = 'common'
EXTRA_FUNCTION = 'extra'
PARENT_CHILD_FUNCTIONS_SEPARATOR = ';;'
# List of pairs of strings used to make substitutions in file names so that
# the CWP and pprof data are consistent.
FILE_NAME_REPLACING_PAIR_STRINGS = [('gnawty', 'BOARD'),
                                    ('amd64-generic', 'BOARD'),
                                    (' ../sysdeps', ',sysdeps'),
                                    (' ../nptl', ',nptl'),
                                    (' aes-x86_64.s', ',aes-x86_64.s'),
                                    (' (inline)', ''),
                                    (' (partial-inline)', ''),
                                    (' ../', ','),
                                    ('../', '')]
# Separator used to delimit the function from the file name.
FUNCTION_FILE_SEPARATOR = ' /'


def MakeCWPAndPprofFileNamesConsistent(file_name):
  """Makes the CWP and pprof file names consistent.

  For the same function, it may happen for some file paths to differ slightly
  in the CWP data compared to the pprof output. Every ', ' is first replaced
  with '; ', then, for each pair in FILE_NAME_REPLACING_PAIR_STRINGS (in list
  order), every occurrence of the first element is replaced with the second.

  Args:
    file_name: A string representing the name of the file.

  Returns:
    A string representing the modified name of the file.
  """
  file_name = file_name.replace(', ', '; ')
  # The order of the pairs matters: the more specific patterns (for example
  # ' ../sysdeps') come before the generic ones (' ../' and '../').
  for old_string, new_string in FILE_NAME_REPLACING_PAIR_STRINGS:
    file_name = file_name.replace(old_string, new_string)

  return file_name


def MakePprofFunctionKey(function_and_file_name):
  """Creates the function key from the function and file name.

  Parsing the pprof --top and --tree outputs is difficult due to the fact
  that it is hard to extract the function and file name (i.e. the function
  names can have a lot of unexpected characters such as spaces, operators
  etc). For the moment, we use FUNCTION_FILE_SEPARATOR as the delimiter
  between the function and the file name. However, there are some cases where
  the file name does not start with / and we treat these cases separately
  (i.e. ../sysdeps, ../nptl, aes-x86_64.s).

  Args:
    function_and_file_name: A string representing the function and the file
      name as it appears in the pprof output.

  Returns:
    A string representing the function key, composed from the function and
    file name, comma separated.
  """
  # TODO(evelinad): Use pprof --topproto instead of pprof --top to parse
  # protobuffers instead of text output. Investigate if there is an equivalent
  # for pprof --tree that gives protobuffer output.
  #
  # In the CWP output, we replace the , with ; as a workaround for parsing
  # csv files. We do the same for the pprof output.
  #
  # TODO(evelinad): Use dremel --csv_dialect=excel-tab in the queries for
  # replacing the , delimiter with tab.
  function_and_file_name = function_and_file_name.replace(', ', '; ')

  # If the sequence does not contain the FUNCTION_FILE_SEPARATOR, we only do
  # the string substitutions.
  if FUNCTION_FILE_SEPARATOR not in function_and_file_name:
    return MakeCWPAndPprofFileNamesConsistent(function_and_file_name)

  # Otherwise we normalize the path name of the file and make the string
  # substitutions that keep the CWP and pprof data consistent. The separator
  # may also occur inside the function name, so we split only on its last
  # occurrence; an unbounded split would yield more than two parts and make
  # the unpacking below fail.
  function_name, file_name = \
      function_and_file_name.rsplit(FUNCTION_FILE_SEPARATOR, 1)
  # The split consumed the leading '/' of the path, so it is added back
  # before normalizing.
  file_name = \
      MakeCWPAndPprofFileNamesConsistent(os.path.normpath('/' + file_name))
  return ','.join([function_name, file_name])


def ComputeCWPCummulativeInclusiveStatistics(cwp_inclusive_count_statistics):
  """Computes the cumulative inclusive count value of a function.

  A function might appear declared in multiple files or objects. When
  computing the fraction of the inclusive count value from a child function to
  the parent function, we take into consideration the sum of the inclusive
  count values from all the occurrences of that function.

  Args:
    cwp_inclusive_count_statistics: A dict containing the inclusive count
      statistics extracted by the ParseCWPInclusiveCountFile method.

  Returns:
    A dict having as a key the name of the function and as a value the sum of
    the inclusive count values of the occurrences of the function from all
    the files and objects.
  """
  cwp_inclusive_count_statistics_cumulative = defaultdict(int)

  # items() is available on both Python 2 and Python 3 dicts, unlike
  # iteritems().
  for function_key, function_statistics \
      in cwp_inclusive_count_statistics.items():
    # The key is '<function>,<file>'. The file part may itself contain commas
    # (some of the file name substitutions introduce them), so only the text
    # before the first comma is taken as the function name.
    function_name = function_key.partition(',')[0]
    # Index 1 of the statistics tuple is the inclusive count.
    cwp_inclusive_count_statistics_cumulative[function_name] += \
        function_statistics[1]

  return cwp_inclusive_count_statistics_cumulative


def ComputeCWPChildFunctionsFractions(cwp_inclusive_count_statistics_cumulative,
                                      cwp_pairwise_inclusive_count_statistics):
  """Computes the fractions of the inclusive count values for child functions.

  The fraction represents the inclusive count value of a child function over
  the one of the parent function.

  Args:
    cwp_inclusive_count_statistics_cumulative: A dict containing the
      cumulative inclusive count values of the CWP functions.
    cwp_pairwise_inclusive_count_statistics: A dict containing the inclusive
      count statistics for pairs of parent and child functions. The key is the
      parent function. The value is a dict with the key the name of the child
      function and the file name, comma separated, and the value is the
      inclusive count value of the pair of parent and child functions.

  Returns:
    A dict containing the inclusive count statistics for pairs of parent
    and child functions. The key is the parent function. The value is a
    dict with the key the name of the child function and the file name,
    comma separated, and the value is the inclusive count fraction of the
    child function out of the parent function. The fraction is 0.0 when the
    parent function is missing from the cumulative statistics or when its
    cumulative inclusive count is zero.
  """
  pairwise_inclusive_count_fractions = {}

  for parent_function_key, child_functions_metrics in \
      cwp_pairwise_inclusive_count_statistics.items():
    parent_function_inclusive_count = \
        cwp_inclusive_count_statistics_cumulative.get(parent_function_key, 0)
    child_functions_fractions = {}

    for child_function_key, child_function_inclusive_count \
        in child_functions_metrics.items():
      if parent_function_inclusive_count:
        # float() forces true division even for integer counts on Python 2.
        child_functions_fractions[child_function_key] = \
            float(child_function_inclusive_count) / \
            parent_function_inclusive_count
      else:
        # Unknown parent or zero count: report 0.0 instead of dividing by 0.
        child_functions_fractions[child_function_key] = 0.0

    pairwise_inclusive_count_fractions[parent_function_key] = \
        child_functions_fractions

  return pairwise_inclusive_count_fractions


def ParseFunctionGroups(cwp_function_groups_lines):
  """Parses the contents of the function groups file.

  Args:
    cwp_function_groups_lines: A list of the lines contained in the CWP
      function groups file. A line contains the group name and the file path
      that describes the group, separated by a space. Blank lines are ignored.

  Returns:
    A list of tuples containing the group name and the file path.
  """
  # The order of the groups mentioned in the cwp_function_groups file
  # matters. A function declared in a file will belong to the first
  # mentioned group that matches its path to the one of the file.
  # It is possible to have multiple paths that belong to the same group.
  return [
      tuple(line.split()) for line in cwp_function_groups_lines
      if line.strip()
  ]


def ParsePprofTopOutput(file_name):
  """Parses a file that contains the output of the pprof --top command.

  Args:
    file_name: The name of the file containing the pprof --top output.

  Returns:
    A dict having as a key the name of the function and the file containing
    the declaration of the function, separated by a comma, and as a value
    a tuple containing the flat, flat percentage, sum percentage, cumulative
    and cumulative percentage values. The percentages are stored as strings
    holding the fraction (percentage divided by 100).
  """
  pprof_top_statistics = {}

  # The first 6 lines of the pprof top output are skipped; the statistics of
  # the functions follow them.
  with open(file_name) as input_file:
    pprof_top_content = input_file.readlines()[6:]

  for line in pprof_top_content:
    # Drop only the line terminator, so that a last line without a trailing
    # newline does not lose its final character.
    line = line.rstrip('\r\n')
    function_statistic_match = FUNCTION_STATISTIC_REGEX.search(line)
    if function_statistic_match is None:
      # Blank or otherwise unparsable line; nothing to record.
      continue

    flat, flat_p, sum_p, cum, cum_p = function_statistic_match.groups()
    flat_p = str(float(flat_p) / 100.0)
    sum_p = str(float(sum_p) / 100.0)
    cum_p = str(float(cum_p) / 100.0)
    # The function and file name start two characters after the statistics.
    lookup_index = function_statistic_match.end() + 2
    key = MakePprofFunctionKey(line[lookup_index:])
    pprof_top_statistics[key] = (flat, flat_p, sum_p, cum, cum_p)

  return pprof_top_statistics


def ParsePprofTreeOutput(file_name):
  """Parses a file that contains the output of the pprof --tree command.

  Args:
    file_name: The name of the file containing the pprof --tree output.

  Returns:
    A dict including the statistics for pairs of parent and child functions.
    The key is the name of the parent function and the file where the
    function is declared, separated by a comma. The value is a dict having as
    a key the name of the child function and the file where the function is
    declared, comma separated and as a value the percentage of time the
    parent function spends in the child function.
  """
  # The first 9 lines of the pprof tree output are skipped; the statistics of
  # the functions follow them.
  with open(file_name) as input_file:
    pprof_tree_content = input_file.readlines()[9:]

  pprof_tree_statistics = defaultdict(lambda: defaultdict(float))
  track_child_functions = False
  parent_function_key = None

  # The statistics of a given function, its parent and child functions are
  # included between two separator marks.
  # All the parent function statistics are above the line containing the
  # statistics of the given function.
  # All the statistics of a child function are below the statistics of the
  # given function.
  # The statistics of a parent or a child function contain the calls, calls
  # percentage, the function name and the file where the function is declared.
  # The statistics of the given function contain the flat, flat percentage,
  # sum percentage, cumulative, cumulative percentage, function name and the
  # name of the file containing the declaration of the function.
  for line in pprof_tree_content:
    # Drop only the line terminator, so that a last line without a trailing
    # newline does not lose its final character.
    line = line.rstrip('\r\n')

    if SEPARATOR_REGEX.search(line):
      track_child_functions = False
      continue

    # A line carrying the full set of statistics is the "given function" of
    # the current section; the lines below it describe its child functions.
    parent_function_statistic_match = FUNCTION_STATISTIC_REGEX.search(line)

    if parent_function_statistic_match:
      parent_function_key_match = FUNCTION_KEY_SEPARATOR_REGEX.search(
          line, pos=parent_function_statistic_match.end())
      if parent_function_key_match is None:
        # The key cannot be extracted, so the following child lines must not
        # be attributed to a stale parent key.
        track_child_functions = False
        continue
      parent_function_key = \
          MakePprofFunctionKey(line[parent_function_key_match.end():])
      track_child_functions = True
      continue

    if not track_child_functions:
      continue

    child_function_statistic_match = \
        CHILD_FUNCTION_PERCENTAGE_REGEX.search(line)
    if child_function_statistic_match is None:
      # Blank or otherwise unparsable line inside a section.
      continue
    child_function_key_match = FUNCTION_KEY_SEPARATOR_REGEX.search(
        line, pos=child_function_statistic_match.end())
    if child_function_key_match is None:
      continue

    child_function_percentage = \
        float(child_function_statistic_match.group(1))
    child_function_key = \
        MakePprofFunctionKey(line[child_function_key_match.end():])

    # The same child can be listed several times; the fractions are summed.
    pprof_tree_statistics[parent_function_key][child_function_key] += \
        child_function_percentage / 100.0

  return pprof_tree_statistics


def ParseCWPInclusiveCountFile(file_name):
  """Parses the CWP inclusive count files.

  A line should contain the name of the function, the file name with the
  declaration, the inclusive count and inclusive count fraction out of the
  total extracted inclusive count values.

  Args:
    file_name: The file containing the inclusive count values of the CWP
      functions.

  Returns:
    A dict containing the inclusive count statistics. The key is the name of
    the function and the file name, comma separated. The value represents a
    tuple with the object name containing the function declaration, the
    inclusive count and inclusive count fraction values, and a marker to
    identify if the function is present in one of the benchmark profiles.
  """
  cwp_inclusive_count_statistics = defaultdict(lambda: ('', 0, 0.0, 0))

  with open(file_name) as input_file:
    statistics_reader = csv.DictReader(input_file, delimiter=',')
    for statistic in statistics_reader:
      function_name = statistic['function']
      raw_file_name = statistic['file']
      dso_name = statistic['dso']
      inclusive_count = statistic['inclusive_count']
      inclusive_count_fraction = statistic['inclusive_count_fraction']

      # We ignore the lines that have empty fields (i.e. they specify only the
      # addresses of the functions and the inclusive count values). The file
      # field is tested before normalization because os.path.normpath('')
      # returns '.', which would hide an empty field.
      if not all([
          function_name, raw_file_name, dso_name, inclusive_count,
          inclusive_count_fraction
      ]):
        continue

      source_file_name = MakeCWPAndPprofFileNamesConsistent(
          os.path.normpath(raw_file_name))
      if not source_file_name:
        # The substitutions can reduce a file name to the empty string.
        continue

      key = '%s,%s' % (function_name, source_file_name)

      # There might be situations where a function appears in multiple files
      # or objects. Such situations can occur when the Chrome OS version and
      # the name of the board are not specified in the Dremel queries (i.e.
      # the files can belong to different kernel or library versions).
      _, previous_count, previous_fraction, _ = \
          cwp_inclusive_count_statistics[key]
      inclusive_count_sum = previous_count + int(inclusive_count)
      inclusive_count_fraction_sum = \
          previous_fraction + float(inclusive_count_fraction)

      # All the functions are initially marked as EXTRA_FUNCTION. The object
      # name kept is the one from the most recent row for this key.
      cwp_inclusive_count_statistics[key] = \
          (dso_name, inclusive_count_sum, inclusive_count_fraction_sum,
           EXTRA_FUNCTION)

  return cwp_inclusive_count_statistics


def ParseCWPPairwiseInclusiveCountFile(file_name):
  """Parses the CWP pairwise inclusive count files.

  A line of the file should contain a pair of a parent and a child function,
  concatenated by the PARENT_CHILD_FUNCTIONS_SEPARATOR, the name of the file
  where the child function is declared and the inclusive count fractions of
  the pair of functions out of the total amount of inclusive count values.

  Args:
    file_name: The file containing the pairwise inclusive_count statistics of
      the CWP functions.

  Returns:
    A dict containing the statistics of the parent functions and each of
    their child functions. The key of the dict is the name of the parent
    function. The value is a dict having as a key the name of the child
    function with its file name separated by a ',' and as a value the
    inclusive count value of the parent-child function pair.
  """
  pairwise_inclusive_count_statistics = defaultdict(lambda: defaultdict(float))

  with open(file_name) as input_file:
    statistics_reader = csv.DictReader(input_file, delimiter=',')

    for statistic in statistics_reader:
      parent_function_name, child_function_name = \
          statistic['parent_child_functions'].split(
              PARENT_CHILD_FUNCTIONS_SEPARATOR)
      child_function_file_name = MakeCWPAndPprofFileNamesConsistent(
          os.path.normpath(statistic['child_function_file']))
      inclusive_count = statistic['inclusive_count']

      # There might be situations where a child function appears in
      # multiple files or objects. Such situations can occur when the
      # Chrome OS version and the name of the board are not specified in the
      # Dremel queries (i.e. the files can belong to different kernel or
      # library versions), when the child function is a template function
      # that is declared in a header file or there are name collisions
      # between multiple executable objects.
      # If a pair of child and parent functions appears multiple times, we
      # add their inclusive count values.
      child_function_key = ','.join(
          [child_function_name, child_function_file_name])
      pairwise_inclusive_count_statistics[parent_function_name] \
          [child_function_key] += float(inclusive_count)

  return pairwise_inclusive_count_statistics