1# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4"""Utility functions for parsing pprof, CWP data and Chrome OS groups files."""
5
6from collections import defaultdict
7
8import csv
9import os
10import re
11
# Matches the horizontal rule (dashes, a '+', dashes) that delimits the entries
# of the pprof --tree output.
SEPARATOR_REGEX = re.compile(r'-+\+-+')
# Matches the five statistic columns of a function entry. The captured groups
# are unpacked by the parsers below as: flat, flat percentage, sum percentage,
# cumulative and cumulative percentage.
FUNCTION_STATISTIC_REGEX = re.compile(
    r'(\S+)\s+(\S+)%\s+(\S+)%\s+(\S+)\s+(\S+)%')
# Captures the numeric part of a percentage (e.g. '53.85' out of '53.85%').
CHILD_FUNCTION_PERCENTAGE_REGEX = re.compile(r'([0-9.]+)%')
# Matches the '|' column delimiter together with the whitespace following it.
FUNCTION_KEY_SEPARATOR_REGEX = re.compile(r'\|\s+')

# Markers telling whether a function is common to the pprof and CWP files.
COMMON_FUNCTION = 'common'
EXTRA_FUNCTION = 'extra'

# Joins the parent and the child function names in the CWP pairwise files.
PARENT_CHILD_FUNCTIONS_SEPARATOR = ';;'

# Ordered (old, new) substitutions applied to file names to make the CWP and
# pprof data consistent. The order matters: every pair is applied on the
# result of the previous ones.
FILE_NAME_REPLACING_PAIR_STRINGS = [
    ('gnawty', 'BOARD'),
    ('amd64-generic', 'BOARD'),
    (' ../sysdeps', ',sysdeps'),
    (' ../nptl', ',nptl'),
    ('  aes-x86_64.s', ',aes-x86_64.s'),
    (' (inline)', ''),
    (' (partial-inline)', ''),
    (' ../', ','),
    ('../', ''),
]

# Separator used to delimit the function from the file name.
FUNCTION_FILE_SEPARATOR = ' /'
35
36
def MakeCWPAndPprofFileNamesConsistent(file_name):
  """Rewrites a file name so that the CWP and pprof variants agree.

  The path reported for the same function may differ slightly between the CWP
  data and the pprof output. To hide those differences, every ', ' is first
  turned into '; ' and then each (old, new) pair of
  FILE_NAME_REPLACING_PAIR_STRINGS is applied, in list order, on the result of
  the previous substitution.

  Args:
    file_name: A string representing the name of the file.

  Returns:
    A string representing the modified name of the file.
  """
  consistent_name = file_name.replace(', ', '; ')

  for old_text, new_text in FILE_NAME_REPLACING_PAIR_STRINGS:
    consistent_name = consistent_name.replace(old_text, new_text)

  return consistent_name
56
def MakePprofFunctionKey(function_and_file_name):
  """Builds the function key out of a pprof "function file" sequence.

  Extracting the function and the file name from the pprof --top and --tree
  outputs is difficult, because function names can hold unexpected characters
  (spaces, operators etc). For the moment FUNCTION_FILE_SEPARATOR is used as
  the delimiter between the function and the file name. File names that do
  not start with / (i.e ../sysdeps, ../nptl, aes-x86_64.s) do not contain the
  separator and are handled only through the string substitutions.

  Args:
    function_and_file_name: A string representing the function and the file name
      as it appears in the pprof output.

  Returns:
    A string representing the function key, composed from the function and file
    name, comma separated.

  Raises:
    ValueError: If FUNCTION_FILE_SEPARATOR occurs more than once in the input.
  """
  # TODO(evelinad): Use pprof --topproto instead of pprof --top to parse
  # protobuffers instead of text output. Investigate if there is an equivalent
  # for pprof --tree that gives protobuffer output.
  #
  # In the CWP output, the , is replaced with ; as a workaround for parsing
  # csv files. The same is done here for the pprof output.
  #
  # TODO(evelinad): Use dremel --csv_dialect=excel-tab in the queries for
  # replacing the , delimiter with tab.
  entry = function_and_file_name.replace(', ', '; ')

  # Without the separator there is no path to normalize, so only the string
  # substitutions are applied on the whole sequence.
  if FUNCTION_FILE_SEPARATOR not in entry:
    return MakeCWPAndPprofFileNamesConsistent(entry)

  # The separator swallows the leading '/' of the path, hence it is put back
  # before normalizing the path and applying the substitutions.
  function_name, relative_path = entry.split(FUNCTION_FILE_SEPARATOR)
  normalized_path = os.path.normpath('/' + relative_path)
  consistent_path = MakeCWPAndPprofFileNamesConsistent(normalized_path)

  return '%s,%s' % (function_name, consistent_path)
100
101
def ComputeCWPCummulativeInclusiveStatistics(cwp_inclusive_count_statistics):
  """Computes the cumulative inclusive count value of a function.

  A function might appear declared in multiple files or objects. When
  computing the fraction of the inclusive count value from a child function to
  the parent function, we take into consideration the sum of the inclusive
  count values from all the occurrences of that function.

  Args:
    cwp_inclusive_count_statistics: A dict containing the inclusive count
      statistics extracted by the ParseCWPInclusiveCountFile method. The key
      is the function name and the file name, comma separated. The value is a
      sequence whose element at index 1 is the inclusive count.

  Returns:
    A defaultdict(int) having as a key the name of the function and as a value
    the sum of the inclusive count values of the occurrences of the function
    from all the files and objects.
  """
  cwp_inclusive_count_statistics_cumulative = defaultdict(int)

  # items() is available on both Python 2 and Python 3 dicts.
  for function_key, function_statistics \
      in cwp_inclusive_count_statistics.items():
    # Only the first comma delimits the function name. The file name part may
    # contain commas of its own, since the file name substitutions introduce
    # them (e.g ' ../sysdeps' becomes ',sysdeps'). A key without any comma is
    # treated as a bare function name.
    function_name = function_key.split(',', 1)[0]
    cwp_inclusive_count_statistics_cumulative[function_name] += \
        function_statistics[1]

  return cwp_inclusive_count_statistics_cumulative
129
def ComputeCWPChildFunctionsFractions(cwp_inclusive_count_statistics_cumulative,
                                      cwp_pairwise_inclusive_count_statistics):
  """Computes the fractions of the inclusive count values for child functions.

  The fraction represents the inclusive count value of a child function over
  the one of the parent function. When the parent function has no cumulative
  inclusive count (it is missing or its count is zero), the fraction of each
  of its child functions is 0.0.

  Args:
    cwp_inclusive_count_statistics_cumulative: A dict containing the
      cumulative inclusive count values of the CWP functions.
    cwp_pairwise_inclusive_count_statistics: A dict containing the inclusive
      count statistics for pairs of parent and child functions. The key is the
      parent function. The value is a dict with the key the name of the child
      function and the file name, comma separated, and the value is the
      inclusive count value of the pair of parent and child functions.

  Returns:
      A dict containing the inclusive count statistics for pairs of parent
      and child functions. The key is the parent function. The value is a
      dict with the key the name of the child function and the file name,
      comma separated, and the value is the inclusive count fraction of the
      child function out of the parent function.
  """

  pairwise_inclusive_count_fractions = {}

  # items() is available on both Python 2 and Python 3 dicts.
  for parent_function_key, child_functions_metrics in \
      cwp_pairwise_inclusive_count_statistics.items():
    # get() is used so that a defaultdict argument is not populated with the
    # missing parent functions.
    parent_function_inclusive_count = \
        cwp_inclusive_count_statistics_cumulative.get(parent_function_key, 0)

    if parent_function_inclusive_count:
      # The explicit float conversion avoids the truncating integer division
      # of Python 2 when both counts are ints.
      parent_count = float(parent_function_inclusive_count)
      child_functions_fractions = {
          child_function_key: child_function_inclusive_count / parent_count
          for child_function_key, child_function_inclusive_count
          in child_functions_metrics.items()
      }
    else:
      # A missing or zero parent count would make the fraction undefined.
      child_functions_fractions = dict.fromkeys(child_functions_metrics, 0.0)

    pairwise_inclusive_count_fractions[parent_function_key] = \
        child_functions_fractions

  return pairwise_inclusive_count_fractions
175
def ParseFunctionGroups(cwp_function_groups_lines):
  """Parses the contents of the function groups file.

  Args:
    cwp_function_groups_lines: A list of the lines contained in the CWP
      function groups file. A line contains the group name and the file path
      that describes the group, separated by a space. Blank lines are
      ignored.

  Returns:
    A list of tuples containing the group name and the file path, in the
    order in which they appear in the input.
  """
  # The order of the groups mentioned in the cwp_function_groups file
  # matters. A function declared in a file will belong to the first
  # mentioned group that matches its path to the one of the file.
  # It is possible to have multiple paths that belong to the same group.
  function_groups = []

  for line in cwp_function_groups_lines:
    fields = line.split()
    # A blank line would otherwise yield an empty tuple, which cannot be
    # unpacked into a group name and a file path.
    if fields:
      function_groups.append(tuple(fields))

  return function_groups
192
193
def ParsePprofTopOutput(file_name):
  """Parses a file that contains the output of the pprof --top command.

  Lines that do not contain the function statistics (e.g. blank lines) are
  skipped.

  Args:
    file_name: The name of the file containing the pprof --top output.

  Returns:
    A dict having as a key the name of the function and the file containing
    the declaration of the function, separated by a comma, and as a value
    a tuple containing the flat, flat percentage, sum percentage, cumulative
    and cumulative percentage values. All the values are strings; the
    percentages are divided by 100.
  """

  pprof_top_statistics = {}

  # The first six lines of the pprof top output are skipped, the statistics
  # of the functions are read from the following lines.
  with open(file_name) as input_file:
    pprof_top_content = input_file.readlines()[6:]

  for line in pprof_top_content:
    function_statistic_match = FUNCTION_STATISTIC_REGEX.search(line)

    # search() returns None for a line without statistics.
    if not function_statistic_match:
      continue

    flat, flat_p, sum_p, cum, cum_p = function_statistic_match.groups()
    flat_p = str(float(flat_p) / 100.0)
    sum_p = str(float(sum_p) / 100.0)
    cum_p = str(float(cum_p) / 100.0)
    lookup_index = function_statistic_match.end()
    # The function and file name start two characters after the statistics.
    # Only the line terminator is removed, so that the last line of a file
    # without a trailing newline does not lose its final character.
    function_and_file_name = line.rstrip('\r\n')[lookup_index + 2:]
    key = MakePprofFunctionKey(function_and_file_name)
    pprof_top_statistics[key] = (flat, flat_p, sum_p, cum, cum_p)
  return pprof_top_statistics
225
226
def ParsePprofTreeOutput(file_name):
  """Parses a file that contains the output of the pprof --tree command.

  Lines that cannot be interpreted (e.g. blank lines, or lines without the
  '|' delimiter in front of the function name) are skipped.

  Args:
    file_name: The name of the file containing the pprof --tree output.

  Returns:
    A dict including the statistics for pairs of parent and child functions.
    The key is the name of the parent function and the file where the
    function is declared, separated by a comma. The value is a dict having as
    a key the name of the child function and the file where the function is
    declared, comma separated and as a value the percentage of time the
    parent function spends in the child function, divided by 100.
  """

  # The first nine lines of the pprof output are skipped, the statistics of
  # the functions are read from the following lines.
  with open(file_name) as input_file:
    pprof_tree_content = input_file.readlines()[9:]

  pprof_tree_statistics = defaultdict(lambda: defaultdict(float))
  track_child_functions = False
  parent_function_key = None

  # The statistics of a given function, its parent and child functions are
  # included between two separator marks.
  # All the parent function statistics are above the line containing the
  # statistics of the given function.
  # All the statistics of a child function are below the statistics of the
  # given function.
  # The statistics of a parent or a child function contain the calls, calls
  # percentage, the function name and the file where the function is declared.
  # The statistics of the given function contain the flat, flat percentage,
  # sum percentage, cumulative, cumulative percentage, function name and the
  # name of the file containing the declaration of the function.
  for line in pprof_tree_content:
    if SEPARATOR_REGEX.search(line):
      track_child_functions = False
      continue

    parent_function_statistic_match = FUNCTION_STATISTIC_REGEX.search(line)

    if parent_function_statistic_match:
      parent_function_key_match = FUNCTION_KEY_SEPARATOR_REGEX.search(
          line, pos=parent_function_statistic_match.end())
      # When the delimiter is missing the given function cannot be
      # identified, so the lines that follow are ignored up to the next
      # separator instead of being attributed to a stale function.
      track_child_functions = parent_function_key_match is not None
      if parent_function_key_match:
        # Only the line terminator is removed, so that a last line without a
        # trailing newline does not lose its final character.
        parent_function_key = MakePprofFunctionKey(
            line[parent_function_key_match.end():].rstrip('\r\n'))
      continue

    if not track_child_functions:
      continue

    child_function_statistic_match = \
        CHILD_FUNCTION_PERCENTAGE_REGEX.search(line)

    # search() returns None for a line without a percentage.
    if not child_function_statistic_match:
      continue

    child_function_key_match = FUNCTION_KEY_SEPARATOR_REGEX.search(
        line, pos=child_function_statistic_match.end())

    if not child_function_key_match:
      continue

    child_function_percentage = \
        float(child_function_statistic_match.group(1))
    child_function_key = MakePprofFunctionKey(
        line[child_function_key_match.end():].rstrip('\r\n'))

    # The same child can be listed several times for a function, in which
    # case the percentages are added.
    pprof_tree_statistics[parent_function_key][child_function_key] += \
        child_function_percentage / 100.0

  return pprof_tree_statistics
296
297
def ParseCWPInclusiveCountFile(file_name):
  """Parses the CWP inclusive count files.

  A line should contain the name of the function, the file name with the
  declaration, the inclusive count and inclusive count fraction out of the
  total extracted inclusive count values. The file is read as a CSV with a
  header naming the function, file, dso, inclusive_count and
  inclusive_count_fraction columns.

  Args:
    file_name: The file containing the inclusive count values of the CWP
      functions.

  Returns:
    A defaultdict containing the inclusive count statistics. The key is the
    name of the function and the file name, comma separated. The value
    represents a tuple with the object name containing the function
    declaration, the inclusive count and inclusive count fraction values, and
    a marker to identify if the function is present in one of the benchmark
    profiles.
  """
  cwp_inclusive_count_statistics = defaultdict(lambda: ('', 0, 0.0, 0))

  with open(file_name) as input_file:
    statistics_reader = csv.DictReader(input_file, delimiter=',')
    for statistic in statistics_reader:
      function_name = statistic['function']
      raw_file_name = statistic['file']
      dso_name = statistic['dso']
      inclusive_count = statistic['inclusive_count']
      inclusive_count_fraction = statistic['inclusive_count_fraction']

      # We ignore the lines that have empty fields(i.e they specify only the
      # addresses of the functions and the inclusive counts values).
      # The raw file field is checked before being normalized, because
      # os.path.normpath turns an empty path into '.', which would make an
      # empty field look populated.
      if not all([
          function_name, raw_file_name, dso_name, inclusive_count,
          inclusive_count_fraction
      ]):
        continue

      function_file_name = MakeCWPAndPprofFileNamesConsistent(
          os.path.normpath(raw_file_name))

      # The substitutions may leave nothing of the file name.
      if not function_file_name:
        continue

      key = '%s,%s' % (function_name, function_file_name)

      # There might be situations where a function appears in multiple files
      # or objects. Such situations can occur when in the Dremel queries there
      # are not specified the Chrome OS version and the name of the board (i.e
      # the files can belong to different kernel or library versions).
      previous_statistics = cwp_inclusive_count_statistics[key]
      inclusive_count_sum = previous_statistics[1] + int(inclusive_count)
      inclusive_count_fraction_sum = \
          previous_statistics[2] + float(inclusive_count_fraction)

      # All the functions are initially marked as EXTRA_FUNCTION.
      cwp_inclusive_count_statistics[key] = \
          (dso_name, inclusive_count_sum, inclusive_count_fraction_sum,
           EXTRA_FUNCTION)

  return cwp_inclusive_count_statistics
353
354
def ParseCWPPairwiseInclusiveCountFile(file_name):
  """Parses the CWP pairwise inclusive count files.

  The file is read as a CSV with a header. Each row provides, in the
  parent_child_functions column, a parent and a child function joined by the
  PARENT_CHILD_FUNCTIONS_SEPARATOR, in the child_function_file column the name
  of the file where the child function is declared and, in the
  inclusive_count column, the inclusive count of the pair of functions.

  Args:
    file_name: The file containing the pairwise inclusive_count statistics of
      the CWP functions.

  Returns:
    A dict containing the statistics of the parent functions and each of
    their child functions. The key of the dict is the name of the parent
    function. The value is a dict having as a key the name of the child
    function with its file name separated by a ',' and as a value the
    inclusive count value of the parent-child function pair.
  """
  children_by_parent = defaultdict(lambda: defaultdict(float))

  with open(file_name) as input_file:
    for row in csv.DictReader(input_file, delimiter=','):
      parent_name, child_name = row['parent_child_functions'].split(
          PARENT_CHILD_FUNCTIONS_SEPARATOR)
      normalized_child_path = os.path.normpath(row['child_function_file'])
      child_file = MakeCWPAndPprofFileNamesConsistent(normalized_child_path)
      pair_count = row['inclusive_count']

      # A child function may show up in several files or objects: the Dremel
      # queries may not pin the Chrome OS version and the board (so the files
      # can belong to different kernel or library versions), the child may be
      # a template function declared in a header file, or there may be name
      # collisions between multiple executable objects.
      # The inclusive counts of a repeated parent-child pair are added up.
      child_key = '%s,%s' % (child_name, child_file)
      children_by_parent[parent_name][child_key] += float(pair_count)

  return children_by_parent
403