1#!/usr/bin/env python
2#
3# Copyright (C) 2016 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
18"""annotate.py: annotate source files based on perf.data.
19"""
20
21
22import argparse
23import os
24import os.path
25import shutil
26import subprocess
27import sys
28
29from simpleperf_report_lib import *
30from utils import *
31
class SourceLine(object):
    """A source location: source file, function name and line number.

    The *_key properties return the values used to identify this location
    at file, function and line granularity.
    """
    def __init__(self, file, function, line):
        self.file, self.function, self.line = file, function, line

    @property
    def file_key(self):
        """Key of the source file: the file itself."""
        return self.file

    @property
    def function_key(self):
        """Key of the function: a (file, function) tuple."""
        key = (self.file, self.function)
        return key

    @property
    def line_key(self):
        """Key of the line: a (file, line) tuple."""
        key = (self.file, self.line)
        return key
49
50
# TODO: addr2line can't reliably convert a function_start_address to
# source_file:line for java code, because in the .debug_line section there
# is some distance between the function_start_address and the address of
# the first instruction that can be mapped to a source line.
class Addr2Line(object):
    """Collect information of how to map [dso_name,vaddr] to [source_file:line].

    Usage: register addresses with add_addr(), call convert_addrs_to_lines()
    once to run addr2line on them, then query results with get_sources().
    """
    def __init__(self, addr2line_path, symfs_dir=None):
        # Map from dso_name to a dict, which maps each registered address to
        # None before conversion and to a list of SourceLine afterwards.
        self.dso_dict = dict()
        self.addr2line_path = addr2line_path
        self.symfs_dir = symfs_dir
        # List of source files; id 0 is reserved for '' (unknown file).
        self.file_list = ['']
        # Map from file to id with file_list[id] == file.
        self.file_dict = {'': 0}


    def add_addr(self, dso_name, addr):
        """Register an address of dso_name to be converted later."""
        # Never overwrite an existing entry, so re-adding an address is a no-op.
        self.dso_dict.setdefault(dso_name, dict()).setdefault(addr, None)


    def convert_addrs_to_lines(self):
        """Run addr2line on all registered addresses and store the results."""
        # Start from a fresh file table containing only the unknown file.
        self.file_list = ['']
        self.file_dict = {'': 0}

        for dso_name, dso in self.dso_dict.items():
            self._convert_addrs_to_lines(dso_name, dso)
        self._combine_source_files()


    def _convert_addrs_to_lines(self, dso_name, dso):
        """Fill dso (a dict keyed by address) with lists of SourceLine.

        If the binary for dso_name can't be found, a warning is logged and
        all addresses of this dso are dropped. The SourceLine objects stored
        here hold file ids (indexes into self.file_list), not file names.
        """
        dso_path = self._find_dso_path(dso_name)
        if dso_path is None:
            log_warning("can't find dso '%s'" % dso_name)
            dso.clear()
            return
        addrs = sorted(dso.keys())
        addr_str = '\n'.join('0x%x' % addr for addr in addrs)
        # universal_newlines=True makes the pipes text based, so the same
        # code works when str is not a byte string.
        subproc = subprocess.Popen([self.addr2line_path, '-e', dso_path, '-aifC'],
                                   stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                   universal_newlines=True)
        (stdoutdata, _) = subproc.communicate(addr_str)
        stdoutdata = stdoutdata.strip().split('\n')
        if len(stdoutdata) < len(addrs):
            log_fatal("addr2line didn't output enough lines")
        frames_per_addr = self._parse_addr2line_output(addrs, stdoutdata)
        for addr, frames in zip(addrs, frames_per_addr):
            # An unknown file is reported as '', which always has id 0.
            dso[addr] = [SourceLine(self._get_file_id(file_name), function, line)
                         for function, file_name, line in frames]


    @staticmethod
    def _parse_addr2line_output(addrs, output_lines):
        """Split the output of `addr2line -aif` into one frame list per address.

        output_lines is expected to contain, for each address in addrs and in
        the same order: one line with the address in hex (starting with
        "0x"), followed by pairs of lines (function name, "file:line"). An
        address can be followed by more than one pair, e.g. when it belongs
        to inlined code.

        Returns a list parallel to addrs. Each item is a list of
        (function, file, line) tuples, where file is '' and line is 0 when
        addr2line printed '?' for them.
        """
        results = []
        out_pos = 0
        out_count = len(output_lines)
        while len(results) < len(addrs) and out_pos < out_count:
            addr_line = output_lines[out_pos]
            out_pos += 1
            assert addr_line[:2] == "0x"
            assert out_pos < out_count
            assert addrs[len(results)] == int(addr_line, 16)
            frames = []
            while out_pos < out_count and output_lines[out_pos][:2] != "0x":
                function = output_lines[out_pos]
                out_pos += 1
                assert out_pos < out_count
                # Split on the last ':' only, the file name may contain ':'.
                file_name, line = output_lines[out_pos].rsplit(':', 1)
                line = line.split()[0]  # Remove comments after line number
                out_pos += 1
                if '?' in file_name:
                    file_name = ''
                line = 0 if '?' in line else int(line)
                frames.append((function, file_name, line))
            # Move to the next address only after all frames of the current
            # address have been consumed.
            results.append(frames)
        assert len(results) == len(addrs)
        return results


    def _get_file_id(self, file):
        """Return the id of file in self.file_list, adding it if needed."""
        file_id = self.file_dict.get(file)
        if file_id is None:
            file_id = len(self.file_list)
            self.file_list.append(file)
            self.file_dict[file] = file_id
        return file_id

    def _combine_source_files(self):
        """It is possible that addr2line gives us different names for the same
           file, like:
            /usr/local/.../src/main/jni/sudo-game-jni.cpp
            sudo-game-jni.cpp
           We'd better combine these two files. We can do it by combining
           source files with no conflicts in path.
        """
        # Collect files having the same filename.
        filename_dict = dict()
        for path in self.file_list:
            index = max(path.rfind('/'), path.rfind(os.sep))
            filename_dict.setdefault(path[index+1:], []).append(path)

        # Combine files having the same filename and having no conflicts in path.
        for files in filename_dict.values():
            if len(files) == 1:
                continue
            for path in files:
                to_file = path
                # Test if we can merge this file with another file having a
                # longer path.
                for candidate in files:
                    if len(candidate) > len(to_file) and path in candidate:
                        to_file = candidate
                if to_file != path:
                    # Only file_list is rewritten; file ids stay valid.
                    from_id = self.file_dict[path]
                    to_id = self.file_dict[to_file]
                    self.file_list[from_id] = self.file_list[to_id]


    def get_sources(self, dso_name, addr):
        """Return a list of SourceLine (with file names) for addr in dso_name.

        Returns [] if the dso or the address is unknown.
        """
        dso = self.dso_dict.get(dso_name)
        if dso is None:
            return []
        # Stored SourceLines hold file ids; translate them to file names.
        return [SourceLine(self.file_list[source.file], source.function, source.line)
                for source in dso.get(addr, [])]


    def _find_dso_path(self, dso):
        """Return the path of the binary for dso on the host, or None.

        Only names starting with '/' (except '//anon') are searched, first
        under symfs_dir if it is set, then at the path itself.
        """
        if not dso.startswith('/') or dso == '//anon':
            return None
        if self.symfs_dir:
            dso_path = os.path.join(self.symfs_dir, dso[1:])
            if os.path.isfile(dso_path):
                return dso_path
        if os.path.isfile(dso):
            return dso
        return None
198
199
class Period(object):
    """Event count of a line, a function, a source file, or a binary.

       It has two parts:
         period: the event count that occurred when running the item itself.
         acc_period: the accumulated event count that occurred when running
                     the item and the functions called by it.
    """
    def __init__(self, period=0, acc_period=0):
        self.period, self.acc_period = period, acc_period


    def __iadd__(self, other):
        """Add both counts of other to this object in place."""
        self.period += other.period
        self.acc_period += other.acc_period
        # Return self so that `a += b` keeps `a` bound to the same object.
        return self
218
219
class DsoPeriod(object):
    """Accumulated Period of one shared library, identified by dso_name."""
    def __init__(self, dso_name):
        self.dso_name = dso_name
        # Starts as an empty Period and grows through add_period().
        self.period = Period()


    def add_period(self, period):
        """Add period to the total of this shared library."""
        self.period += period
229
230
class FilePeriod(object):
    """Accumulated Period of one source file, plus per-line and per-function
       Periods inside that file.
    """
    def __init__(self, file):
        self.file = file
        self.period = Period()
        # Map from line number to the Period of that line.
        self.line_dict = {}
        # Map from function name to [function_start_line, Period].
        self.function_dict = {}


    def add_period(self, period):
        """Add period to the total of the whole file."""
        self.period += period


    def add_line_period(self, line, period):
        """Add period to the Period of line, creating the entry if needed."""
        line_period = self.line_dict.get(line)
        if line_period is None:
            line_period = Period()
            self.line_dict[line] = line_period
        line_period += period


    def add_function_period(self, function_name, function_start_line, period):
        """Add period to the Period of function_name.

        The start line is recorded only when the function is seen for the
        first time; a start line of None is stored as -1.
        """
        entry = self.function_dict.get(function_name)
        if not entry:
            start_line = -1 if function_start_line is None else function_start_line
            entry = [start_line, Period()]
            self.function_dict[function_name] = entry
        entry[1] += period
260
261
class SourceFileAnnotator(object):
    """Group code for annotating source files.

    config is a dict. The keys 'perf_data_list', 'symfs_dir', 'source_dirs',
    'annotate_dest_dir', 'comm_filters', 'pid_filters', 'tid_filters',
    'dso_filters' and 'addr2line_path' are required; 'kallsyms' is optional.
    Note that an existing 'annotate_dest_dir' directory is removed and
    recreated when the annotator is constructed.
    """
    def __init__(self, config):
        # check config variables
        config_names = ['perf_data_list', 'symfs_dir', 'source_dirs',
                        'annotate_dest_dir', 'comm_filters', 'pid_filters',
                        'tid_filters', 'dso_filters', 'addr2line_path']
        for name in config_names:
            if name not in config:
                log_fatal('config [%s] is missing' % name)
        symfs_dir = config['symfs_dir']
        if symfs_dir and not os.path.isdir(symfs_dir):
            log_fatal('[symfs_dir] "%s" is not a dir' % symfs_dir)
        # 'kallsyms' is not one of the required names, so it may be absent.
        kallsyms = config.get('kallsyms')
        if kallsyms and not os.path.isfile(kallsyms):
            log_fatal('[kallsyms] "%s" is not a file' % kallsyms)
        for source_dir in config['source_dirs']:
            if not os.path.isdir(source_dir):
                log_fatal('[source_dirs] "%s" is not a dir' % source_dir)

        # init member variables
        self.config = config
        self.symfs_dir = symfs_dir
        self.kallsyms = kallsyms
        # An empty filter list means "no filtering", stored as None.
        self.comm_filter = set(config['comm_filters']) if config.get('comm_filters') else None
        if config.get('pid_filters'):
            self.pid_filter = {int(x) for x in config['pid_filters']}
        else:
            self.pid_filter = None
        if config.get('tid_filters'):
            self.tid_filter = {int(x) for x in config['tid_filters']}
        else:
            self.tid_filter = None
        self.dso_filter = set(config['dso_filters']) if config.get('dso_filters') else None

        # Always start from an empty output directory.
        output_dir = config['annotate_dest_dir']
        if os.path.isdir(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir)

        self.addr2line = Addr2Line(self.config['addr2line_path'], symfs_dir)


    def annotate(self):
        """Run all steps: read samples, write the summary, annotate files."""
        self._collect_addrs()
        self._convert_addrs_to_lines()
        self._generate_periods()
        self._write_summary()
        self._collect_source_files()
        self._annotate_files()


    def _iter_samples(self):
        """Yield (sample, symbols) for each accepted sample in each perf.data.

        symbols starts with the symbol of the sample itself, followed by the
        symbols of its callchain entries in order. Samples rejected by
        _filter_sample() are skipped. The report lib of each file is closed
        when its samples are exhausted or when an error interrupts reading.
        """
        for perf_data in self.config['perf_data_list']:
            lib = ReportLib()
            try:
                lib.SetRecordFile(perf_data)
                if self.symfs_dir:
                    lib.SetSymfs(self.symfs_dir)
                if self.kallsyms:
                    lib.SetKallsymsFile(self.kallsyms)
                while True:
                    sample = lib.GetNextSample()
                    if sample is None:
                        break
                    if not self._filter_sample(sample):
                        continue
                    symbols = [lib.GetSymbolOfCurrentSample()]
                    callchain = lib.GetCallChainOfCurrentSample()
                    for i in range(callchain.nr):
                        symbols.append(callchain.entries[i].symbol)
                    yield sample, symbols
            finally:
                lib.Close()


    def _collect_addrs(self):
        """Read perf.data, collect all addresses we need to convert to
           source file:line.
        """
        for _, symbols in self._iter_samples():
            for symbol in symbols:
                if self._filter_symbol(symbol):
                    # Both the hit address and the start address of the
                    # function are needed (for lines and for functions).
                    self.addr2line.add_addr(symbol.dso_name, symbol.vaddr_in_file)
                    self.addr2line.add_addr(symbol.dso_name, symbol.symbol_addr)


    def _filter_sample(self, sample):
        """Return true if the sample can be used."""
        if self.comm_filter and sample.thread_comm not in self.comm_filter:
            return False
        if self.pid_filter and sample.pid not in self.pid_filter:
            return False
        if self.tid_filter and sample.tid not in self.tid_filter:
            return False
        return True


    def _filter_symbol(self, symbol):
        """Return true if the symbol belongs to a dso we are interested in."""
        return not self.dso_filter or symbol.dso_name in self.dso_filter


    def _convert_addrs_to_lines(self):
        """Convert all collected addresses to source lines."""
        self.addr2line.convert_addrs_to_lines()


    def _generate_periods(self):
        """read perf.data, collect Period for all types:
            binaries, source files, functions, lines.
        """
        self.period = 0
        self.dso_periods = dict()
        self.file_periods = dict()
        for sample, symbols in self._iter_samples():
            # Each sample has a callchain, but its period is only used once
            # to add period for each function/source_line/source_file/binary.
            # For example, if more than one entry in the callchain hits a
            # function, the event count of that function is only increased once.
            # Otherwise, we may get periods > 100%.
            is_sample_used = False
            used_dso_dict = dict()
            used_file_dict = dict()
            used_function_dict = dict()
            used_line_dict = dict()
            # The sampled symbol gets period and acc_period; symbols from
            # the callchain only get acc_period.
            self_period = Period(sample.period, sample.period)
            callchain_period = Period(0, sample.period)
            for i, symbol in enumerate(symbols):
                period = self_period if i == 0 else callchain_period
                if not self._filter_symbol(symbol):
                    continue
                is_sample_used = True
                # Add period to dso.
                self._add_dso_period(symbol.dso_name, period, used_dso_dict)
                # Add period to source file.
                sources = self.addr2line.get_sources(symbol.dso_name, symbol.vaddr_in_file)
                for source in sources:
                    if source.file:
                        self._add_file_period(source, period, used_file_dict)
                        # Add period to line.
                        if source.line:
                            self._add_line_period(source, period, used_line_dict)
                # Add period to function.
                sources = self.addr2line.get_sources(symbol.dso_name, symbol.symbol_addr)
                for source in sources:
                    if source.file:
                        self._add_file_period(source, period, used_file_dict)
                        if source.function:
                            self._add_function_period(source, period, used_function_dict)

            if is_sample_used:
                self.period += sample.period


    def _add_dso_period(self, dso_name, period, used_dso_dict):
        """Add period to dso_name, at most once per sample (used_dso_dict)."""
        if dso_name in used_dso_dict:
            return
        used_dso_dict[dso_name] = True
        dso_period = self.dso_periods.get(dso_name)
        if dso_period is None:
            dso_period = self.dso_periods[dso_name] = DsoPeriod(dso_name)
        dso_period.add_period(period)


    def _add_file_period(self, source, period, used_file_dict):
        """Add period to the file of source, at most once per sample."""
        if source.file_key in used_file_dict:
            return
        used_file_dict[source.file_key] = True
        file_period = self.file_periods.get(source.file)
        if file_period is None:
            file_period = self.file_periods[source.file] = FilePeriod(source.file)
        file_period.add_period(period)


    def _add_line_period(self, source, period, used_line_dict):
        """Add period to the line of source, at most once per sample.

        The FilePeriod of source.file must already exist.
        """
        if source.line_key in used_line_dict:
            return
        used_line_dict[source.line_key] = True
        file_period = self.file_periods[source.file]
        file_period.add_line_period(source.line, period)


    def _add_function_period(self, source, period, used_function_dict):
        """Add period to the function of source, at most once per sample.

        The FilePeriod of source.file must already exist.
        """
        if source.function_key in used_function_dict:
            return
        used_function_dict[source.function_key] = True
        file_period = self.file_periods[source.file]
        file_period.add_function_period(source.function, source.line, period)


    def _write_summary(self):
        """Write file 'summary' in annotate_dest_dir.

        It lists the total period, then dsos and files sorted by descending
        accumulated period, then for each file its functions (sorted the
        same way) and its lines (sorted by line number).
        """
        def by_acc_period(item):
            return item.period.acc_period

        summary = os.path.join(self.config['annotate_dest_dir'], 'summary')
        with open(summary, 'w') as f:
            f.write('total period: %d\n\n' % self.period)
            # sorted() is stable with reverse=True, so items having the same
            # acc_period keep their original relative order.
            dso_periods = sorted(self.dso_periods.values(),
                                 key=by_acc_period, reverse=True)
            for dso_period in dso_periods:
                f.write('dso %s: %s\n' % (dso_period.dso_name,
                                          self._get_percentage_str(dso_period.period)))
            f.write('\n')

            file_periods = sorted(self.file_periods.values(),
                                  key=by_acc_period, reverse=True)
            for file_period in file_periods:
                f.write('file %s: %s\n' % (file_period.file,
                                           self._get_percentage_str(file_period.period)))
            for file_period in file_periods:
                f.write('\n\n%s: %s\n' % (file_period.file,
                                          self._get_percentage_str(file_period.period)))
                values = []
                for func_name in file_period.function_dict.keys():
                    func_start_line, period = file_period.function_dict[func_name]
                    values.append((func_name, func_start_line, period))
                values = sorted(values, key=lambda x: x[2].acc_period, reverse=True)
                for value in values:
                    f.write('\tfunction (%s): line %d, %s\n' % (
                        value[0], value[1], self._get_percentage_str(value[2])))
                f.write('\n')
                for line in sorted(file_period.line_dict.keys()):
                    f.write('\tline %d: %s\n' % (
                        line, self._get_percentage_str(file_period.line_dict[line])))


    def _get_percentage_str(self, period, short=False):
        """Format period as percentages of the total period."""
        s = 'acc_p: %f%%, p: %f%%' if short else 'accumulated_period: %f%%, period: %f%%'
        return s % self._get_percentage(period)


    def _get_percentage(self, period):
        """Return (acc_period, period) as percentages of the total period.

        Returns (0, 0) when the total period is 0.
        """
        if self.period == 0:
            return (0, 0)
        acc_p = 100.0 * period.acc_period / self.period
        p = 100.0 * period.period / self.period
        return (acc_p, p)


    def _collect_source_files(self):
        """Build self.source_file_dict, a map from file name to the paths of
           all files with that name found under source_dirs.
        """
        self.source_file_dict = dict()
        source_file_suffix = ['h', 'c', 'cpp', 'cc', 'java']
        for source_dir in self.config['source_dirs']:
            for root, _, files in os.walk(source_dir):
                for filename in files:
                    if filename[filename.rfind('.')+1:] in source_file_suffix:
                        entry = self.source_file_dict.get(filename)
                        if entry is None:
                            entry = self.source_file_dict[filename] = []
                        entry.append(os.path.join(root, filename))


    def _find_source_file(self, file):
        """Return the path of a collected source file containing file in
           its path, or None if there is none.

        When several collected files match, the last one is returned and a
        warning is logged.
        """
        filename = file[file.rfind(os.sep)+1:]
        source_files = self.source_file_dict.get(filename)
        if source_files is None:
            return None
        match_count = 0
        result = None
        for path in source_files:
            if file in path:
                match_count += 1
                result = path
        if match_count > 1:
            log_warning('multiple source for %s, select %s' % (file, result))
        return result


    def _annotate_files(self):
        """Annotate Source files: add acc_period/period for each source file.
           1. Annotate java source files, which have $JAVA_SRC_ROOT prefix.
           2. Annotate c++ source files.
        """
        dest_dir = self.config['annotate_dest_dir']
        java_prefix = '$JAVA_SRC_ROOT/'
        for key in self.file_periods.keys():
            is_java = False
            if key.startswith(java_prefix):
                path = key[len(java_prefix):]
                items = path.split('/')
                path = os.sep.join(items)
                from_path = self._find_source_file(path)
                to_path = os.path.join(dest_dir, 'java', path)
                is_java = True
            elif key.startswith('/') and os.path.isfile(key):
                # The file exists on the host at the recorded absolute path.
                path = key
                from_path = path
                to_path = os.path.join(dest_dir, path[1:])
            else:
                path = key[1:] if key.startswith('/') else key
                # Change path on device to path on host
                path = os.sep.join(path.split('/'))
                from_path = self._find_source_file(path)
                to_path = os.path.join(dest_dir, path)
            if from_path is None:
                log_warning("can't find source file for path %s" % key)
                continue
            self._annotate_file(from_path, to_path, self.file_periods[key], is_java)


    def _annotate_file(self, from_path, to_path, file_period, is_java):
        """Annotate a source file.

        Annotate a source file in three steps:
          1. In the first line, show periods of this file.
          2. For each function, show periods of this function.
          3. For each line not hitting the same line as functions, show
             line periods.

        The annotated copy of from_path is written to to_path; each line is
        prefixed by a comment of fixed width, or by spaces of the same width.
        """
        log_info('annotate file %s' % from_path)
        with open(from_path, 'r') as rf:
            lines = rf.readlines()

        # Map from line number (starting from 1) to annotation text. Later
        # assignments override earlier ones: function over line, file over
        # both.
        annotates = dict()
        for line in file_period.line_dict.keys():
            annotates[line] = self._get_percentage_str(file_period.line_dict[line], True)
        for func_name in file_period.function_dict.keys():
            func_start_line, period = file_period.function_dict[func_name]
            if func_start_line == -1:
                continue
            # For java, the function annotation is put one line earlier.
            line = func_start_line - 1 if is_java else func_start_line
            annotates[line] = '[func] ' + self._get_percentage_str(period, True)
        annotates[1] = '[file] ' + self._get_percentage_str(file_period.period, True)

        max_annotate_cols = max(len(text) for text in annotates.values())

        # 6 is the width of the comment markers '/* ' and ' */'.
        empty_annotate = ' ' * (max_annotate_cols + 6)

        dirname = os.path.dirname(to_path)
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        with open(to_path, 'w') as wf:
            for line_number, content in enumerate(lines, 1):
                annotate = annotates.get(line_number)
                if annotate is None:
                    annotate = empty_annotate
                else:
                    annotate = '/* ' + annotate + (
                        ' ' * (max_annotate_cols - len(annotate))) + ' */'
                wf.write(annotate)
                wf.write(content)
616
617
if __name__ == '__main__':
    # Script entry point: read the configuration file given by --config and
    # annotate source files according to it.
    arg_parser = argparse.ArgumentParser(
        description='Annotate based on perf.data. See configurations in annotate.config.')
    arg_parser.add_argument(
        '--config', default='annotate.config',
        help='Set configuration file. Default is annotate.config.')
    options = arg_parser.parse_args()
    SourceFileAnnotator(load_config(options.config)).annotate()
627