3# Copyright 2013 Google Inc. All Rights Reserved.
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
9#     http://www.apache.org/licenses/LICENSE-2.0
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
17import fileinput
18import operator
19import optparse
20import os
21import pprint
22import re
23import subprocess
24import sys
25import json
def format_bytes(bytes):
    """Render a byte count as a short human-readable string.

    Counts strictly above one million are shown in millions with an 'm'
    suffix, counts strictly above one thousand in thousands with a 'k'
    suffix (one decimal place each).  Anything else is returned through
    str() unchanged.
    """
    # Largest unit first so that the first matching threshold wins.
    for threshold, unit in ((1e6, 'm'), (1e3, 'k')):
        if bytes > threshold:
            return '%.1f%s' % (bytes / threshold, unit)
    return str(bytes)
def symbol_type_to_human(type):
    """Translate a lower-case nm symbol type letter into a readable label.

    Raises KeyError for any letter that is not one of b, d, r, t, u, w, v.
    """
    # 'u' (unique global), 'w' and 'v' are all reported as weak symbols.
    labels = dict.fromkeys(('u', 'w', 'v'), 'weak symbol')
    labels.update(b='bss', d='data', r='read-only data', t='code')
    return labels[type]
def parse_nm(input):
    """Parse nm output.

    Argument: an iterable over lines of nm output.

    Yields: (symbol name, symbol type, symbol size, source file path).
    The type is the lower-cased nm type letter, with 'u' and 'v' folded
    into 'w'; BSS ('b') symbols are skipped.  Path may be None if nm
    couldn't figure out the source file.

    Lines without a size (address-only symbols and undefined/weak
    externals) are ignored silently.  Any other line that cannot be
    parsed is reported on stderr and skipped.
    """

    # Match lines with size + symbol + optional filename.
    sym_re = re.compile(r'^[0-9a-f]+ ([0-9a-f]+) (.) ([^\t]+)(?:\t(.*):\d+)?$')
    # Match lines with addr but no size.
    addr_re = re.compile(r'^[0-9a-f]+ (.) ([^\t]+)(?:\t.*)?$')
    # Match lines that don't have an address at all -- typically external symbols.
    noaddr_re = re.compile(r'^ + (.) (.*)$')

    for line in input:
        line = line.rstrip()

        match = sym_re.match(line)
        if match:
            size, sym_type, sym, path = match.groups()
            sym_type = sym_type.lower()
            if sym_type in ('u', 'v'):
                sym_type = 'w'  # just call them all weak
            if sym_type == 'b':
                continue  # skip all BSS for now
            yield sym, sym_type, int(size, 16), path
            continue

        if addr_re.match(line):
            # No size == we don't care.
            continue

        match = noaddr_re.match(line)
        if match and match.group(1) in ('U', 'w'):
            # external or weak symbol
            continue

        # Written with write() rather than the Python 2 only
        # "print >>stream" form so the diagnostic works on any interpreter.
        sys.stderr.write('unparsed: %r\n' % (line,))
def demangle(ident, cppfilt):
    """Return ident run through the cppfilt program when that applies.

    Only identifiers beginning with '_Z' (the prefix shared by mangled
    names) are handed to cppfilt, and its stripped output is returned.
    When cppfilt is falsy or the prefix is absent, ident comes back
    untouched and no process is started.
    """
    if not cppfilt or not ident.startswith('_Z'):
        return ident
    filtered = subprocess.check_output([cppfilt, ident])
    return filtered.strip()
class Suffix:
    """One symbol suffix to recognise, with its pre-compiled regex.

    The compiled pattern captures the text before the suffix, whatever
    groups the suffix expression itself defines, and the text after it.
    `replacement` is the label stored alongside the pattern.
    """
    def __init__(self, suffix, replacement):
        self.replacement = replacement
        self.pattern = ''.join(('^(.*)', suffix, '(.*)$'))
        self.re = re.compile(self.pattern)
class SuffixCleanup:
    """Pre-compile suffix regular expressions."""
    def __init__(self):
        # Raw strings keep the backslashes literal without relying on
        # Python tolerating unknown escape sequences such as '\.'.
        self.suffixes = [
            Suffix(r'\.part\.([0-9]+)',      'part'),
            Suffix(r'\.constprop\.([0-9]+)', 'constprop'),
            Suffix(r'\.isra\.([0-9]+)',      'isra'),
        ]

    def cleanup(self, ident, cppfilt):
        """Cleanup identifiers that have suffixes preventing demangling,
           and demangle if possible.

        Each known suffix is looked for once, in list order.  A match is
        cut out of ident (the last occurrence, since the leading group is
        greedy) and remembered as ' [<label>.<number>]'.  If anything was
        removed, the remainder is passed to demangle() with cppfilt, and
        the remembered tags are appended in the order they were found.
        Identifiers without any known suffix are returned unchanged.
        """
        to_append = []
        for s in self.suffixes:
            found = s.re.match(ident)
            if not found:
                continue
            to_append.append(' [' + s.replacement + '.' + found.group(2) + ']')
            ident = found.group(1) + found.group(3)
        if to_append:
            # Only try to demangle if there were suffixes.
            ident = demangle(ident, cppfilt)
        return ident + ''.join(to_append)
# Shared module-level instance, so the suffix regular expressions are
# compiled once instead of on every parse_cpp_name() call.
suffix_cleanup = SuffixCleanup()
def parse_cpp_name(name, cppfilt):
    """Split a C++ symbol name into its namespace/class/function parts.

    The name first goes through suffix_cleanup.cleanup() (which may
    demangle it with cppfilt).  Descriptive prefixes such as 'vtable for '
    are then moved to the end as bracketed tags, and the leading return
    types 'bool ', 'int ' and 'void ' are dropped.  The result is split
    on '::', keeping template argument lists attached to their name.

    Splitting stops at the first operator overload, function (anything
    with an argument list) or plain identifier, which becomes the last
    part.  For example 'ns::Class<T>::method(int)' yields
    ['ns', 'Class<T>', 'method(int)'].

    Returns: list of non-empty strings (empty list for an empty name).
    """
    name = suffix_cleanup.cleanup(name, cppfilt)

    # Turn prefixes into suffixes so namespacing works.
    prefixes = [
        ['bool ',                         ''],
        ['construction vtable for ',      ' [construction vtable]'],
        ['global constructors keyed to ', ' [global constructors]'],
        ['guard variable for ',           ' [guard variable]'],
        ['int ',                          ''],
        ['non-virtual thunk to ',         ' [non-virtual thunk]'],
        ['typeinfo for ',                 ' [typeinfo]'],
        ['typeinfo name for ',            ' [typeinfo name]'],
        ['virtual thunk to ',             ' [virtual thunk]'],
        ['void ',                         ''],
        ['vtable for ',                   ' [vtable]'],
        ['VTT for ',                      ' [VTT]'],
    ]
    for prefix, replacement in prefixes:
        if name.startswith(prefix):
            name = name[len(prefix):] + replacement
    # Simplify parenthesis parsing.
    replacements = [
        ['(anonymous namespace)', '[anonymous namespace]'],
    ]
    for value, replacement in replacements:
        name = name.replace(value, replacement)

    def parse_one(val):
        """Returns (leftmost-part, remaining)."""
        # 'operator' followed by a non-identifier character is an operator
        # overload.  The length check keeps a symbol that is literally
        # named 'operator' from indexing past the end of the string.
        if (val.startswith('operator') and len(val) > 8 and
                not (val[8].isalnum() or val[8] == '_')):
            # Operator overload function, terminate.
            return (val, '')
        co = val.find('::')
        lt = val.find('<')
        pa = val.find('(')
        co = len(val) if co == -1 else co
        lt = len(val) if lt == -1 else lt
        pa = len(val) if pa == -1 else pa
        if co < lt and co < pa:
            # Namespace or type name.
            return (val[:co], val[co+2:])
        if lt < pa:
            # Template. Make sure we capture nested templates too: walk
            # forward from the first '<' until its matching '>' is found.
            open_tmpl = 0
            gt = lt
            end = len(val)
            while gt < end:
                if val[gt] == '<':
                    open_tmpl += 1
                elif val[gt] == '>':
                    open_tmpl -= 1
                    if open_tmpl == 0:
                        break
                gt += 1
            else:
                # Unbalanced '<': no sensible split point exists, so keep
                # the rest as one terminal part.
                return (val, '')
            ret = val[gt+1:]
            if ret.startswith('::'):
                ret = ret[2:]
            if ret.startswith('('):
                # Template function, terminate.
                return (val, '')
            return (val[:gt+1], ret)
        # Terminate with any function name, identifier, or unmangled name.
        return (val, '')

    parts = []
    while len(name) > 0:
        (part, name) = parse_one(name)
        assert len(part) > 0
        parts.append(part)
    return parts
def treeify_syms(symbols, strip_prefix=None, cppfilt=None):
    """Group symbols into a nested dict keyed by namespace or source path.

    Arguments:
      symbols: iterable of (name, type, size, path) tuples, the shape
        yielded by parse_nm().
      strip_prefix: optional leading string removed from source paths.
      cppfilt: passed through to parse_cpp_name() and demangle().

    Each symbol name is split with parse_cpp_name().  A name with a single
    part (no namespaces) is filed under '[path]/<source dirs...>' when a
    path is known, otherwise under '[ungrouped]' (with a sub-group for a
    few compiler-generated name prefixes).

    Returns: a dict tree.  Inner nodes are dicts holding their children
    plus a '$bloat_symbols' entry mapping symbol type to the number of
    symbols filed beneath the node.  Leaves are
    (total size, {type: count}) tuples.
    """
    dirs = {}
    for sym, sym_type, size, path in symbols:
        if path:
            path = os.path.normpath(path)
            if strip_prefix and path.startswith(strip_prefix):
                path = path[len(strip_prefix):]
            # Drop every leading separator.  A strip_prefix given without a
            # trailing slash, or a path normalised to '//x', would otherwise
            # leave an empty first component and trip the assert below.
            path = ['[path]'] + path.lstrip('/').split('/')

        parts = parse_cpp_name(sym, cppfilt)
        if len(parts) == 1:
            if path:
                # No namespaces, group with path.
                parts = path + parts
            else:
                new_prefix = ['[ungrouped]']
                regroups = [
                    ['.L.str',                 '[str]'],
                    ['.L__PRETTY_FUNCTION__.', '[__PRETTY_FUNCTION__]'],
                    ['.L__func__.',            '[__func__]'],
                    ['.Lswitch.table',         '[switch table]'],
                ]
                for prefix, group in regroups:
                    if parts[0].startswith(prefix):
                        parts[0] = demangle(parts[0][len(prefix):], cppfilt)
                        new_prefix += [group]
                        break
                parts = new_prefix + parts

        key = parts.pop()
        tree = dirs
        try:
            for part in parts:
                assert part != '', path
                if part not in tree:
                    tree[part] = {'$bloat_symbols': {}}
                counts = tree[part]['$bloat_symbols']
                counts[sym_type] = counts.get(sym_type, 0) + 1
                tree = tree[part]
            # Symbols sharing the same final key are merged into one leaf.
            old_size, old_symbols = tree.get(key, (0, {}))
            old_symbols[sym_type] = old_symbols.get(sym_type, 0) + 1
            tree[key] = (old_size + size, old_symbols)
        except Exception:
            # Say which symbol broke the tree before propagating the error.
            sys.stderr.write('sym `%s`\tparts `%s`\tkey `%s`\n'
                             % (sym, parts, key))
            raise
    return dirs
def jsonify_tree(tree, name):
    """Convert a treeify_syms() tree into a JSON-serialisable treemap node.

    Arguments:
      tree: dict whose values are sub-dicts or (size, {type: count}) leaf
        tuples; a '$bloat_symbols' entry, if present, is not a child.
      name: label for this node.

    Returns: a dict with 'name' (label plus formatted total size), 'data'
    ('$area' = summed size of all descendants, '$dominant_symbol' = the
    readable name of the most frequent symbol type below this node, or ''
    when the node carries no '$bloat_symbols') and 'children' sorted by
    decreasing '$area'.

    Raises AssertionError if a leaf was recorded with more than one
    symbol type.
    """
    children = []
    total = 0

    for key, val in tree.items():
        if key == '$bloat_symbols':
            continue
        if isinstance(val, dict):
            subtree = jsonify_tree(val, key)
            total += subtree['data']['$area']
            children.append(subtree)
        else:
            (size, symbols) = val
            total += size
            assert len(symbols) == 1, (
                'symbol %r was recorded with several types: %r'
                % (key, symbols))
            # Single-entry dict: its only key is the symbol type.
            symbol = symbol_type_to_human(next(iter(symbols)))
            children.append({
                    'name': key + ' ' + format_bytes(size),
                    'data': {
                        '$area': size,
                        '$symbol': symbol,
                    }
            })

    children.sort(key=lambda child: -child['data']['$area'])
    dominant_symbol = ''
    if '$bloat_symbols' in tree:
        dominant_symbol = symbol_type_to_human(
            max(tree['$bloat_symbols'].items(),
                key=operator.itemgetter(1))[0])
    return {
        'name': name + ' ' + format_bytes(total),
        'data': {
            '$area': total,
            '$dominant_symbol': dominant_symbol,
            },
        'children': children,
        }
def dump_nm(nmfile, strip_prefix, cppfilt):
    """Print the symbol treemap for nmfile as a JavaScript assignment.

    The lines of nmfile are parsed with parse_nm(), grouped with
    treeify_syms() and converted with jsonify_tree() under the root label
    '[everything]'.  The JSON is written to stdout after 'var kTree = '.
    """
    grouped = treeify_syms(parse_nm(nmfile), strip_prefix, cppfilt)
    root = jsonify_tree(grouped, '[everything]')
    print ('var kTree = ' + json.dumps(root, indent=2))
def parse_objdump(input):
    """Parse objdump -h output.

    Argument: an iterable over lines of objdump -h output.

    Returns: (sections, debug_sections), two lists of (name, size) tuples
    in input order.  A leading '.' is dropped from every section name.
    Sections whose name then starts with 'debug_' go into debug_sections
    with that prefix removed as well.  Lines that are not section rows
    are ignored.
    """
    # "<index> <name> <hex size>".  The lookahead makes the size a complete
    # whitespace-delimited hex token, so an ordinary word in that position
    # is skipped instead of reaching int(..., 16) and raising ValueError.
    sec_re = re.compile(r'^\d+ (\S+) +([0-9a-f]+)(?!\S)')
    sections = []
    debug_sections = []

    for line in input:
        match = sec_re.match(line.strip())
        if not match:
            continue
        name, size = match.groups()
        size = int(size, 16)
        if name.startswith('.'):
            name = name[1:]
        if name.startswith('debug_'):
            debug_sections.append((name[len('debug_'):], size))
        else:
            sections.append((name, size))
    return sections, debug_sections
def jsonify_sections(name, sections):
    """Build a treemap node for a flat list of (section name, size) pairs.

    Returns a dict with 'name' (label plus formatted total size), 'data'
    holding the summed '$area', and 'children' with one entry per
    section, ordered by decreasing size.
    """
    children = [
        {
            'name': section + ' ' + format_bytes(size),
            'data': { '$area': size }
        }
        for section, size in sections
    ]
    # Add the sizes up in input order, before the list is re-ordered.
    total = sum(child['data']['$area'] for child in children)
    children.sort(key=lambda child: -child['data']['$area'])

    node = {
        'name': name + ' ' + format_bytes(total),
        'data': { '$area': total },
        'children': children
    }
    return node
def dump_sections(objdump):
    """Print the section-size treemap as a JavaScript assignment.

    objdump is an iterable over lines of objdump -h output.  The sections
    are split into regular and debug sections by parse_objdump(), each
    group is converted with jsonify_sections(), and both are placed under
    a 'top' node whose area is their combined size.  The JSON is written
    to stdout after 'var kTree = '.
    """
    sections, debug_sections = parse_objdump(objdump)
    sections = jsonify_sections('sections', sections)
    debug_sections = jsonify_sections('debug', debug_sections)
    size = sections['data']['$area'] + debug_sections['data']['$area']
    # Parenthesised single-argument print, the same form dump_nm uses: it
    # prints the identical text under Python 2 and also parses as a
    # function call.
    print ('var kTree = ' + json.dumps({
            'name': 'top ' + format_bytes(size),
            'data': { '$area': size },
            'children': [ debug_sections, sections ]}))
# Help text handed to optparse; optparse substitutes the program name for
# %prog when printing it.
usage = """%prog [options] MODE

Modes are:
  syms: output symbols json suitable for a treemap
  dump: print symbols sorted by size (pipe to head for best output)
  sections: output binary sections json suitable for a treemap

nm output passed to --nm-output should be from running a command
like the following (note, can take a long time -- 30 minutes):
  nm -C -S -l /path/to/binary > nm.out

objdump output passed to --objdump-output should be from a command
like:
  objdump -h /path/to/binary > objdump.out"""
# Command-line definition.  Options are registered in the order in which
# they should appear in --help.
parser = optparse.OptionParser(usage=usage)
parser.add_option(
    '--nm-output',
    dest='nmpath', action='store', default='nm.out', metavar='PATH',
    help='path to nm output [default=nm.out]')
parser.add_option(
    '--objdump-output',
    dest='objdumppath', action='store', default='objdump.out',
    metavar='PATH',
    help='path to objdump output [default=objdump.out]')
parser.add_option(
    '--strip-prefix',
    action='store', metavar='PATH',
    help='strip PATH prefix from paths; e.g. /path/to/src/root')
parser.add_option(
    '--filter',
    action='store',
    help='include only symbols/files matching FILTER')
parser.add_option(
    '--c++filt',
    dest='cppfilt', action='store', default='c++filt', metavar='PATH',
    help=("Path to c++filt, used to demangle symbols that weren't handled "
          "by nm. Set to an invalid path to disable."))
# Parses sys.argv[1:]; positional arguments end up in args.
opts, args = parser.parse_args()
# Exactly one positional argument (the mode) is required.
if len(args) != 1:
    parser.print_usage()
    sys.exit(1)

mode = args[0]
if mode == 'syms':
    with open(opts.nmpath, 'r') as nmfile:
        # Probe c++filt once up front: an unmangled name must come back
        # unchanged.  If the tool is missing or misbehaves, warn and carry
        # on without demangling rather than failing part-way through.
        try:
            res = subprocess.check_output([opts.cppfilt, 'main'])
        except (OSError, ValueError, subprocess.CalledProcessError):
            sys.stderr.write("Could not find c++filt at %s, "
                             "output won't be demangled.\n" % opts.cppfilt)
            opts.cppfilt = None
        else:
            if res.strip() != 'main':
                sys.stderr.write("%s failed demangling, "
                                 "output won't be demangled.\n" % opts.cppfilt)
                opts.cppfilt = None
        dump_nm(nmfile, strip_prefix=opts.strip_prefix, cppfilt=opts.cppfilt)
elif mode == 'sections':
    with open(opts.objdumppath, 'r') as objdumpfile:
        dump_sections(objdumpfile)
elif mode == 'dump':
    with open(opts.nmpath, 'r') as nmfile:
        syms = list(parse_nm(nmfile))
    # a list of (sym, type, size, path); sort by size.
    syms.sort(key=lambda x: -x[2])
    total = 0
    for sym, sym_type, size, path in syms:
        if sym_type in ('b', 'w'):
            continue  # skip bss and weak symbols
        if path is None:
            path = ''
        # --filter is a plain substring test on the symbol name or path.
        if opts.filter and not (opts.filter in sym or opts.filter in path):
            continue
        print ('%6s %s (%s) %s' % (format_bytes(size), sym,
                                   symbol_type_to_human(sym_type), path))
        total += size
    print ('%6s %s' % (format_bytes(total), 'total'))
else:
    # Only reached for a mode that is none of syms / sections / dump.
    print ('unknown mode')
    parser.print_usage()