1#!/usr/bin/env python
2#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4#                     The LLVM Compiler Infrastructure
5#
6# This file is distributed under the University of Illinois Open Source
7# License. See LICENSE.TXT for details.
8#
9#===------------------------------------------------------------------------===#
10import argparse
11import bisect
12import getopt
13import os
14import re
15import subprocess
16import sys
17
# Cache of ChainSymbolizer objects, keyed by the path of the binary they
# symbolize (filled in by SymbolizationLoop.symbolize_address).
symbolizers = {}
# When True, the symbolizer command lines and their input are printed.
DEBUG = False
# Whether function names should be demangled (set by -d/--demangle).
demangle = False
# Prefix prepended to the 'addr2line' tool name (set by -c), or None.
binutils_prefix = None
# Path prepended to binary names by sysroot_path_filter (set by -s), or None.
sysroot_path = None
# Callable used to rewrite binary names before symbolization, or None.
binary_name_filter = None
# List of regex patterns that fix_filename cuts from file paths, or None.
fix_filename_patterns = None
# File object the log to symbolize is read from.
logfile = sys.stdin
# When False, symbolize_address raises instead of falling back to the
# system symbolizer (addr2line/atos).
allow_system_symbolizer = True
27
28# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Shorten and normalize a source location string for display.

  Args:
      file_name: source location (e.g. 'path/to/file.cc:12') as printed by a
          symbolizer.
  Returns:
      file_name with everything up to and including a match of each pattern
      in fix_filename_patterns removed, locations inside asan_*.cc files
      replaced by '_asan_rtl_' and 'crtstuff.c:0' locations replaced by
      '???:0'.
  """
  if fix_filename_patterns:
    for path_to_cut in fix_filename_patterns:
      # path_to_cut is used as a regular expression, not as a literal string.
      file_name = re.sub('.*' + path_to_cut, '', file_name)
  # The dots in front of the file extensions are escaped so that they match
  # only a literal '.', not an arbitrary character.
  file_name = re.sub(r'.*asan_[a-z_]*\.cc:[0-9]*', '_asan_rtl_', file_name)
  file_name = re.sub(r'.*crtstuff\.c:0', '???:0', file_name)
  return file_name
36
def sysroot_path_filter(binary_name):
  """Return binary_name with the global sysroot_path prepended to it."""
  return ''.join((sysroot_path, binary_name))
39
def guess_arch(addr):
  """Guess the architecture from the textual width of an address.

  Args:
      addr: address string such as '0x7f6e35cf2e45'.
  Returns:
      'x86_64' when addr is longer than '0x' plus 8 hex digits, else 'i386'.
  """
  max_32bit_addr_len = len('0x') + 8
  return 'x86_64' if len(addr) > max_32bit_addr_len else 'i386'
46
class Symbolizer(object):
  """Base class of all symbolizers; it does not symbolize anything itself."""

  def __init__(self):
    """Nothing to set up in the base class."""

  def symbolize(self, addr, binary, offset):
    """Describe the code location of one instruction.

    Subclasses override this method; the base implementation always fails.

    Args:
        addr: virtual address of an instruction.
        binary: path to the executable/shared object that contains the
            instruction.
        offset: offset of the instruction inside @binary.
    Returns:
        A list of strings, one per (possibly inlined) frame, each giving the
        function name, file name, line and column numbers of the code
        location. The base class returns None.
    """
    return None
65
66
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that queries a long-running llvm-symbolizer child process."""

  def __init__(self, symbolizer_path, default_arch, system, dsym_hints=None):
    """Start the llvm-symbolizer process.

    Args:
        symbolizer_path: path (or name looked up in PATH) of llvm-symbolizer.
        default_arch: value passed as --default-arch.
        system: OS name; .dSYM hints are only forwarded when it is 'Darwin'.
        dsym_hints: optional iterable of --dsym-hint values.
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    self.default_arch = default_arch
    self.system = system
    # None replaces the former mutable [] default so that no list object is
    # shared between instances.
    self.dsym_hints = dsym_hints if dsym_hints is not None else []
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Launch llvm-symbolizer.

    Returns:
        The subprocess.Popen object, or None if the process could not be
        started.
    """
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=%s' % demangle,
           '--functions=linkage',
           '--inlining=true',
           '--default-arch=%s' % self.default_arch]
    if self.system == 'Darwin':
      for hint in self.dsym_hints:
        cmd.append('--dsym-hint=%s' % hint)
    if DEBUG:
      print(' '.join(cmd))
    try:
      # universal_newlines=True makes the pipes text streams, so the same
      # str-based code works on Python 2 and Python 3.
      result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
    except OSError:
      result = None
    return result

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        A list of '<addr> in <function> <file>' strings, or None when the
        symbolizer is not running, fails, or only reports unknown ('??')
        frames.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '"%s" %s' % (binary, offset)
      if DEBUG:
        print(symbolizer_input)
      self.pipe.stdin.write(symbolizer_input + '\n')
      # Flush explicitly: the pipe may be buffered, and the child only
      # answers once it has received the whole request line.
      self.pipe.stdin.flush()
      while True:
        # The reply is a sequence of (function, file) line pairs ended by
        # an empty line.
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') or
            not file_name.startswith('??')):
          # Append only non-trivial frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception:
      # Best effort: any failure makes the caller fall back to the next
      # symbolizer in the chain.
      result = []
    if not result:
      result = None
    return result
121
122
def LLVMSymbolizerFactory(system, default_arch, dsym_hints=None):
  """Create an LLVMSymbolizer, locating the llvm-symbolizer binary.

  The binary path is taken from LLVM_SYMBOLIZER_PATH, then from
  ASAN_SYMBOLIZER_PATH; when neither is set, 'llvm-symbolizer' is assumed to
  be in PATH.

  Args:
      system: OS name passed on to LLVMSymbolizer.
      default_arch: default architecture passed on to LLVMSymbolizer.
      dsym_hints: optional iterable of .dSYM hints passed on to
          LLVMSymbolizer.
  Returns:
      A new LLVMSymbolizer.
  """
  if dsym_hints is None:
    # Avoid a mutable default argument; always hand a real (empty)
    # collection to LLVMSymbolizer.
    dsym_hints = []
  symbolizer_path = (os.getenv('LLVM_SYMBOLIZER_PATH') or
                     os.getenv('ASAN_SYMBOLIZER_PATH') or
                     'llvm-symbolizer')
  return LLVMSymbolizer(symbolizer_path, default_arch, system, dsym_hints)
131
132
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer backed by one addr2line process bound to a single binary."""

  def __init__(self, binary):
    """Start addr2line for @binary."""
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()
    # Bogus address sent after every query; addr2line's reply to it marks
    # the end of the (possibly multi-frame) reply to the real query.
    self.output_terminator = -1

  def open_addr2line(self):
    """Launch addr2line (honoring binutils_prefix and demangle).

    Returns:
        The subprocess.Popen object of the running tool.
    """
    addr2line_tool = 'addr2line'
    if binutils_prefix:
      addr2line_tool = binutils_prefix + addr2line_tool
    cmd = [addr2line_tool, '-fi']
    if demangle:
      cmd += ['--demangle']
    cmd += ['-e', self.binary]
    if DEBUG:
      print(' '.join(cmd))
    # universal_newlines=True makes the pipes text streams, so the same
    # str-based code works on Python 2 and Python 3.
    return subprocess.Popen(cmd,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            universal_newlines=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary this symbolizer was created for,
        otherwise a non-empty list of '<addr> in <function> <file>' strings
        ('?? ??:0' when addr2line could not be queried).
    """
    if self.binary != binary:
      return None
    lines = []
    try:
      self.pipe.stdin.write('%s\n%s\n' % (offset, self.output_terminator))
      self.pipe.stdin.flush()
      is_first_frame = True
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        file_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          # EOF: addr2line is gone, there is nothing more to read.
          break
        if is_first_frame:
          # The first frame is always kept, even when it is unknown ('??').
          is_first_frame = False
        elif function_name == '??':
          # Reply to the terminator address ('??' / '??:0'): end of output.
          # It must not be reported as an additional frame.
          break
        lines.append((function_name, file_name))
    except Exception:
      lines.append(('??', '??:0'))
    if not lines:
      # Callers expect the system symbolizer to always produce a result.
      lines.append(('??', '??:0'))
    return ['%s in %s %s' % (addr, function, fix_filename(file))
            for (function, file) in lines]
174
class UnbufferedLineConverter(object):
  """
  Wrap a child process that responds to each line of input with one line of
  output.  Uses pty to trick the child into providing unbuffered output.
  """
  def __init__(self, args, close_stderr=False):
    """Fork a pty and run the command @args in the child.

    Args:
        args: command line as a list; args[0] is looked up in PATH.
        close_stderr: when True, the child's stderr is replaced by
            /dev/null.
    """
    # Local imports so that the script can start on Windows.
    import pty
    import termios
    pid, fd = pty.fork()
    if pid == 0:
      # We're the child. Transfer control to command.
      try:
        if close_stderr:
          dev_null = os.open('/dev/null', 0)
          os.dup2(dev_null, 2)
        os.execvp(args[0], args)
      finally:
        # Only reached when the exec (or the redirection) failed. The forked
        # child must never return into the parent's code, so terminate it
        # immediately without running any cleanup handlers.
        os._exit(127)
    else:
      # Disable echoing.
      attr = termios.tcgetattr(fd)
      attr[3] = attr[3] & ~termios.ECHO
      termios.tcsetattr(fd, termios.TCSANOW, attr)
      # Set up a file()-like interface to the child process
      self.r = os.fdopen(fd, "r", 1)
      self.w = os.fdopen(os.dup(fd), "w", 1)

  def convert(self, line):
    """Send @line to the child and return its next line of output."""
    self.w.write(line + "\n")
    return self.readline()

  def readline(self):
    """Return the next line of child output without trailing whitespace."""
    return self.r.readline().rstrip()
206
207
class DarwinSymbolizer(Symbolizer):
  """Symbolizer backed by an 'atos' process bound to a single binary."""

  def __init__(self, addr, binary):
    """Start atos for @binary; the arch is guessed from the width of @addr."""
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    self.arch = guess_arch(addr)
    self.open_atos()

  def open_atos(self):
    """Launch atos behind an UnbufferedLineConverter (stored in self.atos)."""
    if DEBUG:
      print('atos -o %s -arch %s' % (self.binary, self.arch))
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary this symbolizer was created for,
        otherwise a one-element list: '<addr> in <function> <file>' when the
        atos reply is well-formed, '<addr> in <raw atos reply>' otherwise.
    """
    if self.binary != binary:
      return None
    atos_line = self.atos.convert('0x%x' % int(offset, 16))
    # Skip informational lines that precede the actual answer.
    while "got symbolicator for" in atos_line:
      atos_line = self.atos.readline()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    if DEBUG:
      print('atos_line:  %s' % atos_line)
    if match:
      function_name = match.group(1)
      # Drop the parameter list from the function name.
      function_name = re.sub(r'\(.*?\)', '', function_name)
      file_name = fix_filename(match.group(3))
      return ['%s in %s %s' % (addr, function_name, file_name)]
    else:
      return ['%s in %s' % (addr, atos_line)]
240
241
242# Chain several symbolizers so that if one symbolizer fails, we fall back
243# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries a list of symbolizers in order until one of them succeeds."""

  def __init__(self, symbolizer_list):
    """Remember @symbolizer_list; the list object itself is kept."""
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns the first non-empty result produced by a symbolizer of the
    chain, or None when every symbolizer fails.
    """
    for candidate in self.symbolizer_list:
      if not candidate:
        # Unset entries (e.g. None) are skipped.
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add @symbolizer at the end of the chain."""
    self.symbolizer_list.append(symbolizer)
260
261
def BreakpadSymbolizerFactory(binary):
  """Create a BreakpadSymbolizer for @binary if a symbol file exists.

  The symbol file name is @binary followed by the value of the
  BREAKPAD_SUFFIX environment variable.

  Returns:
      A BreakpadSymbolizer, or None when BREAKPAD_SUFFIX is not set or the
      symbol file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  symbol_file = binary + suffix
  if not os.access(symbol_file, os.F_OK):
    return None
  return BreakpadSymbolizer(symbol_file)
269
270
def SystemSymbolizerFactory(system, addr, binary):
  """Create the symbolizer based on the native tool of @system.

  Returns:
      A DarwinSymbolizer on 'Darwin', an Addr2LineSymbolizer on 'Linux' and
      'FreeBSD', None for any other system.
  """
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary)
  if system in ('Linux', 'FreeBSD'):
    return Addr2LineSymbolizer(binary)
  return None
276
277
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that looks addresses up in a Breakpad symbol file."""

  def __init__(self, filename):
    """Load and parse the Breakpad symbol file @filename."""
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    # Read through a context manager so that the file is always closed.
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    # Source file names, indexed by file number.
    self.files = []
    # Symbol address -> symbol name.
    self.symbols = {}
    # Sorted start addresses of all line records.
    self.address_list = []
    # Line record start address -> (symbol address, size, line, file number).
    self.addresses = {}
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Fill the lookup tables from the records following the MODULE line."""
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Ignore blank lines instead of failing on fragments[0].
        continue
      if fragments[0] == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif fragments[0] == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif fragments[0] in ['CFI', 'STACK']:
        pass
      elif fragments[0] == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A name already recorded for this address takes precedence.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(fragments[0], 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Find the line record that covers @addr.

    Returns:
        A (symbol name, file name, line number) tuple, or None when no line
        record covers @addr.
    """
    # Index of the first record starting after addr; the record just before
    # it is the last one starting at or before addr.
    index = bisect.bisect_right(self.address_list, addr)
    if index == 0:
      return None
    key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    if addr < key + size:
      return symbol, filename, line_no
    else:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        A one-element list '<addr> in <function> <file>:<line>', or None if
        @binary is not the binary of the symbol file or @offset is not
        covered by it.
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if res:
      function_name, file_name, line_no = res
      result = ['%s in %s %s:%d' % (
          addr, function_name, file_name, line_no)]
      if DEBUG:
        # Debug output only: printing unconditionally would mix the raw
        # list into the symbolized log written to stdout.
        print(result)
      return result
    else:
      return None
350
351
class SymbolizationLoop(object):
  """Reads a log line by line and symbolizes the stack frames found in it."""

  def __init__(self, binary_name_filter=None, dsym_hint_producer=None):
    """Select the line processor for the host platform.

    Args:
        binary_name_filter: optional callable that maps the binary path
            found in the log to the path that should be symbolized.
        dsym_hint_producer: optional callable that returns an iterable of
            .dSYM hints for a binary; only consulted on Darwin.
    Raises:
        Exception: on a non-Windows system other than Linux, Darwin or
            FreeBSD.
    """
    if sys.platform == 'win32':
      # ASan on Windows uses dbghelp.dll to symbolize in-process, which works
      # even in sandboxed processes.  Nothing needs to be done here.
      self.process_line = self.process_line_echo
    else:
      # Used by clients who may want to supply a different binary name.
      # E.g. in Chrome several binaries may share a single .dSYM.
      self.binary_name_filter = binary_name_filter
      self.dsym_hint_producer = dsym_hint_producer
      self.system = os.uname()[0]
      if self.system not in ['Linux', 'Darwin', 'FreeBSD']:
        raise Exception('Unknown system')
      self.llvm_symbolizers = {}
      self.last_llvm_symbolizer = None
      self.dsym_hints = set()
      self.frame_no = 0
      self.process_line = self.process_line_posix

  def symbolize_address(self, addr, binary, offset):
    """Symbolize one frame with the chain of symbolizers for @binary.

    Args:
        addr: virtual address of the frame, as found in the log.
        binary: path of the binary that contains the frame.
        offset: offset of the frame inside @binary.
    Returns:
        A non-empty list of frame descriptions.
    Raises:
        Exception: if the Breakpad/LLVM symbolizers fail and
            allow_system_symbolizer is False.
    """
    # On non-Darwin (i.e. on platforms without .dSYM debug info) always use
    # a single symbolizer binary.
    # On Darwin, if the dsym hint producer is present:
    #  1. check whether we've seen this binary already; if so,
    #     use |llvm_symbolizers[binary]|, which has already loaded the debug
    #     info for this binary (might not be the case for
    #     |last_llvm_symbolizer|);
    #  2. otherwise check if we've seen all the hints for this binary already;
    #     if so, reuse |last_llvm_symbolizer| which has the full set of hints;
    #  3. otherwise create a new symbolizer and pass all currently known
    #     .dSYM hints to it.
    if binary not in self.llvm_symbolizers:
      use_new_symbolizer = True
      if self.system == 'Darwin' and self.dsym_hint_producer:
        dsym_hints_for_binary = set(self.dsym_hint_producer(binary))
        use_new_symbolizer = bool(dsym_hints_for_binary - self.dsym_hints)
        self.dsym_hints |= dsym_hints_for_binary
      if use_new_symbolizer or not self.last_llvm_symbolizer:
        self.last_llvm_symbolizer = LLVMSymbolizerFactory(
            self.system, guess_arch(addr), self.dsym_hints)
      self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
    # Use the chain of symbolizers:
    # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    # (fall back to next symbolizer if the previous one fails).
    if binary not in symbolizers:
      symbolizers[binary] = ChainSymbolizer(
          [BreakpadSymbolizerFactory(binary), self.llvm_symbolizers[binary]])
    result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      if not allow_system_symbolizer:
        raise Exception('Failed to launch or use llvm-symbolizer.')
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def get_symbolized_lines(self, symbolized_lines):
    """Format symbolized frames as numbered stack trace lines.

    Args:
        symbolized_lines: list of frame descriptions, or a false value when
            symbolization failed.
    Returns:
        One '    #<n> <frame>' string per frame (self.frame_no is advanced
        for each of them), or the unmodified current line when there is
        nothing to format.
    """
    if not symbolized_lines:
      return [self.current_line]
    else:
      result = []
      for symbolized_frame in symbolized_lines:
        result.append('    #%s %s' % (str(self.frame_no), symbolized_frame.rstrip()))
        self.frame_no += 1
      return result

  def process_logfile(self):
    """Symbolize every line of the global logfile and print the result."""
    self.frame_no = 0
    for line in logfile:
      processed = self.process_line(line)
      print('\n'.join(processed))

  def process_line_echo(self, line):
    """Return @line unchanged except for stripped trailing whitespace."""
    return [line.rstrip()]

  def process_line_posix(self, line):
    """Symbolize @line if it is a stack frame, otherwise echo it.

    Returns:
        A list of output lines: the symbolized frame(s) for a stack trace
        line, or the original line with trailing whitespace stripped.
    """
    self.current_line = line.rstrip()
    #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    stack_trace_line_format = (
        r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')
    match = re.match(stack_trace_line_format, line)
    if not match:
      return [self.current_line]
    if DEBUG:
      print(line)
    _, frameno_str, addr, binary, offset = match.groups()
    if frameno_str == '0':
      # Assume that frame #0 is the first frame of new stack trace.
      self.frame_no = 0
    original_binary = binary
    if self.binary_name_filter:
      binary = self.binary_name_filter(binary)
    symbolized_line = self.symbolize_address(addr, binary, offset)
    if not symbolized_line:
      if original_binary != binary:
        # The filtered name could not be symbolized: retry with the name
        # exactly as it appears in the log.
        symbolized_line = self.symbolize_address(addr, original_binary,
                                                 offset)
    return self.get_symbolized_lines(symbolized_line)
455
456
if __name__ == '__main__':
  # Describe the command line interface.
  arg_parser = argparse.ArgumentParser(
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description='ASan symbolization script',
      epilog='Example of use:\n'
             'asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" '
             '-s "$HOME/SymbolFiles" < asan.log')
  arg_parser.add_argument(
      'path_to_cut', nargs='*',
      help='pattern to be cut from the result file path ')
  arg_parser.add_argument(
      '-d', '--demangle', action='store_true',
      help='demangle function names')
  arg_parser.add_argument(
      '-s', metavar='SYSROOT',
      help='set path to sysroot for sanitized binaries')
  arg_parser.add_argument(
      '-c', metavar='CROSS_COMPILE',
      help='set prefix for binutils')
  arg_parser.add_argument(
      '-l', '--logfile', default=sys.stdin,
      type=argparse.FileType('r'),
      help='set log file name to parse, default is stdin')
  options = arg_parser.parse_args()

  # Copy the options that were given into the module-level settings.
  if options.path_to_cut:
    fix_filename_patterns = options.path_to_cut
  if options.demangle:
    demangle = True
  if options.s:
    sysroot_path = options.s
    binary_name_filter = sysroot_path_filter
  if options.c:
    binutils_prefix = options.c
  logfile = options.logfile or sys.stdin

  # Symbolize the whole log.
  loop = SymbolizationLoop(binary_name_filter)
  loop.process_logfile()
491