1#!/usr/bin/env python
2#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4#                     The LLVM Compiler Infrastructure
5#
6# This file is distributed under the University of Illinois Open Source
7# License. See LICENSE.TXT for details.
8#
9#===------------------------------------------------------------------------===#
10import argparse
11import bisect
12import getopt
13import os
14import pty
15import re
16import subprocess
17import sys
18import termios
19
# Cache of symbolizer chains, keyed by binary name; populated lazily by
# SymbolizationLoop.symbolize_address().
symbolizers = {}
# When true, the symbolizer classes print the commands they launch and the
# lines they process.
DEBUG = False
# Whether the external tools are asked to demangle names (set by -d).
demangle = False
# Prefix prepended to the 'addr2line' tool name (set by -c), or None.
binutils_prefix = None
# Path prepended to binary names by sysroot_path_filter() (set by -s).
sysroot_path = None
# Callable that rewrites binary names; passed to SymbolizationLoop.
binary_name_filter = None
# List of regex patterns that fix_filename() cuts from file names, or None.
fix_filename_patterns = None
# File object the log is read from by SymbolizationLoop.process_logfile().
logfile = sys.stdin
28
29# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Shorten a 'file:line' location reported by a symbolizer.

  Args:
      file_name: location string, e.g. '/path/to/foo.cc:12'.
  Returns:
      The location with every pattern from the global fix_filename_patterns
      (and everything preceding it) removed, with ASan runtime sources
      collapsed to '_asan_rtl_' and 'crtstuff.c:0' replaced by '???:0'.
  """
  if fix_filename_patterns:
    for path_to_cut in fix_filename_patterns:
      # User patterns are regular expressions; the leading '.*' also drops
      # whatever precedes the match.
      file_name = re.sub('.*' + path_to_cut, '', file_name)
  # The dots are escaped so that only genuine '.cc' / '.c' suffixes match.
  file_name = re.sub(r'.*asan_[a-z_]*\.cc:[0-9]*', '_asan_rtl_', file_name)
  file_name = re.sub(r'.*crtstuff\.c:0', '???:0', file_name)
  return file_name
37
def sysroot_path_filter(binary_name):
  """Return binary_name with the global sysroot_path prepended.

  The two strings are concatenated as-is; no path separator is inserted.
  """
  relocated_name = sysroot_path + binary_name
  return relocated_name
40
def guess_arch(addr):
  """Guess the architecture from the textual length of an address.

  Args:
      addr: address string such as '0x7f6e35cf2e45'.
  Returns:
      'x86_64' if the string is longer than '0x' plus eight hex digits,
      otherwise 'i386'.
  """
  longest_32bit_addr = len('0x') + 8
  return 'x86_64' if len(addr) > longest_32bit_addr else 'i386'
47
class Symbolizer(object):
  """Common interface of all symbolizers; symbolizes nothing by itself."""

  def __init__(self):
    """Nothing to set up in the base class."""

  def symbolize(self, addr, binary, offset):
    """Describe the code location of one instruction.

    Subclasses override this method; the base implementation always
    reports failure.

    Args:
        addr: virtual address of an instruction.
        binary: path to the executable/shared object holding the instruction.
        offset: offset of the instruction inside @binary.
    Returns:
        A list of strings, one per inlined frame, each naming the function,
        file, line and column of the instruction; or None when the address
        cannot be symbolized.
    """
    return None
66
67
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that talks to one long-running llvm-symbolizer process.

  Requests are written to the child's stdin as '<binary> <offset>' lines and
  the answer is read back from its stdout.
  """

  def __init__(self, symbolizer_path, addr):
    """Start the child process.

    Args:
        symbolizer_path: name or path of the llvm-symbolizer executable.
        addr: a sample address string, used to guess the default arch.
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    self.default_arch = guess_arch(addr)
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Launch llvm-symbolizer.

    Returns:
        The subprocess.Popen object, or None if the program cannot be
        started (callers then fall back to other symbolizers).
    """
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=%s' % demangle,
           '--functions=short',
           '--inlining=true',
           '--default-arch=%s' % self.default_arch]
    if DEBUG:
      sys.stdout.write(' '.join(cmd) + '\n')
    try:
      # Unbuffered text-mode pipes: every request must reach the child
      # immediately, and replies are handled as str on Python 2 and 3 alike.
      return subprocess.Popen(cmd, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              bufsize=0,
                              universal_newlines=True)
    except OSError:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        A list of '<addr> in <function> <file>' strings, one per reported
        frame, or None if the child is unavailable, the exchange fails, or
        only unknown ('??') frames come back.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '%s %s' % (binary, offset)
      if DEBUG:
        sys.stdout.write(symbolizer_input + '\n')
      self.pipe.stdin.write(symbolizer_input + '\n')
      self.pipe.stdin.flush()
      # The reply is a sequence of (function, file) line pairs ended by an
      # empty line.
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') or
            not file_name.startswith('??')):
          # Append only non-trivial frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception:
      # Deliberately best effort: any failure means "not symbolized here" so
      # that the next symbolizer in the chain gets a chance.
      result = []
    return result or None
117
118
def LLVMSymbolizerFactory(system, addr):
  """Create an LLVMSymbolizer for the configured llvm-symbolizer binary.

  The executable is taken from $LLVM_SYMBOLIZER_PATH, then from
  $ASAN_SYMBOLIZER_PATH; if neither is set (or both are empty) the bare name
  'llvm-symbolizer' is used, i.e. it is assumed to be in PATH.

  Args:
      system: operating system name; not used here.
      addr: sample address string forwarded to LLVMSymbolizer.
  """
  symbolizer_path = 'llvm-symbolizer'
  for env_name in ('LLVM_SYMBOLIZER_PATH', 'ASAN_SYMBOLIZER_PATH'):
    configured_path = os.getenv(env_name)
    if configured_path:
      symbolizer_path = configured_path
      break
  return LLVMSymbolizer(symbolizer_path, addr)
127
128
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer that keeps one addr2line process open for a single binary."""

  def __init__(self, binary):
    """Start addr2line for @binary."""
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()

  def open_addr2line(self):
    """Launch addr2line, honouring binutils_prefix and demangle.

    Returns:
        The subprocess.Popen object, or None if the tool cannot be started
        (a warning is written to stderr in that case).
    """
    addr2line_tool = 'addr2line'
    if binutils_prefix:
      addr2line_tool = binutils_prefix + addr2line_tool
    cmd = [addr2line_tool, '-f']
    if demangle:
      cmd += ['--demangle']
    cmd += ['-e', self.binary]
    if DEBUG:
      sys.stdout.write(' '.join(cmd) + '\n')
    try:
      # Unbuffered text-mode pipes: each offset must reach the child at once
      # and replies are handled as str on Python 2 and 3 alike.
      return subprocess.Popen(cmd,
                              stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                              bufsize=0,
                              universal_newlines=True)
    except OSError as err:
      # A missing tool should not abort the whole log; symbolize() then
      # reports frames with empty function and file names.
      sys.stderr.write('warning: cannot run %s: %s\n' % (addr2line_tool, err))
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary this instance serves; otherwise a
        one-element list '<addr> in <function> <file>'. Function and file
        are empty when addr2line could not be queried.
    """
    if self.binary != binary:
      return None
    function_name = ''
    file_name = ''
    if self.pipe:
      try:
        self.pipe.stdin.write('%s\n' % offset)
        self.pipe.stdin.flush()
        # With -f, addr2line answers with the function name on one line and
        # the file location on the next.
        function_name = self.pipe.stdout.readline().rstrip()
        file_name = self.pipe.stdout.readline().rstrip()
      except Exception:
        # Best effort: fall back to an empty description.
        function_name = ''
        file_name = ''
    file_name = fix_filename(file_name)
    return ['%s in %s %s' % (addr, function_name, file_name)]
161
162
class UnbufferedLineConverter(object):
  """
  Wrap a child process that responds to each line of input with one line of
  output.  Uses pty to trick the child into providing unbuffered output.
  """
  def __init__(self, args, close_stderr=False):
    """Fork a child on a new pty and exec @args in it.

    Args:
        args: command line as a list; args[0] is looked up in PATH.
        close_stderr: if true, the child's stderr is sent to the null device.
    """
    pid, fd = pty.fork()
    if pid == 0:
      # We're the child. Transfer control to command.
      try:
        if close_stderr:
          # Open for writing so that the child's writes to stderr succeed
          # and are discarded.
          dev_null = os.open(os.devnull, os.O_WRONLY)
          os.dup2(dev_null, 2)
        os.execvp(args[0], args)
      finally:
        # Reached only when the exec (or its setup) failed: leave at once so
        # the child never continues running the parent's program.
        os._exit(127)
    # We're the parent.
    # Disable echoing.
    attr = termios.tcgetattr(fd)
    attr[3] = attr[3] & ~termios.ECHO
    termios.tcsetattr(fd, termios.TCSANOW, attr)
    # Set up a file()-like interface to the child process
    self.r = os.fdopen(fd, "r", 1)
    self.w = os.fdopen(os.dup(fd), "w", 1)

  def convert(self, line):
    """Send @line to the child and return its next output line."""
    self.w.write(line + "\n")
    return self.readline()

  def readline(self):
    """Return the next output line of the child, right-stripped."""
    return self.r.readline().rstrip()
191
192
class DarwinSymbolizer(Symbolizer):
  """Symbolizer that queries an 'atos' child process for a single binary."""

  # A well-formed atos response looks like this:
  #   foo(type1, type2) (in object.name) (filename.cc:80)
  _ATOS_LINE_RE = re.compile(r'^(.*) \(in (.*)\) \((.*:\d*)\)$')
  # Non-greedy match of a parenthesized group, e.g. an argument list.
  _PARENS_RE = re.compile(r'\(.*?\)')

  def __init__(self, addr, binary):
    """Start atos for @binary; @addr is used to guess the architecture."""
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    self.arch = guess_arch(addr)
    self.open_atos()

  def open_atos(self):
    """Launch atos behind a pty and store the converter in self.atos."""
    if DEBUG:
      sys.stdout.write('atos -o %s -arch %s\n' % (self.binary, self.arch))
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary this instance serves; otherwise a
        one-element list. A well-formed atos answer yields
        '<addr> in <function> <file>', anything else is passed through as
        '<addr> in <raw atos line>'.
    """
    if self.binary != binary:
      return None
    atos_line = self.atos.convert('0x%x' % int(offset, 16))
    # Skip lines containing "got symbolicator for"; they are not answers.
    while "got symbolicator for" in atos_line:
      atos_line = self.atos.readline()
    match = self._ATOS_LINE_RE.match(atos_line)
    if DEBUG:
      sys.stdout.write('atos_line:  %s\n' % atos_line)
    if match:
      # Drop parenthesized parts (argument lists) from the function name.
      function_name = self._PARENS_RE.sub('', match.group(1))
      file_name = fix_filename(match.group(3))
      return ['%s in %s %s' % (addr, function_name, file_name)]
    else:
      return ['%s in %s' % (addr, atos_line)]
225
226
227# Chain several symbolizers so that if one symbolizer fails, we fall back
228# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries a list of symbolizers in order and returns the first answer."""

  def __init__(self, symbolizer_list):
    """Remember @symbolizer_list; falsy entries are skipped when querying."""
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns the first truthy result produced by a member of the chain, or
    None if no member produces one.
    """
    for candidate in self.symbolizer_list:
      if not candidate:
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add @symbolizer to the end of the chain."""
    self.symbolizer_list.append(symbolizer)
245
246
def BreakpadSymbolizerFactory(binary):
  """Create a BreakpadSymbolizer for @binary if a symbol file exists.

  The symbol file name is @binary followed by $BREAKPAD_SUFFIX.

  Returns:
      A BreakpadSymbolizer, or None if the variable is unset/empty or the
      file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  filename = binary + suffix
  if not os.access(filename, os.F_OK):
    return None
  return BreakpadSymbolizer(filename)
254
255
def SystemSymbolizerFactory(system, addr, binary):
  """Create the platform's native symbolizer for @binary.

  Returns:
      An Addr2LineSymbolizer on 'Linux', a DarwinSymbolizer on 'Darwin',
      and None for any other system name.
  """
  if system == 'Linux':
    return Addr2LineSymbolizer(binary)
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary)
  return None
261
262
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that answers from a Breakpad text symbol file."""

  # Record types that carry nothing this symbolizer needs.
  _IGNORED_RECORDS = frozenset(
      ['CFI', 'STACK', 'INFO', 'INLINE', 'INLINE_ORIGIN'])

  def __init__(self, filename):
    """Load and index the symbol file @filename.

    The first line must be the MODULE record; its fields provide self.arch,
    self.debug_id and self.binary.
    """
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    self.files = []          # FILE records; the index is the file number.
    self.symbols = {}        # Function/public address -> symbol name.
    self.address_list = []   # Sorted start addresses of the line records.
    self.addresses = {}      # Line record address -> (func, size, line, file).
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Index the records that follow the MODULE line.

    Blank lines and record types without symbol information are skipped.
    """
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        continue
      record = fragments[0]
      if record == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif record == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif record in self._IGNORED_RECORDS:
        pass
      elif record == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A PUBLIC name already recorded for this address takes precedence.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(fragments[0], 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Look up the line record that covers @addr.

    Returns:
        A (symbol, filename, line_no) tuple, or None if no line record
        covers the address.
    """
    # Position just past the last record starting at or before addr.
    index = bisect.bisect_right(self.address_list, addr)
    if index == 0:
      return None
    key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    if addr >= key + size:
      # addr lies beyond the end of the closest preceding record.
      return None
    return self.symbols[sym_id], self.files[file_no], line_no

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        A one-element list '<addr> in <function> <file>:<line>', or None if
        @binary is not the module of this symbol file or @offset is not
        covered by it.
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if not res:
      return None
    function_name, file_name, line_no = res
    result = ['%s in %s %s:%d' % (
        addr, function_name, file_name, line_no)]
    if DEBUG:
      # Diagnostic only; must not leak into the regular symbolized output.
      sys.stdout.write('%s\n' % (result,))
    return result
335
336
class SymbolizationLoop(object):
  """Reads a log line by line and replaces stack frames by symbolized ones."""

  #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
  _STACK_TRACE_LINE_RE = re.compile(
      r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')

  def __init__(self, binary_name_filter=None):
    """Set up the loop.

    Args:
        binary_name_filter: optional callable mapping a binary name from the
            log to the name that should be symbolized.
    Raises:
        Exception: if the host system is not Linux, Darwin or FreeBSD.
    """
    # Used by clients who may want to supply a different binary name.
    # E.g. in Chrome several binaries may share a single .dSYM.
    self.binary_name_filter = binary_name_filter
    self.system = os.uname()[0]
    if self.system not in ['Linux', 'Darwin', 'FreeBSD']:
      raise Exception('Unknown system')
    self.llvm_symbolizer = None
    self.frame_no = 0
    # Last line handed to process_line(), right-stripped.
    self.current_line = ''

  def symbolize_address(self, addr, binary, offset):
    """Symbolize one frame through the chain of symbolizers for @binary.

    Returns:
        A list of frame descriptions, or None if no symbolizer (including
        the system one, where available) produced a result.
    """
    # Initialize llvm-symbolizer lazily.
    if not self.llvm_symbolizer:
      self.llvm_symbolizer = LLVMSymbolizerFactory(self.system, addr)
    # Use the chain of symbolizers:
    # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    # (fall back to next symbolizer if the previous one fails).
    if binary not in symbolizers:
      symbolizers[binary] = ChainSymbolizer(
          [BreakpadSymbolizerFactory(binary), self.llvm_symbolizer])
    chain = symbolizers[binary]
    result = chain.symbolize(addr, binary, offset)
    if result is None:
      # Initialize system symbolizer only if other symbolizers failed.
      system_symbolizer = SystemSymbolizerFactory(self.system, addr, binary)
      # Some systems have no system symbolizer; then the frame simply stays
      # unsymbolized instead of aborting the whole run.
      if system_symbolizer is not None:
        chain.append_symbolizer(system_symbolizer)
        result = chain.symbolize(addr, binary, offset)
    return result

  def get_symbolized_lines(self, symbolized_lines):
    """Format symbolized frames for output.

    Returns:
        One '    #<n> <frame>' string per frame, numbered from the running
        frame counter, or the current input line unchanged if
        @symbolized_lines is empty or None.
    """
    if not symbolized_lines:
      return [self.current_line]
    result = []
    for symbolized_frame in symbolized_lines:
      result.append('    #%s %s' % (self.frame_no, symbolized_frame.rstrip()))
      self.frame_no += 1
    return result

  def process_logfile(self):
    """Process the global logfile until EOF, printing the result to stdout."""
    self.frame_no = 0
    # readline() keeps the loop line-interactive when reading from a pipe.
    for line in iter(logfile.readline, ''):
      processed = self.process_line(line)
      sys.stdout.write('\n'.join(processed) + '\n')

  def process_line(self, line):
    """Symbolize @line if it is a stack frame.

    Returns:
        A list of output lines: the symbolized frame(s), or the stripped
        input line if it is not a stack frame or cannot be symbolized.
    """
    self.current_line = line.rstrip()
    match = self._STACK_TRACE_LINE_RE.match(line)
    if not match:
      return [self.current_line]
    if DEBUG:
      sys.stdout.write(line + '\n')
    _, frameno_str, addr, binary, offset = match.groups()
    if frameno_str == '0':
      # Assume that frame #0 is the first frame of new stack trace.
      self.frame_no = 0
    original_binary = binary
    if self.binary_name_filter:
      binary = self.binary_name_filter(binary)
    symbolized_line = self.symbolize_address(addr, binary, offset)
    if not symbolized_line and original_binary != binary:
      # The filtered name did not work; retry with the name from the log.
      symbolized_line = self.symbolize_address(addr, original_binary, offset)
    return self.get_symbolized_lines(symbolized_line)
409
410
if __name__ == '__main__':
  # Command-line entry point: parse the options into the module-level
  # settings, then symbolize the chosen log.
  parser = argparse.ArgumentParser(
      description='ASan symbolization script',
      formatter_class=argparse.RawDescriptionHelpFormatter,
      epilog='''Example of use:
  asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" -s "$HOME/SymbolFiles" < asan.log''')
  parser.add_argument(
      'path_to_cut', nargs='*',
      help='pattern to be cut from the result file path ')
  parser.add_argument(
      '-d', '--demangle', action='store_true',
      help='demangle function names')
  parser.add_argument(
      '-s', metavar='SYSROOT',
      help='set path to sysroot for sanitized binaries')
  parser.add_argument(
      '-c', metavar='CROSS_COMPILE',
      help='set prefix for binutils')
  parser.add_argument(
      '-l', '--logfile', default=sys.stdin, type=argparse.FileType('r'),
      help='set log file name to parse, default is stdin')
  args = parser.parse_args()

  # Options that were not given leave the module defaults untouched.
  fix_filename_patterns = args.path_to_cut or fix_filename_patterns
  demangle = True if args.demangle else demangle
  if args.s:
    sysroot_path = args.s
    binary_name_filter = sysroot_path_filter
  binutils_prefix = args.c or binutils_prefix
  logfile = args.logfile or sys.stdin

  loop = SymbolizationLoop(binary_name_filter)
  loop.process_logfile()
442