asan_symbolize.py revision 2e6a1fb50f0252b673c37a272773a83508d4f927
1#!/usr/bin/env python
2#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4#                     The LLVM Compiler Infrastructure
5#
6# This file is distributed under the University of Illinois Open Source
7# License. See LICENSE.TXT for details.
8#
9#===------------------------------------------------------------------------===#
10import bisect
11import getopt
12import os
13import pty
14import re
15import subprocess
16import sys
17import termios
18
# Verbose switch: when true, the symbolizers print the commands and queries
# they issue.
DEBUG = False
# Turned on by the -d/--demangle command-line flag; read when the
# llvm-symbolizer / addr2line command lines are built.
demangle = False
# Module-level placeholder; nothing in this file assigns or reads it.
llvm_symbolizer = None
# Cache of per-binary symbolizer chains, keyed by the binary's path.
symbolizers = {}
23
24
25# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Shorten a source location string reported by a symbolizer.

  Each non-option command-line argument is used as a regular expression for
  a path prefix: everything up to and including its last match is removed
  from file_name. Locations inside asan_*.cc files are collapsed to
  '_asan_rtl_', and 'crtstuff.c:0' locations are replaced with '???:0'.

  Args:
      file_name: location string, e.g. '/path/to/file.cc:12'.
  Returns:
      the shortened location string.
  """
  for path_to_cut in sys.argv[1:]:
    # Option flags (-d, --demangle) share sys.argv with the path prefixes.
    # Using them as patterns would truncate every path that merely contains
    # e.g. "-d", so skip them.
    if path_to_cut.startswith('-'):
      continue
    file_name = re.sub('.*' + path_to_cut, '', file_name)
  file_name = re.sub('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_', file_name)
  file_name = re.sub('.*crtstuff.c:0', '???:0', file_name)
  return file_name
32
33
class Symbolizer(object):
  """Common interface of all symbolizers; this base resolves nothing."""

  def __init__(self):
    """Nothing to set up in the base class."""

  def symbolize(self, addr, binary, offset):
    """Symbolize the given address (pair of binary and offset).

    Overridden in subclasses.
    Args:
        addr: virtual address of an instruction.
        binary: path to executable/shared object containing this instruction.
        offset: instruction offset in the @binary.
    Returns:
        list of strings (one string for each inlined frame) describing
        the code locations for this instruction (that is, function name, file
        name, line and column numbers). The base implementation always
        returns None.
    """
    return None
52
53
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that talks to a long-running llvm-symbolizer child process."""

  def __init__(self, symbolizer_path):
    """Start the helper process.

    Args:
        symbolizer_path: path or bare command name of llvm-symbolizer.
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    # None when the helper could not be started; symbolize() then returns None
    # for every query.
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Launch llvm-symbolizer with pipes attached to its stdin/stdout.

    Returns:
        the subprocess.Popen object, or None if the program cannot be
        executed.
    """
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=%s' % demangle,
           '--functions=true',
           '--inlining=true']
    if DEBUG:
      print(' '.join(cmd))
    # Do not pre-check with os.path.exists(): a bare command name such as
    # 'llvm-symbolizer' has to be resolved through PATH, which only the
    # actual launch does. A missing or non-executable program raises OSError.
    try:
      return subprocess.Popen(cmd, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE)
    except OSError:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        list of '<addr> in <function> <file>' strings, one per frame, or None
        if the helper is unavailable, fails, or reports no valid frame.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '%s %s' % (binary, offset)
      if DEBUG:
        print(symbolizer_input)
      self.pipe.stdin.write(symbolizer_input + '\n')
      self.pipe.stdin.flush()
      # The reply is read as (function name, file name) line pairs; an empty
      # line (or EOF) ends it.
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') and
            not file_name.startswith('??')):
          # Append only valid frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception:
      # Best effort: any failure talking to the helper means "no result", so
      # that the next symbolizer in the chain gets a chance.
      result = []
    if not result:
      result = None
    return result
99
100
def LLVMSymbolizerFactory(system):
  """Create an LLVMSymbolizer for the configured llvm-symbolizer binary.

  Args:
      system: operating system name; not used by this function.
  Returns:
      an LLVMSymbolizer built for $LLVM_SYMBOLIZER_PATH, or for the bare name
      'llvm-symbolizer' when that variable is unset or empty.
  """
  configured_path = os.getenv('LLVM_SYMBOLIZER_PATH')
  return LLVMSymbolizer(configured_path or 'llvm-symbolizer')
107
108
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer bound to one binary, backed by an addr2line child process."""

  def __init__(self, binary):
    """Start addr2line for the given binary.

    Args:
        binary: path of the executable/shared object to symbolize.
    """
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()

  def open_addr2line(self):
    """Spawn 'addr2line -f [--demangle] -e <binary>' with piped stdin/stdout."""
    demangle_args = ['--demangle'] if demangle else []
    cmd = ['addr2line', '-f'] + demangle_args + ['-e', self.binary]
    if DEBUG:
      print(' '.join(cmd))
    return subprocess.Popen(cmd,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None for a binary other than the one this instance was created for;
        otherwise a single-element list '<addr> in <function> <file>'. The
        function and file parts are empty if talking to addr2line fails.
    """
    if binary != self.binary:
      return None
    try:
      # One offset per line in; a function-name line and a location line out.
      self.pipe.stdin.write(str(offset) + '\n')
      function_name = self.pipe.stdout.readline().rstrip()
      file_name = self.pipe.stdout.readline().rstrip()
    except Exception:
      function_name = ''
      file_name = ''
    return ['%s in %s %s' % (addr, function_name, fix_filename(file_name))]
138
139
class UnbufferedLineConverter(object):
  """
  Drive a child process that answers every input line with one output line.
  The child is attached to a pty so that it provides unbuffered output.
  """
  def __init__(self, args, close_stderr=False):
    """Fork the child on a new pty and set up the parent's file objects.

    Args:
        args: command line of the child; args[0] is the program to execute.
        close_stderr: if true, the child's stderr is redirected to /dev/null.
    """
    child_pid, master_fd = pty.fork()
    if child_pid == 0:
      # Child side of the fork: hand control over to the command.
      if close_stderr:
        null_fd = os.open('/dev/null', 0)
        os.dup2(null_fd, 2)
      os.execvp(args[0], args)
    else:
      # Parent side: turn off terminal echo on the pty.
      tty_attrs = termios.tcgetattr(master_fd)
      tty_attrs[3] &= ~termios.ECHO
      termios.tcsetattr(master_fd, termios.TCSANOW, tty_attrs)
      # Line-buffered reader and writer (on a dup of the fd) for talking to
      # the child.
      self.r = os.fdopen(master_fd, "r", 1)
      self.w = os.fdopen(os.dup(master_fd), "w", 1)

  def convert(self, line):
    """Send one line to the child and return its next line of output."""
    self.w.write(line + "\n")
    return self.readline()

  def readline(self):
    """Read one line from the child, with trailing whitespace removed."""
    reply = self.r.readline()
    return reply.rstrip()
168
169
class DarwinSymbolizer(Symbolizer):
  """Symbolizer bound to one binary, backed by an 'atos' child process."""

  def __init__(self, addr, binary):
    """Pick an architecture from the address width and start atos.

    Args:
        addr: a textual address from the stack trace ('0x...').
        binary: path of the executable/shared object to symbolize.
    """
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    # Guess which arch we're running. 10 = len('0x') + 8 hex digits.
    self.arch = 'x86_64' if len(addr) > 10 else 'i386'
    self.open_atos()

  def open_atos(self):
    """Start 'atos -o <binary> -arch <arch>' behind an UnbufferedLineConverter."""
    if DEBUG:
      print('atos -o %s -arch %s' % (self.binary, self.arch))
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None for a binary other than the one this instance was created for;
        otherwise a single-element list describing the frame. If the atos
        reply is not in the expected form it is passed through verbatim.
    """
    if self.binary != binary:
      return None
    atos_line = self.atos.convert('0x%x' % int(offset, 16))
    # Skip any "got symbolicator for" lines that precede the real answer.
    while "got symbolicator for" in atos_line:
      atos_line = self.atos.readline()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    if DEBUG:
      print(' '.join(['atos_line: ', atos_line]))
    if not match:
      return ['%s in %s' % (addr, atos_line)]
    # Drop the parenthesized argument list from the function name.
    function_name = re.sub(r'\(.*?\)', '', match.group(1))
    file_name = fix_filename(match.group(3))
    return ['%s in %s %s' % (addr, function_name, file_name)]
206
207
208# Chain several symbolizers so that if one symbolizer fails, we fall back
209# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries a list of symbolizers in order until one produces a result."""

  def __init__(self, symbolizer_list):
    """Store the list of symbolizers to try.

    Args:
        symbolizer_list: list of symbolizers in priority order. Falsy entries
            (e.g. None) are allowed and are skipped during lookup.
    """
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        the first truthy result produced by a symbolizer in the list, or None
        if none of them produces one.
    """
    for candidate in self.symbolizer_list:
      if not candidate:
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add a symbolizer at the end of the list (lowest priority)."""
    self.symbolizer_list.append(symbolizer)
226
227
def BreakpadSymbolizerFactory(binary):
  """Create a BreakpadSymbolizer for a binary if a symbol file is present.

  The symbol file is looked up at <binary> + $BREAKPAD_SUFFIX.

  Args:
      binary: path to the executable/shared object.
  Returns:
      a BreakpadSymbolizer for that file, or None when BREAKPAD_SUFFIX is
      unset/empty or the file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  symbol_file = binary + suffix
  if not os.access(symbol_file, os.F_OK):
    return None
  return BreakpadSymbolizer(symbol_file)
234  return None
235
236
def SystemSymbolizerFactory(system, addr, binary):
  """Create the platform's native symbolizer for a binary.

  Args:
      system: operating system name, 'Linux' or 'Darwin'.
      addr: textual address from the stack trace; used only on Darwin.
      binary: path to the executable/shared object.
  Returns:
      an Addr2LineSymbolizer on Linux, a DarwinSymbolizer on Darwin, and None
      for any other system name.
  """
  if system == 'Linux':
    return Addr2LineSymbolizer(binary)
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary)
  return None
242
243
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that answers queries from a Breakpad text symbol file."""

  def __init__(self, filename):
    """Load and index a Breakpad symbol file.

    Args:
        filename: path of the symbol file. Its first line must be a MODULE
            record; the binary name is taken from its trailing fields.
    """
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    # Read everything up front and close the file instead of leaking it.
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    # File names, indexed by the number given in their FILE record.
    self.files = []
    # Function/public symbol start address -> symbol name.
    self.symbols = {}
    # Sorted start addresses of all line records (for bisection).
    self.address_list = []
    # Line record start address -> (function address, size, line, file no).
    self.addresses = {}
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Index the records that follow the MODULE line.

    Args:
        lines: list of text lines of the symbol file, without the first one.
    """
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Blank line: nothing to index.
        continue
      record_type = fragments[0]
      if record_type == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif record_type == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif record_type in ('CFI', 'STACK', 'INFO'):
        # Unwind and informational records carry no symbol/line data.
        pass
      elif record_type == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A PUBLIC name recorded earlier for the same address wins.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(fragments[0], 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Find the line record covering an address.

    Args:
        addr: integer offset inside the binary.
    Returns:
        tuple (symbol name, file name, line number), or None if no line
        record covers addr.
    """
    if addr in self.addresses:
      key = addr
    else:
      # Closest line record starting below addr.
      index = bisect.bisect_left(self.address_list, addr)
      if index == 0:
        return None
      key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    if addr < key + size:
      return symbol, filename, line_no
    return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        single-element list '<addr> in <function> <file>:<line>', or None if
        the binary is not the one named in the symbol file or the offset is
        not covered by any line record.
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if not res:
      return None
    function_name, file_name, line_no = res
    result = ['%s in %s %s:%d' % (
        addr, function_name, file_name, line_no)]
    # Diagnostic dump only: printing unconditionally would mix the list repr
    # into the symbolized stack trace written to stdout.
    if DEBUG:
      print(result)
    return result
316
317
class SymbolizationLoop(object):
  """Reads a report from stdin and prints it with stack frames symbolized."""

  def __init__(self, binary_name_filter=None):
    """Set up the loop for the current operating system.

    Args:
        binary_name_filter: optional callable mapping the binary path found
            in a stack frame to the path that should be symbolized instead.
    Raises:
        Exception: if the system is neither Linux nor Darwin.
    """
    # Used by clients who may want to supply a different binary name.
    # E.g. in Chrome several binaries may share a single .dSYM.
    self.binary_name_filter = binary_name_filter
    self.system = os.uname()[0]
    if self.system not in ['Linux', 'Darwin']:
      raise Exception('Unknown system')
    self.llvm_symbolizer = LLVMSymbolizerFactory(self.system)

  def symbolize_address(self, addr, binary, offset):
    """Symbolize one address with the symbolizer chain kept for its binary.

    Returns:
        list of strings, one per (possibly inlined) frame.
    """
    # Use the chain of symbolizers:
    # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    # (fall back to next symbolizer if the previous one fails).
    if binary not in symbolizers:
      symbolizers[binary] = ChainSymbolizer(
          [BreakpadSymbolizerFactory(binary), self.llvm_symbolizer])
    result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def print_symbolized_lines(self, symbolized_lines):
    """Print the frames of one input line, renumbering them consecutively.

    If symbolized_lines is empty or None, the current input line is printed
    unchanged.
    """
    if not symbolized_lines:
      print(self.current_line)
    else:
      for symbolized_frame in symbolized_lines:
        print('    #' + str(self.frame_no) + ' ' + symbolized_frame.rstrip())
        self.frame_no += 1

  def process_stdin(self):
    """Copy stdin to stdout, replacing stack trace lines by symbolized frames."""
    self.frame_no = 0
    #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    # Compiled once here rather than for every input line.
    stack_trace_line_format = re.compile(
        r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')
    while True:
      line = sys.stdin.readline()
      if not line:
        break
      self.current_line = line.rstrip()
      match = stack_trace_line_format.match(line)
      if not match:
        print(self.current_line)
        continue
      if DEBUG:
        print(line)
      _, frameno_str, addr, binary, offset = match.groups()
      if frameno_str == '0':
        # Assume that frame #0 is the first frame of new stack trace.
        self.frame_no = 0
      original_binary = binary
      if self.binary_name_filter:
        binary = self.binary_name_filter(binary)
      symbolized_line = self.symbolize_address(addr, binary, offset)
      if not symbolized_line and original_binary != binary:
        # The filtered name yielded nothing: retry with the name that was
        # actually found in the stack trace.
        symbolized_line = self.symbolize_address(addr, original_binary,
                                                 offset)
      self.print_symbolized_lines(symbolized_line)
381      self.print_symbolized_lines(symbolized_line)
382
383
if __name__ == '__main__':
  # Only -d / --demangle is recognized; it switches on demangling in the
  # symbolizer command lines built later.
  opts, args = getopt.getopt(sys.argv[1:], "d", ["demangle"])
  for flag, _ in opts:
    if flag == "-d" or flag == "--demangle":
      demangle = True
  # Symbolize everything arriving on stdin.
  SymbolizationLoop().process_stdin()
391