asan_symbolize.py revision f21e025112d5f82b2b475eb1d8e690824883fc97
1#!/usr/bin/env python
2#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4#                     The LLVM Compiler Infrastructure
5#
6# This file is distributed under the University of Illinois Open Source
7# License. See LICENSE.TXT for details.
8#
9#===------------------------------------------------------------------------===#
10import bisect
11import getopt
12import os
13import pty
14import re
15import subprocess
16import sys
17import termios
18
# Module-wide state shared by the symbolizers below.
# Not assigned anywhere else in the visible code.
llvm_symbolizer = None
# Cache of per-binary symbolizer chains, keyed by binary path.
symbolizers = {}
# When true, symbolizers echo the commands and inputs they use.
DEBUG = False
# Whether symbolizers are asked for demangled names (-d/--demangle).
demangle = False
23
24
25# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Shorten a source location reported by a symbolizer.

  Every positional command-line argument is used as a regular expression
  naming a path prefix: everything up to and including its last match is
  removed from file_name.  Option flags (arguments starting with '-', such as
  -d/--demangle) are skipped so that they are never applied as a prefix.
  Locations inside asan_*.cc files collapse to '_asan_rtl_', and a
  'crtstuff.c:0' location is rewritten to '???:0'.

  Args:
      file_name: location string produced by a symbolizer.
  Returns:
      The shortened location string.
  """
  for path_to_cut in sys.argv[1:]:
    if path_to_cut.startswith('-'):
      # An option consumed by getopt, not a path prefix.
      continue
    file_name = re.sub('.*' + path_to_cut, '', file_name)
  # Dots are escaped so that only real '.cc' / '.c' suffixes match.
  file_name = re.sub(r'.*asan_[a-z_]*\.cc:[0-9]*', '_asan_rtl_', file_name)
  file_name = re.sub(r'.*crtstuff\.c:0', '???:0', file_name)
  return file_name
32
33
class Symbolizer(object):
  """Interface shared by all symbolizer backends in this file."""

  def __init__(self):
    pass

  def symbolize(self, addr, binary, offset):
    """Describe the code location of a single instruction.

    Overridden in subclasses; this base version resolves nothing.

    Args:
        addr: virtual address of an instruction.
        binary: path to the executable/shared object containing the
            instruction.
        offset: offset of the instruction inside @binary.
    Returns:
        A list of strings, one per inlined frame, giving function name, file
        name, line and column numbers for the instruction.  The base class
        always returns None.
    """
    return None
52
53
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that talks to one long-lived llvm-symbolizer child process."""

  def __init__(self, symbolizer_path):
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Start llvm-symbolizer with piped stdin/stdout.

    The program is launched directly instead of being looked up with
    os.path.exists(), because a bare name such as 'llvm-symbolizer' has to be
    resolved through PATH and would otherwise always look missing.

    Returns:
        The subprocess.Popen object, or None if the program cannot be
        started.
    """
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=%s' % demangle,
           '--functions=true',
           '--inlining=true']
    if DEBUG:
      print(' '.join(cmd))
    try:
      return subprocess.Popen(cmd, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE)
    except OSError:
      # Missing or non-executable binary: behave as "no LLVM symbolizer".
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Sends '<binary> <offset>' to the child and reads back pairs of lines
    (function name, then file name) until an empty line is seen.

    Returns:
        A list of '<addr> in <function> <file>' strings, one per frame whose
        function and file are both known, or None when there is no child
        process, communication fails, or no valid frame was reported.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '%s %s' % (binary, offset)
      if DEBUG:
        print(symbolizer_input)
      self.pipe.stdin.write(symbolizer_input + '\n')
      # Make sure the request reaches the child even if the pipe is buffered.
      self.pipe.stdin.flush()
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') and
            not file_name.startswith('??')):
          # Append only valid frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception:
      # Best effort: any failure means "not symbolized" so that the caller
      # can fall back to another symbolizer.
      result = []
    if not result:
      result = None
    return result
99
100
def LLVMSymbolizerFactory(system):
  """Build an LLVMSymbolizer for the configured llvm-symbolizer program.

  The program path comes from the LLVM_SYMBOLIZER_PATH environment variable;
  when that is unset or empty, llvm-symbolizer is assumed to be in PATH.

  Args:
      system: operating system name; accepted but not consulted here.
  Returns:
      A new LLVMSymbolizer.
  """
  configured_path = os.getenv('LLVM_SYMBOLIZER_PATH')
  return LLVMSymbolizer(configured_path or 'llvm-symbolizer')
107
108
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer that queries an `addr2line -f` child bound to one binary."""

  def __init__(self, binary):
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()

  def open_addr2line(self):
    """Start addr2line for self.binary with piped stdin/stdout."""
    demangle_flag = ['--demangle'] if demangle else []
    command = ['addr2line', '-f'] + demangle_flag + ['-e', self.binary]
    if DEBUG:
      print(' '.join(command))
    return subprocess.Popen(command,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None when @binary is not the binary this instance was created for;
        otherwise a one-element list '<addr> in <function> <file>'.  If
        talking to addr2line fails, function and file are left empty.
    """
    if binary != self.binary:
      return None
    try:
      self.pipe.stdin.write(str(offset) + '\n')
      # addr2line -f answers with the function name, then the location.
      function_name = self.pipe.stdout.readline().rstrip()
      file_name = self.pipe.stdout.readline().rstrip()
    except Exception:
      function_name = ''
      file_name = ''
    location = fix_filename(file_name)
    return ['%s in %s %s' % (addr, function_name, location)]
138
139
class UnbufferedLineConverter(object):
  """
  Wrap a child process that answers every input line with one output line.
  The child is run on a pty so that it does not buffer its output.
  """
  def __init__(self, args):
    child_pid, master_fd = pty.fork()
    if child_pid != 0:
      # Parent: turn off echoing in the terminal's local flags (index 3 of
      # the attribute list).
      tty_attrs = termios.tcgetattr(master_fd)
      tty_attrs[3] &= ~termios.ECHO
      termios.tcsetattr(master_fd, termios.TCSANOW, tty_attrs)
      # Expose file()-like reader and writer objects for the child; the
      # writer gets its own duplicate of the descriptor.
      self.r = os.fdopen(master_fd, "r", 1)
      self.w = os.fdopen(os.dup(master_fd), "w", 1)
    else:
      # Child: hand control over to the requested command.
      os.execvp(args[0], args)

  def convert(self, line):
    """Send one line to the child and return its one-line answer."""
    self.w.write(line + "\n")
    return self.readline()

  def readline(self):
    """Read one line from the child, without trailing whitespace."""
    answer = self.r.readline()
    return answer.rstrip()
165
166
class DarwinSymbolizer(Symbolizer):
  """Symbolizer that queries an `atos` child process bound to one binary."""

  def __init__(self, addr, binary):
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    # Guess the architecture from the address width: '0x' plus 8 hex digits
    # is 10 characters, anything longer is taken to be 64-bit.
    self.arch = 'x86_64' if len(addr) > 10 else 'i386'
    self.open_atos()

  def open_atos(self):
    """Start atos for self.binary/self.arch behind a line converter."""
    if DEBUG:
      print('atos -o %s -arch %s' % (self.binary, self.arch))
    self.atos = UnbufferedLineConverter(
        ['atos', '-o', self.binary, '-arch', self.arch])

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None when @binary is not the binary this instance was created for;
        otherwise a one-element list.  A well-formed atos answer yields
        '<addr> in <function> <file>'; anything else is passed through as
        '<addr> in <raw atos line>'.
    """
    if self.binary != binary:
      return None
    atos_line = self.atos.convert('0x%x' % int(offset, 16))
    # Skip atos' "got symbolicator for" chatter.
    while "got symbolicator for" in atos_line:
      atos_line = self.atos.readline()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    parsed = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    if DEBUG:
      print(' '.join(['atos_line: ', atos_line]))
    if not parsed:
      return ['%s in %s' % (addr, atos_line)]
    # Drop the parenthesized argument list from the function name.
    function_name = re.sub(r'\(.*?\)', '', parsed.group(1))
    file_name = fix_filename(parsed.group(3))
    return ['%s in %s %s' % (addr, function_name, file_name)]
203
204
205# Chain several symbolizers so that if one symbolizer fails, we fall back
206# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries a list of symbolizers in order and returns the first answer."""

  def __init__(self, symbolizer_list):
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        The first truthy result produced by a member of the chain, or None
        when every member fails.  Falsy entries in the list are skipped.
    """
    for candidate in self.symbolizer_list:
      if not candidate:
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add another symbolizer to the end of the chain."""
    self.symbolizer_list.append(symbolizer)
223
224
def BreakpadSymbolizerFactory(binary):
  """Create a BreakpadSymbolizer for @binary if a symbol file exists.

  The symbol file name is @binary followed by the value of the
  BREAKPAD_SUFFIX environment variable.

  Returns:
      A BreakpadSymbolizer, or None when the variable is unset/empty or the
      symbol file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  symbol_file = binary + suffix
  if not os.access(symbol_file, os.F_OK):
    return None
  return BreakpadSymbolizer(symbol_file)
232
233
def SystemSymbolizerFactory(system, addr, binary):
  """Create the platform's native symbolizer for @binary.

  Args:
      system: operating system name ('Darwin' or 'Linux').
      addr: an address from the trace; passed to DarwinSymbolizer.
      binary: path of the binary to symbolize.
  Returns:
      A DarwinSymbolizer on Darwin, an Addr2LineSymbolizer on Linux, and
      None for any other system.
  """
  if system == 'Linux':
    return Addr2LineSymbolizer(binary)
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary)
  return None
239
240
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that answers from a Breakpad text symbol file kept in memory."""

  def __init__(self, filename):
    """Load and index the symbol file at @filename."""
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    # Read everything up front and close the file right away.
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    self.files = []          # Source file names, indexed by file number.
    self.symbols = {}        # Symbol start address -> symbol name.
    self.address_list = []   # Sorted start addresses of all line records.
    self.addresses = {}      # Line record address -> record tuple.
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Index the FILE, PUBLIC, FUNC and line records found in @lines.

    CFI and STACK records are ignored, and so are blank lines.  Any other
    line is treated as a line record: address, size, line, file number.
    """
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Tolerate blank lines instead of failing on fragments[0].
        continue
      if fragments[0] == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif fragments[0] == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif fragments[0] in ['CFI', 'STACK']:
        pass
      elif fragments[0] == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # Keep a name that is already recorded for this address.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(fragments[0], 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Look up the line record covering @addr.

    Args:
        addr: integer offset inside the binary.
    Returns:
        A (symbol name, file name, line number) tuple, or None when no line
        record covers @addr.
    """
    key = None
    if addr in self.addresses:
      key = addr
    else:
      # Closest line record starting below addr, if there is one.
      index = bisect.bisect_left(self.address_list, addr)
      if index == 0:
        return None
      else:
        key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    if addr < key + size:
      return symbol, filename, line_no
    else:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        A one-element list '<addr> in <function> <file>:<line>', or None
        when @binary is not the module described by the symbol file or the
        offset is not covered by any line record.
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if res:
      function_name, file_name, line_no = res
      result = ['%s in %s %s:%d' % (
          addr, function_name, file_name, line_no)]
      # Diagnostic output only; the caller prints the symbolized frame.
      if DEBUG:
        print(result)
      return result
    else:
      return None
313
314
class SymbolizationLoop(object):
  """Reads a report from stdin and prints it with stack frames symbolized."""

  def __init__(self, binary_name_filter=None):
    """Set up the loop for the current operating system.

    Args:
        binary_name_filter: optional callable mapping the binary name found
            in a stack frame to the name that should be symbolized instead.
    Raises:
        Exception: if the system is neither Linux nor Darwin.
    """
    # Used by clients who may want to supply a different binary name.
    # E.g. in Chrome several binaries may share a single .dSYM.
    self.binary_name_filter = binary_name_filter
    self.system = os.uname()[0]
    if self.system in ['Linux', 'Darwin']:
      self.llvm_symbolizer = LLVMSymbolizerFactory(self.system)
    else:
      raise Exception('Unknown system')

  def symbolize_address(self, addr, binary, offset):
    """Symbolize one frame with the chain of symbolizers kept for @binary.

    Returns:
        A non-empty list of frame descriptions.
    """
    # Use the chain of symbolizers:
    # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    # (fall back to next symbolizer if the previous one fails).
    if binary not in symbolizers:
      symbolizers[binary] = ChainSymbolizer(
          [BreakpadSymbolizerFactory(binary), self.llvm_symbolizer])
    result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def print_symbolized_lines(self, symbolized_lines):
    """Print the symbolized frames, or the current input line if none.

    Every printed frame gets the next frame number from self.frame_no.
    """
    if not symbolized_lines:
      print(self.current_line)
    else:
      for symbolized_frame in symbolized_lines:
        print('    #' + str(self.frame_no) + ' ' + symbolized_frame.rstrip())
        self.frame_no += 1

  def process_stdin(self):
    """Copy stdin to stdout, replacing stack frame lines as they are read.

    Lines that are not stack frames are echoed without trailing whitespace.
    """
    self.frame_no = 0
    #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    stack_trace_line_format = re.compile(
        r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')
    while True:
      line = sys.stdin.readline()
      if not line:
        break
      self.current_line = line.rstrip()
      match = stack_trace_line_format.match(line)
      if not match:
        print(self.current_line)
        continue
      if DEBUG:
        print(line)
      _, frameno_str, addr, binary, offset = match.groups()
      if frameno_str == '0':
        # Assume that frame #0 is the first frame of new stack trace.
        self.frame_no = 0
      original_binary = binary
      if self.binary_name_filter:
        binary = self.binary_name_filter(binary)
      symbolized_line = self.symbolize_address(addr, binary, offset)
      if not symbolized_line and original_binary != binary:
        # The filtered name gave nothing: retry with the name from the trace.
        symbolized_line = self.symbolize_address(addr, original_binary, offset)
      self.print_symbolized_lines(symbolized_line)
379
380
if __name__ == '__main__':
  # -d / --demangle turns on demangling; positional arguments are left in
  # args and not used here.
  opts, args = getopt.getopt(sys.argv[1:], "d", ["demangle"])
  if any(flag in ("-d", "--demangle") for flag, _ in opts):
    demangle = True
  # Symbolize everything arriving on stdin.
  loop = SymbolizationLoop()
  loop.process_stdin()
388