asan_symbolize.py revision 41dcb1c8848c8677c06216c6fcaa9b001f736778
1#!/usr/bin/env python
2#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4#                     The LLVM Compiler Infrastructure
5#
6# This file is distributed under the University of Illinois Open Source
7# License. See LICENSE.TXT for details.
8#
9#===------------------------------------------------------------------------===#
10import bisect
11import getopt
12import os
13import re
14import subprocess
15import sys
16
# When true, the symbolizers echo the commands they launch and the raw
# lines they process.
DEBUG = False
# When true, the external tools are asked to demangle symbol names.
# Set from the -d/--demangle command-line option.
demangle = False
# NOTE(review): not assigned or read anywhere else in the visible code.
llvm_symbolizer = None
# Cache of symbolizer chains, keyed by binary path.
symbolizers = {}
21
22
23# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Shorten and normalize a 'file:line' string for display.

  Every positional command-line argument is used as a regular-expression
  fragment: everything up to and including its match is removed from
  file_name.  Option flags (arguments starting with '-', such as -d or
  --demangle) are skipped, because they select behavior and are not path
  prefixes; applying them would truncate any path that happens to contain
  the flag text.

  Args:
      file_name: location string as produced by a symbolizer tool.
  Returns:
      the shortened string; locations inside asan_*.cc become '_asan_rtl_'
      and 'crtstuff.c:0' becomes '???:0'.
  """
  for path_to_cut in sys.argv[1:]:
    if path_to_cut.startswith('-'):
      # An option flag, not a path prefix to strip.
      continue
    file_name = re.sub('.*' + path_to_cut, '', file_name)
  file_name = re.sub('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_', file_name)
  file_name = re.sub('.*crtstuff.c:0', '???:0', file_name)
  return file_name
30
31
class Symbolizer(object):
  """Common interface of all symbolizers in this file."""

  def __init__(self):
    # No state of its own; exists so subclasses can chain to it via super().
    pass

  def symbolize(self, addr, binary, offset):
    """Describe the code location of one instruction.

    The base implementation knows nothing and always returns None;
    subclasses override it.

    Args:
        addr: virtual address of an instruction.
        binary: path to executable/shared object containing this instruction.
        offset: instruction offset in the @binary.
    Returns:
        list of strings (one string for each inlined frame) describing
        the code locations for this instruction (that is, function name, file
        name, line and column numbers).
    """
    return None
50
51
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that talks to one llvm-symbolizer child process over pipes."""

  def __init__(self, symbolizer_path):
    """Start llvm-symbolizer.

    Args:
        symbolizer_path: path or bare command name of llvm-symbolizer.
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    # None when the tool could not be started; symbolize() then declines.
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Launch llvm-symbolizer and return its Popen object, or None.

    The launch itself is used as the existence test: a bare command name
    such as 'llvm-symbolizer' is resolved through PATH by Popen, which
    os.path.exists() would only look up relative to the current directory.
    """
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=%s' % demangle,
           '--functions=true',
           '--inlining=true']
    if DEBUG:
      print(' '.join(cmd))
    try:
      return subprocess.Popen(cmd, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE)
    except OSError:
      # Missing or non-executable tool: let the caller fall back.
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Sends '<binary> <offset>' to the child and reads back pairs of lines
    (function name, then file name) until an empty line is seen.

    Returns:
        list of '<addr> in <function> <file>' strings, or None when the
        child is not running, an error occurred, or no frame was valid.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '%s %s' % (binary, offset)
      if DEBUG:
        print(symbolizer_input)
      self.pipe.stdin.write(symbolizer_input + '\n')
      self.pipe.stdin.flush()
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') and
            not file_name.startswith('??')):
          # Append only valid frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception:
      # Best effort: any pipe failure means "no answer from this symbolizer".
      result = []
    if not result:
      result = None
    return result
97
98
def LLVMSymbolizerFactory(system):
  """Build an LLVMSymbolizer for the configured llvm-symbolizer binary.

  Args:
      system: accepted for symmetry with the other factories; not used.
  Returns:
      an LLVMSymbolizer using $LLVM_SYMBOLIZER_PATH when that variable is
      set and non-empty, otherwise the bare name 'llvm-symbolizer'.
  """
  # An unset or empty variable means: assume llvm-symbolizer is in PATH.
  path = os.getenv('LLVM_SYMBOLIZER_PATH') or 'llvm-symbolizer'
  return LLVMSymbolizer(path)
105
106
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer that queries one addr2line child process per binary."""

  def __init__(self, binary):
    """Start addr2line for the given binary.

    Args:
        binary: path of the executable/shared object this instance serves.
    """
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()

  def open_addr2line(self):
    """Launch 'addr2line -f [--demangle] -e <binary>' and return the Popen."""
    demangle_args = ['--demangle'] if demangle else []
    cmd = ['addr2line', '-f'] + demangle_args + ['-e', self.binary]
    if DEBUG:
      print(' '.join(cmd))
    return subprocess.Popen(cmd,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None for a binary other than the one this instance was created
        for; otherwise a one-element list '<addr> in <function> <file>'.
        If talking to addr2line fails, both names are left empty.
    """
    if binary != self.binary:
      return None
    try:
      self.pipe.stdin.write('%s\n' % offset)
      function_name = self.pipe.stdout.readline().rstrip()
      file_name = self.pipe.stdout.readline().rstrip()
    except Exception:
      function_name = file_name = ''
    return ['%s in %s %s' % (addr, function_name, fix_filename(file_name))]
136
137
class DarwinSymbolizer(Symbolizer):
  """Symbolizer that runs the 'atos' tool, one process per lookup."""

  def __init__(self, addr, binary):
    """Remember the binary and guess the architecture to pass to atos.

    Args:
        addr: a sample address string from the trace, used only for its
            length.
        binary: path of the executable/shared object this instance serves.
    """
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    # Guess which arch we're running. 10 = len('0x') + 8 hex digits.
    if len(addr) > 10:
      self.arch = 'x86_64'
    else:
      self.arch = 'i386'
    self.pipe = None

  def write_addr_to_pipe(self, offset):
    """Send the hex string offset to atos, re-formatted as '0x...'."""
    self.pipe.stdin.write('0x%x\n' % int(offset, 16))

  def open_atos(self):
    """Start a fresh 'atos -o <binary> -arch <arch>' and store it in pipe."""
    if DEBUG:
      print('atos -o %s -arch %s' % (self.binary, self.arch))
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    self.pipe = subprocess.Popen(cmdline,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None for a binary other than the one this instance was created
        for; otherwise a one-element list.  A well-formed atos answer gives
        '<addr> in <function> <file>'; anything else is passed through as
        '<addr> in <raw atos line>'.
    """
    if self.binary != binary:
      return None
    self.open_atos()
    self.write_addr_to_pipe(offset)
    # communicate() closes stdin, drains stdout and stderr and waits for the
    # child, so each lookup leaves no zombie process or open descriptor.
    atos_output, _ = self.pipe.communicate()
    # Only the first output line is the answer to our single query.
    atos_line = atos_output.split('\n', 1)[0].rstrip()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    if DEBUG:
      print(' '.join(['atos_line: ', atos_line]))
    if match:
      function_name = match.group(1)
      # Drop the parameter list, keeping only the function name.
      function_name = re.sub(r'\(.*?\)', '', function_name)
      file_name = fix_filename(match.group(3))
      return ['%s in %s %s' % (addr, function_name, file_name)]
    else:
      return ['%s in %s' % (addr, atos_line)]
181
182
183# Chain several symbolizers so that if one symbolizer fails, we fall back
184# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries a list of symbolizers in order and returns the first answer."""

  def __init__(self, symbolizer_list):
    """Store the list as given; entries that are None (or falsy) are skipped."""
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        the first non-empty result produced by a symbolizer in the chain,
        or None when every one of them came back empty.
    """
    for candidate in self.symbolizer_list:
      if not candidate:
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add one more symbolizer to the end of the chain."""
    self.symbolizer_list.append(symbolizer)
201
202
def BreakpadSymbolizerFactory(binary):
  """Return a BreakpadSymbolizer for binary, or None if none is available.

  The symbol file is looked up at binary + $BREAKPAD_SUFFIX.  None is
  returned when the variable is unset or empty, or when that file does
  not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  symbol_file = binary + suffix
  if not os.access(symbol_file, os.F_OK):
    return None
  return BreakpadSymbolizer(symbol_file)
210
211
def SystemSymbolizerFactory(system, addr, binary):
  """Create the platform's native symbolizer for binary.

  Args:
      system: platform name; 'Linux' and 'Darwin' are recognized.
      addr: sample address string, forwarded to DarwinSymbolizer.
      binary: path of the executable/shared object to symbolize.
  Returns:
      an Addr2LineSymbolizer on Linux, a DarwinSymbolizer on Darwin, and
      None for any other system name.
  """
  if system == 'Linux':
    return Addr2LineSymbolizer(binary)
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary)
  return None
217
218
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer backed by a Breakpad text symbol file loaded into memory."""

  def __init__(self, filename):
    """Read and index the symbol file.

    Args:
        filename: path of the Breakpad symbol file; its first line must be
            the MODULE record.
    """
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    # Close the file as soon as it has been read.
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    # Source file names, indexed by the number given in FILE records.
    self.files = []
    # Function/public symbol address -> symbol name.
    self.symbols = {}
    # Sorted start addresses of all line records, for bisection.
    self.address_list = []
    # Line record start address -> (symbol address, size, line, file number).
    self.addresses = {}
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Fill the lookup tables from the records following the MODULE line.

    FILE, PUBLIC and FUNC records are indexed, CFI and STACK records are
    ignored, blank lines are skipped, and every other line is treated as a
    line record 'address size line file_number' belonging to the most
    recent FUNC.
    """
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Blank line: nothing to index.
        continue
      record = fragments[0]
      if record == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif record == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif record in ['CFI', 'STACK']:
        pass
      elif record == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A name already recorded for this address is kept.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(record, 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Find the line record that covers addr.

    Args:
        addr: integer offset inside the module.
    Returns:
        (symbol name, file name, line number), or None when no line record
        starts at or before addr, or the closest one ends before addr.
    """
    # Index of the first record starting after addr; the record just before
    # it is the last one starting at or before addr (an exact hit included).
    index = bisect.bisect_right(self.address_list, addr)
    if index == 0:
      return None
    key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    if addr < key + size:
      return symbol, filename, line_no
    else:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None when binary differs from the name in the MODULE record or the
        offset is not covered; otherwise a one-element list
        '<addr> in <function> <file>:<line>'.
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if res:
      function_name, file_name, line_no = res
      result = ['%s in %s %s:%d' % (
          addr, function_name, file_name, line_no)]
      if DEBUG:
        # Diagnostic only: stdout is also the tool's real output stream.
        print(result)
      return result
    else:
      return None
291
292
class SymbolizationLoop(object):
  """Reads a report from stdin and prints it with stack frames symbolized."""

  # Matches a stack frame line such as:
  #   #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
  # Groups: prefix, frame number, address, binary, offset.
  _STACK_TRACE_LINE_RE = re.compile(
      r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')

  def __init__(self, binary_name_filter=None):
    """Set up the loop for the current platform.

    Args:
        binary_name_filter: optional callable mapping the binary path found
            in the trace to the path that should be symbolized instead.
    Raises:
        Exception: when the platform is neither Linux nor Darwin.
    """
    # Used by clients who may want to supply a different binary name.
    # E.g. in Chrome several binaries may share a single .dSYM.
    self.binary_name_filter = binary_name_filter
    self.system = os.uname()[0]
    if self.system in ['Linux', 'Darwin']:
      self.llvm_symbolizer = LLVMSymbolizerFactory(self.system)
    else:
      raise Exception('Unknown system')

  def symbolize_address(self, addr, binary, offset):
    """Return the list of frame strings for one address.

    A chain of symbolizers is created per binary on first use and cached
    in the module-level symbolizers dict.
    """
    # Use the chain of symbolizers:
    # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    # (fall back to next symbolizer if the previous one fails).
    if binary not in symbolizers:
      symbolizers[binary] = ChainSymbolizer(
          [BreakpadSymbolizerFactory(binary), self.llvm_symbolizer])
    result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def print_symbolized_lines(self, symbolized_lines):
    """Print the frames, renumbered, or echo the input line if there are none."""
    if not symbolized_lines:
      print(self.current_line)
    else:
      for symbolized_frame in symbolized_lines:
        print('    #' + str(self.frame_no) + ' ' + symbolized_frame.rstrip())
        self.frame_no += 1

  def process_stdin(self):
    """Copy stdin to stdout, replacing stack frame lines by symbolized ones."""
    self.frame_no = 0
    for line in sys.stdin:
      self.current_line = line.rstrip()
      match = self._STACK_TRACE_LINE_RE.match(line)
      if not match:
        print(self.current_line)
        continue
      if DEBUG:
        print(line)
      _, frameno_str, addr, binary, offset = match.groups()
      if frameno_str == '0':
        # Assume that frame #0 is the first frame of new stack trace.
        self.frame_no = 0
      original_binary = binary
      if self.binary_name_filter:
        binary = self.binary_name_filter(binary)
      symbolized_line = self.symbolize_address(addr, binary, offset)
      if not symbolized_line and original_binary != binary:
        # The filtered name gave nothing: retry with the name from the trace.
        symbolized_line = self.symbolize_address(addr, original_binary, offset)
      self.print_symbolized_lines(symbolized_line)
354
355
if __name__ == '__main__':
  # -d / --demangle is the only recognized option; it turns on demangling
  # in the external symbolizer tools.
  opts, args = getopt.getopt(sys.argv[1:], 'd', ['demangle'])
  for option, _value in opts:
    if option == '-d' or option == '--demangle':
      demangle = True
  # Symbolize everything arriving on stdin.
  SymbolizationLoop().process_stdin()
363