asan_symbolize.py revision 444a185d855bccf806f12572d3e8a01eee7c09bf
1#!/usr/bin/env python
2#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4#                     The LLVM Compiler Infrastructure
5#
6# This file is distributed under the University of Illinois Open Source
7# License. See LICENSE.TXT for details.
8#
9#===------------------------------------------------------------------------===#
10import bisect
11import os
12import re
13import subprocess
14import sys
15
# Not referenced anywhere else in the visible part of this file.
llvm_symbolizer = None
# Cache of symbolizer chains keyed by binary path; filled lazily by
# SymbolizationLoop.symbolize_address().
symbolizers = {}
# Not referenced anywhere else in the visible part of this file.
filetypes = {}
# Not referenced anywhere else in the visible part of this file.
vmaddrs = {}
# When True, the symbolizers print the commands they spawn and the lines they
# exchange with them to stdout.
DEBUG = False
21
22
23# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Shorten a source location string for display.

  Every command-line argument (sys.argv[1:]) is used as a regular expression
  fragment: everything up to and including its match is removed from
  |file_name|. Afterwards locations inside asan_*.cc files are collapsed to
  '_asan_rtl_' and 'crtstuff.c:0' is replaced by '???:0'.

  Args:
      file_name: location string, e.g. '/path/to/file.cc:12'.
  Returns:
      the rewritten location string.
  """
  cleaned = file_name
  for prefix_pattern in sys.argv[1:]:
    cleaned = re.sub('.*' + prefix_pattern, '', cleaned)
  # Fixed rewrites, applied in this order after the user-supplied prefixes.
  fixed_rewrites = (
      ('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_'),
      ('.*crtstuff.c:0', '???:0'),
  )
  for pattern, replacement in fixed_rewrites:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned
30
31
class Symbolizer(object):
  """Common interface of all symbolizers; this base version resolves nothing."""

  def __init__(self):
    """The base symbolizer keeps no state."""

  def symbolize(self, addr, binary, offset):
    """Symbolize the given address (pair of binary and offset).

    Overridden in subclasses.
    Args:
        addr: virtual address of an instruction.
        binary: path to executable/shared object containing this instruction.
        offset: instruction offset in the @binary.
    Returns:
        list of strings (one string for each inlined frame) describing
        the code locations for this instruction (that is, function name, file
        name, line and column numbers). The base implementation always
        returns None.
    """
    return None
50
51
class LLVMSymbolizer(Symbolizer):
  """Symbolizer backed by a single llvm-symbolizer child process.

  The process is started once in the constructor and then queried through
  its stdin/stdout pipes, one request per symbolize() call.
  """

  def __init__(self, symbolizer_path):
    """Start llvm-symbolizer.

    Args:
        symbolizer_path: path or bare command name of the llvm-symbolizer
            executable.
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Spawn llvm-symbolizer.

    Returns:
        the subprocess.Popen object, or None when the executable cannot be
        started.
    """
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=false',
           '--functions=true',
           '--inlining=true']
    if DEBUG:
      print(' '.join(cmd))
    # Let Popen resolve the command instead of testing os.path.exists():
    # an existence check looks only at the given path relative to the
    # current directory, so a bare 'llvm-symbolizer' that lives in PATH
    # would wrongly be reported as missing.
    try:
      return subprocess.Popen(cmd, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE)
    except OSError:
      # Missing or non-executable binary: report "no symbolizer available".
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Sends '<binary> <offset>' to llvm-symbolizer and reads back pairs of
    lines (function name, file location) until an empty line. Frames whose
    function name or location starts with '??' are dropped.

    Returns:
        list of '<addr> in <function> <location>' strings, or None when the
        symbolizer is not running, fails, or yields no valid frame.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '%s %s' % (binary, offset)
      if DEBUG:
        print(symbolizer_input)
      self.pipe.stdin.write(symbolizer_input + '\n')
      # Make sure the request reaches the child before we block on reading.
      self.pipe.stdin.flush()
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') and
            not file_name.startswith('??')):
          # Append only valid frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception:
      # Deliberately best-effort: any failure talking to the child is
      # reported as "no result" (None is returned below).
      result = []
    if not result:
      result = None
    return result
97
98
def LLVMSymbolizerFactory(system):
  """Create the LLVMSymbolizer used for all binaries.

  The executable is taken from the LLVM_SYMBOLIZER_PATH environment variable;
  when that is unset or empty the bare command name 'llvm-symbolizer' is
  used instead.

  Args:
      system: name of the operating system; currently unused.
  Returns:
      a new LLVMSymbolizer instance.
  """
  configured_path = os.getenv('LLVM_SYMBOLIZER_PATH')
  return LLVMSymbolizer(configured_path or 'llvm-symbolizer')
105
106
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer that queries one 'addr2line -f -e <binary>' child process."""

  def __init__(self, binary):
    """Start addr2line for |binary|, the only binary this instance handles."""
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()

  def open_addr2line(self):
    """Spawn addr2line for self.binary and return its Popen object."""
    argv = ['addr2line', '-f', '-e', self.binary]
    if DEBUG:
      print(' '.join(argv))
    return subprocess.Popen(argv, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns None for a foreign binary. Otherwise always returns a one-element
    list; if talking to addr2line fails, the function and file names in it
    are empty.
    """
    if binary != self.binary:
      return None
    try:
      self.pipe.stdin.write('%s\n' % offset)
      # addr2line -f answers with two lines: function name, then location.
      func = self.pipe.stdout.readline().rstrip()
      location = self.pipe.stdout.readline().rstrip()
    except Exception:
      func, location = '', ''
    return ['%s in %s %s' % (addr, func, fix_filename(location))]
133
134
class DarwinSymbolizer(Symbolizer):
  """Symbolizer that runs the 'atos' tool, one process per lookup.

  Only the binary given at construction time is handled; symbolize() returns
  None for any other binary.
  """

  def __init__(self, addr, binary):
    """Remember |binary| and guess the architecture from |addr|.

    Args:
        addr: an address string from the stack trace (e.g. '0x7fff...'),
            used only to guess the architecture from its length.
        binary: path of the binary this symbolizer handles.
    """
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    # Guess which arch we're running. 10 = len('0x') + 8 hex digits.
    if len(addr) > 10:
      self.arch = 'x86_64'
    else:
      self.arch = 'i386'
    self.vmaddr = None
    self.pipe = None

  def write_addr_to_pipe(self, offset):
    """Send |offset| (a hex string) to atos as a normalized '0x...' line."""
    self.pipe.stdin.write('0x%x\n' % int(offset, 16))

  def open_atos(self):
    """Spawn a fresh atos process for self.binary and store it in self.pipe."""
    if DEBUG:
      print('atos -o %s -arch %s' % (self.binary, self.arch))
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    self.pipe = subprocess.Popen(cmdline,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

  def _close_atos(self):
    """Close all pipes of the current atos process and reap it."""
    for stream in (self.pipe.stdin, self.pipe.stdout, self.pipe.stderr):
      stream.close()
    self.pipe.wait()

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None for a foreign binary; otherwise a one-element list holding
        either '<addr> in <function> <location>' (well-formed atos answer)
        or '<addr> in <raw atos line>'.
    """
    if self.binary != binary:
      return None
    self.open_atos()
    try:
      self.write_addr_to_pipe(offset)
      # Closing stdin signals end of input to atos.
      self.pipe.stdin.close()
      atos_line = self.pipe.stdout.readline().rstrip()
    finally:
      # A new process is spawned for every lookup, so release its pipes and
      # wait for it here instead of leaking descriptors and zombie children.
      self._close_atos()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    if DEBUG:
      print('atos_line:  %s' % atos_line)
    if match:
      function_name = match.group(1)
      # Strip the parenthesized argument list from the function name.
      function_name = re.sub(r'\(.*?\)', '', function_name)
      file_name = fix_filename(match.group(3))
      return ['%s in %s %s' % (addr, function_name, file_name)]
    else:
      return ['%s in %s' % (addr, atos_line)]
179
180
181# Chain several symbolizers so that if one symbolizer fails, we fall back
182# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries a list of symbolizers in order and returns the first hit."""

  def __init__(self, symbolizer_list):
    """Store |symbolizer_list|; entries may be None and are then skipped."""
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns the first truthy result produced by a member of the chain, or
    None when every member fails.
    """
    for candidate in self.symbolizer_list:
      if not candidate:
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add |symbolizer| as the last fallback of the chain."""
    self.symbolizer_list.append(symbolizer)
199
200
def BreakpadSymbolizerFactory(binary):
  """Create a BreakpadSymbolizer for |binary| if a symbol file exists.

  The symbol file name is |binary| plus the value of the BREAKPAD_SUFFIX
  environment variable.

  Returns:
      a BreakpadSymbolizer, or None when BREAKPAD_SUFFIX is unset/empty or
      the symbol file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  symbol_file = binary + suffix
  if not os.access(symbol_file, os.F_OK):
    return None
  return BreakpadSymbolizer(symbol_file)
208
209
def SystemSymbolizerFactory(system, addr, binary):
  """Create the platform's native symbolizer for |binary|.

  Args:
      system: operating system name; 'Linux' and 'Darwin' are recognized.
      addr: address string, passed to DarwinSymbolizer on Darwin.
      binary: path of the binary to symbolize.
  Returns:
      an Addr2LineSymbolizer on Linux, a DarwinSymbolizer on Darwin, and
      None for any other system.
  """
  if system == 'Linux':
    return Addr2LineSymbolizer(binary)
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary)
  return None
215
216
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that answers lookups from a Breakpad symbol file.

  The whole file is parsed in the constructor; lookups are then served from
  in-memory tables.
  """

  def __init__(self, filename):
    """Load and parse the symbol file |filename|."""
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    # Read through a context manager so the file is closed deterministically.
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    # Source file names, indexed by the number in their FILE record.
    self.files = []
    # Function/public symbol names keyed by start address.
    self.symbols = {}
    # Sorted start addresses of all line records (for bisecting).
    self.address_list = []
    # Line records keyed by start address.
    self.addresses = {}
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Parse the records following the MODULE line into the lookup tables.

    FILE, PUBLIC and FUNC records are stored, CFI/STACK records are ignored,
    and every other non-blank line is treated as a line record of the form
    '<address> <size> <line> <file number>' (address and size in hex).
    """
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Blank line: nothing to parse (indexing fragments[0] would fail).
        continue
      if fragments[0] == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif fragments[0] == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif fragments[0] in ['CFI', 'STACK']:
        pass
      elif fragments[0] == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # Keep a name that was already recorded for this address.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(fragments[0], 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Find the line record covering |addr| (an integer offset).

    Returns:
        (symbol name, file name, line number), or None when no line record
        covers |addr|.
    """
    key = None
    if addr in self.addresses:
      key = addr
    else:
      # Closest line record starting below |addr|.
      index = bisect.bisect_left(self.address_list, addr)
      if index == 0:
        return None
      else:
        key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    if addr < key + size:
      return symbol, filename, line_no
    else:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns a one-element list '<addr> in <function> <file>:<line>', or None
    when |binary| is not the module of this symbol file or the offset is not
    covered by any line record.
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if res:
      function_name, file_name, line_no = res
      result = ['%s in %s %s:%d' % (
          addr, function_name, file_name, line_no)]
      # Diagnostic dump only; printing it unconditionally would interleave
      # a list repr with the symbolized stack trace on stdout.
      if DEBUG:
        print(result)
      return result
    else:
      return None
289
290
class SymbolizationLoop(object):
  """Reads a sanitizer report from stdin and prints it with symbolized frames."""

  def __init__(self, binary_name_filter=None):
    """Set up the loop.

    Args:
        binary_name_filter: optional callable mapping the binary path found
            in a stack frame to the path that should be symbolized instead.
    Raises:
        Exception: if the operating system is neither Linux nor Darwin.
    """
    # Used by clients who may want to supply a different binary name.
    # E.g. in Chrome several binaries may share a single .dSYM.
    self.binary_name_filter = binary_name_filter
    self.system = os.uname()[0]
    if self.system in ['Linux', 'Darwin']:
      self.llvm_symbolizer = LLVMSymbolizerFactory(self.system)
    else:
      raise Exception('Unknown system')

  def symbolize_address(self, addr, binary, offset):
    """Return the list of symbolized frames for one stack trace entry."""
    # Use the chain of symbolizers:
    # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    # (fall back to next symbolizer if the previous one fails).
    if binary not in symbolizers:
      symbolizers[binary] = ChainSymbolizer(
          [BreakpadSymbolizerFactory(binary), self.llvm_symbolizer])
    result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def print_symbolized_lines(self, symbolized_lines):
    """Print the frames, or the unmodified current line if there are none.

    Each printed frame is numbered with self.frame_no, which is advanced by
    one per frame.
    """
    if not symbolized_lines:
      print(self.current_line)
    else:
      for symbolized_frame in symbolized_lines:
        print('    #' + str(self.frame_no) + ' ' + symbolized_frame.rstrip())
        self.frame_no += 1

  def process_stdin(self):
    """Copy stdin to stdout, replacing stack frame lines by symbolized ones."""
    self.frame_no = 0
    #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    # Compiled once, outside the per-line loop.
    stack_trace_line_format = re.compile(
        r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')
    for line in sys.stdin:
      self.current_line = line.rstrip()
      match = stack_trace_line_format.match(line)
      if not match:
        print(self.current_line)
        continue
      if DEBUG:
        print(line)
      _, frameno_str, addr, binary, offset = match.groups()
      if frameno_str == '0':
        # Assume that frame #0 is the first frame of new stack trace.
        self.frame_no = 0
      original_binary = binary
      if self.binary_name_filter:
        binary = self.binary_name_filter(binary)
      symbolized_line = self.symbolize_address(addr, binary, offset)
      if not symbolized_line:
        if original_binary != binary:
          # The filtered name gave nothing: retry with the name from the
          # report. (Retrying with the filtered name again, as before, could
          # never yield a different answer.) Only reachable when the assert
          # in symbolize_address is disabled, e.g. under -O.
          symbolized_line = self.symbolize_address(
              addr, original_binary, offset)
      self.print_symbolized_lines(symbolized_line)
352
353
if __name__ == '__main__':
  # Script entry point: symbolize the report arriving on stdin.
  SymbolizationLoop().process_stdin()
356  loop.process_stdin()
357