asan_symbolize.py revision 444a185d855bccf806f12572d3e8a01eee7c09bf
#!/usr/bin/env python
#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
#
#                     The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
#
#===------------------------------------------------------------------------===#
import bisect
import os
import re
import subprocess
import sys
1586d0ba4286ec42dcd055bd22434c7f40009834bdBlaine Garst
# Module-level state.
# NOTE(review): llvm_symbolizer, filetypes and vmaddrs are not referenced by
# any code visible in this file -- presumably kept for external users; confirm
# before removing.
llvm_symbolizer = None
# Cache of ChainSymbolizer objects, keyed by binary path (filled in by
# SymbolizationLoop.symbolize_address).
symbolizers = {}
filetypes = {}
vmaddrs = {}
# When True, spawned command lines and raw symbolizer traffic are echoed to
# stdout.
DEBUG = False
2186d0ba4286ec42dcd055bd22434c7f40009834bdBlaine Garst
2286d0ba4286ec42dcd055bd22434c7f40009834bdBlaine Garst
# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Shorten a 'file:line' location reported by a symbolizer.

  Every command-line argument (sys.argv[1:]) is treated as a literal path
  fragment: everything up to and including its last occurrence in file_name
  is dropped.  Afterwards locations inside asan_*.cc are collapsed to
  '_asan_rtl_' and 'crtstuff.c:0' is replaced by '???:0'.

  Args:
      file_name: location string such as '/src/foo.cc:12'.
  Returns:
      the shortened location string.
  """
  for path_to_cut in sys.argv[1:]:
    # The argument is a path, not a pattern: escape it so that regex
    # metacharacters ('.', '+', '(' ...) match literally instead of changing
    # the match or raising re.error (e.g. for 'c++').
    file_name = re.sub('.*' + re.escape(path_to_cut), '', file_name)
  file_name = re.sub(r'.*asan_[a-z_]*\.cc:[0-9]*', '_asan_rtl_', file_name)
  file_name = re.sub(r'.*crtstuff\.c:0', '???:0', file_name)
  return file_name
3086d0ba4286ec42dcd055bd22434c7f40009834bdBlaine Garst
3186d0ba4286ec42dcd055bd22434c7f40009834bdBlaine Garst
class Symbolizer(object):
  """Base class of all symbolizers; symbolizes nothing by itself."""

  def __init__(self):
    pass

  def symbolize(self, addr, binary, offset):
    """Symbolize the given address (pair of binary and offset).

    Overridden in subclasses.
    Args:
        addr: virtual address of an instruction.
        binary: path to executable/shared object containing this instruction.
        offset: instruction offset in the @binary.
    Returns:
        list of strings (one string for each inlined frame) describing
        the code locations for this instruction (that is, function name, file
        name, line and column numbers).  The base implementation always
        returns None.
    """
    return None
5086d0ba4286ec42dcd055bd22434c7f40009834bdBlaine Garst
5186d0ba4286ec42dcd055bd22434c7f40009834bdBlaine Garst
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that talks to one long-running llvm-symbolizer process."""

  def __init__(self, symbolizer_path):
    """Start the symbolizer found at symbolizer_path (path or bare name)."""
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    # None when the program could not be started; symbolize() then declines.
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Spawn llvm-symbolizer.

    Returns:
        the subprocess.Popen object, or None if the program cannot be
        executed.
    """
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=false',
           '--functions=true',
           '--inlining=true']
    if DEBUG:
      print(' '.join(cmd))
    # Do not test os.path.exists() first: a bare name such as
    # 'llvm-symbolizer' is resolved through PATH by Popen, whereas
    # os.path.exists() would only look in the current directory.
    try:
      # universal_newlines=True gives text-mode pipes on Python 3.
      return subprocess.Popen(cmd, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              universal_newlines=True)
    except OSError:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        list of '<addr> in <function> <file>' strings, one per frame reported
        by llvm-symbolizer, or None if there is no symbolizer process, the
        exchange failed, or no frame was fully resolved.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '%s %s' % (binary, offset)
      if DEBUG:
        print(symbolizer_input)
      self.pipe.stdin.write(symbolizer_input + '\n')
      # Pipes are buffered on Python 3; push the request out before reading.
      self.pipe.stdin.flush()
      # The reply is read as (function, location) line pairs and ends at the
      # first empty line (or EOF).
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') and
            not file_name.startswith('??')):
          # Append only valid frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception:
      # Deliberately best-effort: any failure means "no answer" so that the
      # caller can fall back to another symbolizer.
      result = []
    if not result:
      result = None
    return result
97
98
def LLVMSymbolizerFactory(system):
  """Create an LLVMSymbolizer.

  The program is taken from the LLVM_SYMBOLIZER_PATH environment variable; if
  that is unset or empty, 'llvm-symbolizer' is used.

  Args:
      system: unused by this function.
  Returns:
      a new LLVMSymbolizer.
  """
  symbolizer_path = os.getenv('LLVM_SYMBOLIZER_PATH')
  if not symbolizer_path:
    # Assume llvm-symbolizer is in PATH.
    symbolizer_path = 'llvm-symbolizer'
  return LLVMSymbolizer(symbolizer_path)
104  return LLVMSymbolizer(symbolizer_path)
105
106
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer that queries one 'addr2line -f -e <binary>' process."""

  def __init__(self, binary):
    """Start addr2line for the given binary."""
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()

  def open_addr2line(self):
    """Spawn addr2line for self.binary and return the Popen object."""
    cmd = ['addr2line', '-f', '-e', self.binary]
    if DEBUG:
      print(' '.join(cmd))
    # universal_newlines=True gives text-mode pipes on Python 3.
    return subprocess.Popen(cmd,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            universal_newlines=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if binary is not the one this object was created for; otherwise
        a one-element list '<addr> in <function> <file>'.  If talking to
        addr2line fails, function and file are left empty.
    """
    if self.binary != binary:
      return None
    try:
      self.pipe.stdin.write('%s\n' % offset)
      # Pipes are buffered on Python 3; push the request out before reading.
      self.pipe.stdin.flush()
      function_name = self.pipe.stdout.readline().rstrip()
      file_name = self.pipe.stdout.readline().rstrip()
    except Exception:
      # Best-effort: still return a frame, just without names.
      function_name = ''
      file_name = ''
    file_name = fix_filename(file_name)
    return ['%s in %s %s' % (addr, function_name, file_name)]
132    return ['%s in %s %s' % (addr, function_name, file_name)]
133
134
class DarwinSymbolizer(Symbolizer):
  """Symbolizer that runs a fresh 'atos' process for every address."""

  def __init__(self, addr, binary):
    """Remember the binary and guess the architecture from addr's width."""
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    # Guess which arch we're running. 10 = len('0x') + 8 hex digits.
    if len(addr) > 10:
      self.arch = 'x86_64'
    else:
      self.arch = 'i386'
    self.vmaddr = None
    self.pipe = None

  def write_addr_to_pipe(self, offset):
    """Send offset (a hex string) to atos, normalized to '0x...' form."""
    self.pipe.stdin.write('0x%x\n' % int(offset, 16))
    self.pipe.stdin.flush()

  def open_atos(self):
    """Spawn atos for self.binary/self.arch and store it in self.pipe."""
    if DEBUG:
      print('atos -o %s -arch %s' % (self.binary, self.arch))
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    # universal_newlines=True gives text-mode pipes on Python 3.
    self.pipe = subprocess.Popen(cmdline,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True)

  def _close_atos(self):
    """Close all pipes of the current atos process and reap it."""
    for stream in (self.pipe.stdin, self.pipe.stdout, self.pipe.stderr):
      try:
        stream.close()
      except (IOError, OSError):
        # E.g. a broken pipe while flushing stdin; the process is being
        # discarded anyway.
        pass
    self.pipe.wait()

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if binary is not the one this object was created for; otherwise
        a one-element list: '<addr> in <function> <file>' when the atos reply
        is well-formed, else '<addr> in <raw atos reply>'.
    """
    if self.binary != binary:
      return None
    self.open_atos()
    try:
      self.write_addr_to_pipe(offset)
      self.pipe.stdin.close()
      atos_line = self.pipe.stdout.readline().rstrip()
    finally:
      # A new process is started on every call; without this each call would
      # leave behind an unreaped child and three open pipe descriptors.
      self._close_atos()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    if DEBUG:
      print('atos_line:  %s' % atos_line)
    if match:
      function_name = match.group(1)
      # Drop the parenthesized argument list(s) from the function name.
      function_name = re.sub(r'\(.*?\)', '', function_name)
      file_name = fix_filename(match.group(3))
      return ['%s in %s %s' % (addr, function_name, file_name)]
    else:
      return ['%s in %s' % (addr, atos_line)]
178      return ['%s in %s' % (addr, atos_line)]
179
180
# Chain several symbolizers so that if one symbolizer fails, we fall back
# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries a list of symbolizers in order and returns the first answer."""

  def __init__(self, symbolizer_list):
    """Args:
        symbolizer_list: list of symbolizers; falsy entries (e.g. None) are
            skipped.  The list is stored, not copied, and is extended in
            place by append_symbolizer().
    """
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        the first non-empty result produced by a symbolizer in the chain, or
        None if none of them produced one.
    """
    for symbolizer in self.symbolizer_list:
      if not symbolizer:
        continue
      result = symbolizer.symbolize(addr, binary, offset)
      if result:
        return result
    return None

  def append_symbolizer(self, symbolizer):
    """Add symbolizer as the last fallback of the chain."""
    self.symbolizer_list.append(symbolizer)
198    self.symbolizer_list.append(symbolizer)
199
200
def BreakpadSymbolizerFactory(binary):
  """Create a BreakpadSymbolizer for binary if a symbol file is available.

  The symbol file name is binary plus the value of the BREAKPAD_SUFFIX
  environment variable.

  Args:
      binary: path to the executable/shared object.
  Returns:
      a BreakpadSymbolizer, or None if BREAKPAD_SUFFIX is unset or empty or
      the symbol file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  filename = binary + suffix
  if not os.access(filename, os.F_OK):
    return None
  return BreakpadSymbolizer(filename)
207  return None
208
209
def SystemSymbolizerFactory(system, addr, binary):
  """Create the platform's native symbolizer for binary.

  Args:
      system: system name; 'Darwin' and 'Linux' are recognized.
      addr: address string, passed to DarwinSymbolizer (which uses it to
          guess the architecture).
      binary: path to the executable/shared object.
  Returns:
      a DarwinSymbolizer or Addr2LineSymbolizer, or None for any other
      system.
  """
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary)
  elif system == 'Linux':
    return Addr2LineSymbolizer(binary)
  return None
214    return Addr2LineSymbolizer(binary)
215
216
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that answers from a Breakpad text symbol file."""

  def __init__(self, filename):
    """Load and index the symbol file.

    Args:
        filename: path to the symbol file.  Its first line must be a MODULE
            record; the remaining lines are handed to parse_lines().
    """
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    # FILE names, indexed by file number.
    self.files = []
    # Symbol address -> symbol name.
    self.symbols = {}
    # Sorted start addresses of all line records.
    self.address_list = []
    # Line-record start address -> (symbol address, size, line, file number).
    self.addresses = {}
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Index FILE, PUBLIC, FUNC and line records; CFI/STACK are ignored."""
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Blank line: nothing to index.
        continue
      if fragments[0] == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif fragments[0] == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif fragments[0] in ['CFI', 'STACK']:
        pass
      elif fragments[0] == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A name already recorded for this address takes precedence.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(fragments[0], 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Look up the line record covering addr.

    Args:
        addr: integer offset inside the binary.
    Returns:
        (symbol name, file name, line number), or None if no line record
        covers addr.
    """
    if addr in self.addresses:
      key = addr
    else:
      # Closest record that starts below addr.
      index = bisect.bisect_left(self.address_list, addr)
      if index == 0:
        return None
      key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    if addr < key + size:
      return symbol, filename, line_no
    else:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        a one-element list '<addr> in <function> <file>:<line>', or None if
        binary differs from the MODULE name of the symbol file or the offset
        is not covered by it.
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if res:
      function_name, file_name, line_no = res
      result = ['%s in %s %s:%d' % (
          addr, function_name, file_name, line_no)]
      # Diagnostic only: the caller prints the frame itself, so echoing the
      # list unconditionally would duplicate it in the output.
      if DEBUG:
        print(result)
      return result
    else:
      return None
288      return None
289
290
class SymbolizationLoop(object):
  """Reads a report from stdin and prints it with stack frames symbolized."""

  def __init__(self, binary_name_filter=None):
    """Args:
        binary_name_filter: optional callable mapping the binary name found
            in a frame to the name that should be symbolized instead.
    Raises:
        Exception: if the system is neither Linux nor Darwin.
    """
    # Used by clients who may want to supply a different binary name.
    # E.g. in Chrome several binaries may share a single .dSYM.
    self.binary_name_filter = binary_name_filter
    self.system = os.uname()[0]
    if self.system in ['Linux', 'Darwin']:
      self.llvm_symbolizer = LLVMSymbolizerFactory(self.system)
    else:
      raise Exception('Unknown system')

  def symbolize_address(self, addr, binary, offset):
    """Symbolize one frame; returns a non-empty list of frame strings."""
    # Use the chain of symbolizers:
    # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    # (fall back to next symbolizer if the previous one fails).
    if binary not in symbolizers:
      symbolizers[binary] = ChainSymbolizer(
          [BreakpadSymbolizerFactory(binary), self.llvm_symbolizer])
    result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def print_symbolized_lines(self, symbolized_lines):
    """Print the frames, renumbered from self.frame_no.

    If symbolized_lines is empty or None, the current input line is printed
    unchanged instead.
    """
    if not symbolized_lines:
      print(self.current_line)
    else:
      for symbolized_frame in symbolized_lines:
        print('    #' + str(self.frame_no) + ' ' + symbolized_frame.rstrip())
        self.frame_no += 1

  def process_stdin(self):
    """Copy stdin to stdout, replacing stack frame lines by symbolized ones."""
    self.frame_no = 0
    # Compiled once, outside the per-line loop.  A frame looks like:
    #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    stack_trace_line_format = re.compile(
        r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')
    for line in sys.stdin:
      self.current_line = line.rstrip()
      match = stack_trace_line_format.match(line)
      if not match:
        print(self.current_line)
        continue
      if DEBUG:
        print(line)
      _, frameno_str, addr, binary, offset = match.groups()
      if frameno_str == '0':
        # Assume that frame #0 is the first frame of new stack trace.
        self.frame_no = 0
      original_binary = binary
      if self.binary_name_filter:
        binary = self.binary_name_filter(binary)
      symbolized_line = self.symbolize_address(addr, binary, offset)
      if not symbolized_line:
        if original_binary != binary:
          # The filtered name gave nothing: fall back to the name that
          # actually appeared in the report (retrying with the filtered name
          # would only repeat the call that just failed).
          symbolized_line = self.symbolize_address(addr, original_binary,
                                                   offset)
      self.print_symbolized_lines(symbolized_line)
351      self.print_symbolized_lines(symbolized_line)
352
353
if __name__ == '__main__':
  # Script entry point: symbolize the report arriving on stdin.
  loop = SymbolizationLoop()
  loop.process_stdin()
356  loop.process_stdin()
357