asan_symbolize.py revision b2546c44c828a546a98c091c714b71b1c9966673
1#!/usr/bin/env python
2#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4#                     The LLVM Compiler Infrastructure
5#
6# This file is distributed under the University of Illinois Open Source
7# License. See LICENSE.TXT for details.
8#
9#===------------------------------------------------------------------------===#
10import bisect
11import os
12import re
13import sys
14import subprocess
15
# Module-wide state shared by the classes and functions below.
DEBUG = False           # When true, tool command lines and inputs are echoed.
symbolizers = {}        # Per-binary symbolizer chains, keyed by binary path.
llvm_symbolizer = None  # Module-level placeholder; never assigned in this file.
filetypes = {}          # NOTE(review): not referenced in the visible code.
vmaddrs = {}            # NOTE(review): not referenced in the visible code.
21
22
23# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Shorten a source location string for display.

  Every command-line argument is treated as a regular expression for a path
  prefix: everything up to and including its match is cut. Afterwards
  locations inside asan_*.cc files collapse to "_asan_rtl_" and
  "crtstuff.c:0" becomes "???:0".

  Args:
    file_name: location string as reported by a symbolizer tool.
  Returns:
    The rewritten string.
  """
  # (pattern, replacement) pairs, applied strictly in this order.
  rewrites = [(".*" + path_to_cut, "") for path_to_cut in sys.argv[1:]]
  rewrites.append((".*asan_[a-z_]*.cc:[0-9]*", "_asan_rtl_"))
  rewrites.append((".*crtstuff.c:0", "???:0"))
  shortened = file_name
  for pattern, replacement in rewrites:
    shortened = re.sub(pattern, replacement, shortened)
  return shortened
30
31
class Symbolizer(object):
  """Interface shared by all symbolizers in this file."""

  def __init__(self):
    """The base class keeps no state."""

  def symbolize(self, addr, binary, offset):
    """Describe the code location of one instruction.

    Meant to be overridden in subclasses; this base version resolves nothing.

    Args:
      addr: virtual address of an instruction.
      binary: path to the executable/shared object containing the
        instruction.
      offset: offset of the instruction inside @binary.
    Returns:
      A list of strings, one per inlined frame, each describing a code
      location (function name, file name, line and column numbers), or None
      when nothing could be resolved. The base class always returns None.
    """
    return None
48
49
class LLVMSymbolizer(Symbolizer):
  """Symbolizer backed by one long-running llvm-symbolizer process.

  Each query is written to the child's stdin as a "<binary> <offset>" line
  and the answer is read back from its stdout.
  """

  def __init__(self, symbolizer_path):
    """Remember the tool location and try to start the tool right away.

    Args:
      symbolizer_path: path or bare command name of llvm-symbolizer.
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    # None when the tool could not be started; symbolize() then gives up.
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Start the llvm-symbolizer process.

    Returns:
      The subprocess.Popen object with piped stdin/stdout, or None when the
      process cannot be launched.
    """
    cmd = [self.symbolizer_path,
           "--use-symbol-table=true",
           "--demangle=false",
           "--functions=true",
           "--inlining=true"]
    if DEBUG:
      print(' '.join(cmd))
    # Let Popen resolve the command (including through PATH) rather than
    # testing os.path.exists(), which cannot find a bare command name.
    try:
      return subprocess.Popen(cmd, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE)
    except OSError:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
      A list with one "<addr> in <function> <file>" string per frame the tool
      reports, skipping frames whose function or file name starts with "??".
      None when the tool is not running, the exchange fails, or no usable
      frame was reported.
    """
    if not self.pipe:
      return None
    result = []
    try:
      self.pipe.stdin.write("%s %s\n" % (binary, offset))
      self.pipe.stdin.flush()
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          # An empty line terminates the answer (EOF reads as empty as well).
          break
        file_name = fix_filename(self.pipe.stdout.readline().rstrip())
        if (not function_name.startswith("??") and
            not file_name.startswith("??")):
          # Append only valid frames.
          result.append("%s in %s %s" % (addr, function_name, file_name))
    except Exception:
      # Deliberately best effort: a failed exchange counts as "no answer".
      result = []
    return result or None
92
93
def LLVMSymbolizerFactory(system):
  """Create the LLVM symbolizer for @system, if one is supported.

  Args:
    system: platform name, compared against "Linux".
  Returns:
    An LLVMSymbolizer on Linux, None on any other system.
  """
  if system != "Linux":
    return None
  # The environment variable wins; otherwise fall back to the bare tool name
  # on the assumption that llvm-symbolizer is in PATH.
  symbolizer_path = os.getenv("LLVM_SYMBOLIZER_PATH") or "llvm-symbolizer"
  return LLVMSymbolizer(symbolizer_path)
102
103
class Addr2LineSymbolizer(Symbolizer):
  """Resolves offsets of a single binary through an addr2line subprocess."""

  def __init__(self, binary):
    """Start addr2line for @binary, the only binary this object serves."""
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()

  def open_addr2line(self):
    """Launch `addr2line -f -e <binary>` with piped stdin and stdout."""
    argv = ["addr2line", "-f", "-e", self.binary]
    if DEBUG:
      print(' '.join(argv))
    return subprocess.Popen(argv,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
      None for a binary other than the one given at construction; otherwise
      a one-element list "<addr> in <function> <file>". Both names are empty
      when talking to addr2line fails.
    """
    if binary != self.binary:
      return None
    try:
      self.pipe.stdin.write(str(offset) + "\n")
      tool_output = self.pipe.stdout
      # addr2line -f answers with two lines: function name, then location.
      names = (tool_output.readline().rstrip(),
               tool_output.readline().rstrip())
    except Exception:
      names = ("", "")
    function_name, file_name = names
    return ["%s in %s %s" % (addr, function_name, fix_filename(file_name))]
128
129
class DarwinSymbolizer(Symbolizer):
  """Symbolizes addresses of one binary with the `atos` tool.

  `otool -l` is consulted to learn the vmaddr of the __TEXT segment, which is
  added to every offset before it is handed to atos.
  """

  def __init__(self, addr, binary):
    """Bind the symbolizer to @binary and guess the architecture from @addr.

    Args:
      addr: textual address ("0x..."); its length selects the architecture.
      binary: path of the binary this object serves.
    """
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    # Guess which arch we're running. 10 = len("0x") + 8 hex digits.
    if len(addr) > 10:
      self.arch = "x86_64"
    else:
      self.arch = "i386"
    self.vmaddr = None  # Filled lazily by get_binary_vmaddr().
    self.pipe = None    # Most recent atos process, set by open_atos().

  def get_binary_vmaddr(self):
    """
    Get the slide value to be added to the address.
    We're looking for the following piece in otool -l output:
      Load command 0
      cmd LC_SEGMENT
      cmdsize 736
      segname __TEXT
      vmaddr 0x00000000
    Returns 0 when no __TEXT vmaddr is found. The value is cached.
    """
    # Compare against None: 0 is a legitimate cached value.
    if self.vmaddr is not None:
      return self.vmaddr
    cmdline = ["otool", "-l", self.binary]
    pipe = subprocess.Popen(cmdline,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    # communicate() closes stdin, drains stdout and reaps the child.
    output = pipe.communicate()[0]
    is_text = False
    vmaddr = 0
    for line in output.splitlines():
      line = line.strip()
      if line.startswith('segname'):
        is_text = (line == 'segname __TEXT')
        continue
      if line.startswith('vmaddr') and is_text:
        vmaddr = int(line.split(' ')[-1], 16)
        break
    self.vmaddr = vmaddr
    return self.vmaddr

  def write_addr_to_pipe(self, offset):
    """Send @offset (a hex string), shifted by the vmaddr, to atos."""
    slide = self.get_binary_vmaddr()
    self.pipe.stdin.write("0x%x\n" % (int(offset, 16) + slide))

  def open_atos(self):
    """Start a fresh `atos` process for this binary and architecture."""
    if DEBUG:
      print("atos -o %s -arch %s" % (self.binary, self.arch))
    cmdline = ["atos", "-o", self.binary, "-arch", self.arch]
    self.pipe = subprocess.Popen(cmdline,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
      None for a binary other than the one given at construction; otherwise
      a one-element list. A well-formed atos answer yields
      "<addr> in <function> <file>", anything else "<addr> in <raw answer>".
    """
    if self.binary != binary:
      return None
    self.open_atos()
    try:
      self.write_addr_to_pipe(offset)
      self.pipe.stdin.close()
      atos_line = self.pipe.stdout.readline().rstrip()
    finally:
      # One atos process is started per query: release its pipes and reap
      # it so that file descriptors and zombie processes do not pile up.
      self.pipe.stdout.close()
      self.pipe.stderr.close()
      self.pipe.wait()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    if DEBUG:
      print("%s %s" % ("atos_line: ", atos_line))
    if match:
      # Drop the parenthesized argument list from the function name.
      function_name = re.sub(r"\(.*?\)", "", match.group(1))
      file_name = fix_filename(match.group(3))
      return ["%s in %s %s" % (addr, function_name, file_name)]
    else:
      return ["%s in %s" % (addr, atos_line)]
201
202
203# Chain several symbolizers so that if one symbolizer fails, we fall back
204# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries a list of symbolizers in order and returns the first answer."""

  def __init__(self, symbolizer_list):
    """Keep @symbolizer_list (entries may be None) as the lookup order."""
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
      The first truthy result produced by a member of the chain, or None
      when every member came up empty.
    """
    for candidate in self.symbolizer_list:
      if not candidate:
        # Empty slot, e.g. a factory that returned None.
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add @symbolizer at the end of the chain (lowest priority)."""
    self.symbolizer_list.append(symbolizer)
219
220
def BreakpadSymbolizerFactory(addr, binary):
  """Create a BreakpadSymbolizer for @binary when a symbol file exists.

  The symbol file name is @binary followed by the value of the
  BREAKPAD_SUFFIX environment variable.

  Args:
    addr: not used here.
    binary: path of the binary to be symbolized.
  Returns:
    A BreakpadSymbolizer, or None when the variable is unset/empty or the
    symbol file does not exist.
  """
  suffix = os.getenv("BREAKPAD_SUFFIX")
  if not suffix:
    return None
  filename = binary + suffix
  if not os.access(filename, os.F_OK):
    return None
  return BreakpadSymbolizer(filename)
228
229
def SystemSymbolizerFactory(system, addr, binary):
  """Create the platform's native symbolizer for @binary.

  Args:
    system: platform name; 'Linux' and 'Darwin' are recognized.
    addr: address string, forwarded to DarwinSymbolizer.
    binary: path of the binary to be symbolized.
  Returns:
    An Addr2LineSymbolizer on Linux, a DarwinSymbolizer on Darwin, None for
    any other system.
  """
  if system == 'Linux':
    return Addr2LineSymbolizer(binary)
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary)
  return None
235
236
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that answers from a parsed Breakpad symbol file."""

  def __init__(self, filename):
    """Load and index the symbol file @filename.

    The first line is the module header; the remaining lines are handed to
    parse_lines().
    """
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    symbol_file = open(filename)
    try:
      lines = symbol_file.readlines()
    finally:
      # Do not leak the file handle.
      symbol_file.close()
    self.files = []         # File names, indexed by FILE record number.
    self.symbols = {}       # Function/public symbol address -> name.
    self.address_list = []  # Sorted start addresses of all line records.
    self.addresses = {}     # Line record address -> (symbol address, size,
                            #                         line, file number).
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Index the records that follow the module header.

    FILE, PUBLIC and FUNC records fill self.files and self.symbols, CFI and
    STACK records are ignored, and any other non-blank line is read as a
    line record "<hex address> <hex size> <line> <file number>" belonging to
    the most recent FUNC record.
    """
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Blank line: nothing to index.
        continue
      record = fragments[0]
      if record == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif record == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif record in ('CFI', 'STACK'):
        pass
      elif record == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A name registered earlier for this address is kept.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(record, 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Find the line record covering the integer address @addr.

    Returns:
      A (symbol name, file name, line number) tuple, or None when no line
      record covers @addr.
    """
    if addr in self.addresses:
      key = addr
    else:
      # Closest line record that starts below addr, if there is one.
      index = bisect.bisect_left(self.address_list, addr)
      if index == 0:
        return None
      key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    if addr >= key + size:
      # addr lies past the end of the closest record.
      return None
    return self.symbols[sym_id], self.files[file_no], line_no

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
      None when @binary is not the module named in the symbol file or the
      offset is not covered; otherwise a one-element list
      "<addr> in <function> <file>:<line>".
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if not res:
      return None
    function_name, file_name, line_no = res
    result = ["%s in %s %s:%d" % (
        addr, function_name, file_name, line_no)]
    if DEBUG:
      # Diagnostic output only; it must not leak into the normal report.
      print(result)
    return result
306
307
class SymbolizationLoop(object):
  """Copies a report from stdin to stdout, symbolizing stack frame lines."""

  def __init__(self):
    """Detect the platform and prepare the LLVM symbolizer for it.

    Raises:
      Exception: when the system is neither Linux nor Darwin.
    """
    self.system = os.uname()[0]
    if self.system in ['Linux', 'Darwin']:
      self.llvm_symbolizer = LLVMSymbolizerFactory(self.system)
    else:
      raise Exception("Unknown system")

  def symbolize_address(self, addr, binary, offset):
    """Symbolize one frame.

    Args:
      addr: address string of the frame, as printed in the report.
      binary: path of the module containing the address.
      offset: hex offset of the address inside @binary.
    Returns:
      A list of strings, one per (possibly inlined) frame.
    """
    # Use the chain of symbolizers:
    # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    # (fall back to next symbolizer if the previous one fails).
    if binary not in symbolizers:
      # Use the instance's symbolizer created in __init__; the module-level
      # llvm_symbolizer is never assigned and would leave this slot empty.
      symbolizers[binary] = ChainSymbolizer(
          [BreakpadSymbolizerFactory(addr, binary), self.llvm_symbolizer])
    result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def loop(self):
    """Process stdin line by line until EOF.

    Lines that are not stack frames are echoed unchanged (trailing
    whitespace stripped). Frame lines are replaced by one
    "    #<n> <frame>" line per symbolized frame.
    """
    #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    stack_trace_line_format = re.compile(
        r"^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)")
    frame_no = 0
    for line in sys.stdin:
      match = stack_trace_line_format.match(line)
      if not match:
        print(line.rstrip())
        continue
      if DEBUG:
        print(line)
      _, frameno_str, addr, binary, offset = match.groups()
      if frameno_str == "0":
        # Assume that frame #0 is the first frame of new stack trace.
        frame_no = 0
      symbolized_line = self.symbolize_address(addr, binary, offset)
      if not symbolized_line:
        print(line.rstrip())
      else:
        # Inlined frames get consecutive numbers of their own, so frames
        # are renumbered instead of reusing the number from the input.
        for symbolized_frame in symbolized_line:
          print("    #" + str(frame_no) + " " + symbolized_frame.rstrip())
          frame_no += 1
353          frame_no += 1
354
355
if __name__ == "__main__":
  # Executed as a script: symbolize the report arriving on stdin.
  loop = SymbolizationLoop()
  loop.loop()
359