#!/usr/bin/env python
#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
#
#                     The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
#
#===------------------------------------------------------------------------===#
# asan_symbolize.py revision 444a185d855bccf806f12572d3e8a01eee7c09bf
"""Symbolize stack-trace lines read from stdin.

Every stdin line of the form ``#N 0xADDR (binary+0xOFFSET)`` is replaced by
one output line per symbolized frame; every other line is echoed unchanged.
Each command-line argument is treated as a regular expression describing a
path prefix that is cut from reported file names (see fix_filename()).
"""
import bisect
import os
import re
import subprocess
import sys

llvm_symbolizer = None
symbolizers = {}
filetypes = {}
vmaddrs = {}
DEBUG = False

# Matches e.g. '#0 0x7f6e35cf2e45 (/blah/foo.so+0x11fe45)'.
_STACK_TRACE_LINE_RE = re.compile(
    r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')

# A well-formed atos response looks like this:
#   foo(type1, type2) (in object.name) (filename.cc:80)
_ATOS_LINE_RE = re.compile(r'^(.*) \(in (.*)\) \((.*:\d*)\)$')


def _write_line(stream, text):
    """Write `text` plus a newline to `stream` and flush it immediately.

    The explicit flush makes sure the child process sees the request even
    when the pipe is buffered.
    """
    stream.write('%s\n' % text)
    stream.flush()


# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
    """Shorten a file location reported by a symbolizer.

    Args:
        file_name: location string, typically 'path/to/file.cc:line'.
    Returns:
        `file_name` with every prefix matching one of the command-line
        arguments removed, ASan runtime locations collapsed to '_asan_rtl_'
        and 'crtstuff.c:0' replaced by '???:0'.
    """
    # Command-line arguments are used as regular expressions, unescaped.
    for path_to_cut in sys.argv[1:]:
        file_name = re.sub('.*' + path_to_cut, '', file_name)
    file_name = re.sub('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_', file_name)
    file_name = re.sub('.*crtstuff.c:0', '???:0', file_name)
    return file_name


class Symbolizer(object):
    """Base class of all symbolizers; symbolizes nothing by itself."""

    def __init__(self):
        pass

    def symbolize(self, addr, binary, offset):
        """Symbolize the given address (pair of binary and offset).

        Overridden in subclasses.
        Args:
            addr: virtual address of an instruction.
            binary: path to executable/shared object containing this
                instruction.
            offset: instruction offset in the @binary.
        Returns:
            list of strings (one string for each inlined frame) describing
            the code locations for this instruction (that is, function name,
            file name, line and column numbers), or None on failure.
        """
        return None


class LLVMSymbolizer(Symbolizer):
    """Symbolizer backed by a long-running llvm-symbolizer process."""

    def __init__(self, symbolizer_path):
        super(LLVMSymbolizer, self).__init__()
        self.symbolizer_path = symbolizer_path
        self.pipe = self.open_llvm_symbolizer()

    def open_llvm_symbolizer(self):
        """Start llvm-symbolizer.

        Returns:
            the subprocess.Popen object, or None when the program cannot be
            started (for example because it does not exist).
        """
        cmd = [self.symbolizer_path,
               '--use-symbol-table=true',
               '--demangle=false',
               '--functions=true',
               '--inlining=true']
        if DEBUG:
            print(' '.join(cmd))
        try:
            # Let Popen resolve the program: a bare command name such as
            # 'llvm-symbolizer' has to be looked up in PATH, which a plain
            # os.path.exists() check would wrongly reject.
            return subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    universal_newlines=True)
        except OSError:
            return None

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize."""
        if not self.pipe:
            return None
        result = []
        try:
            symbolizer_input = '%s %s' % (binary, offset)
            if DEBUG:
                print(symbolizer_input)
            _write_line(self.pipe.stdin, symbolizer_input)
            # The answer is read as (function name, file location) line
            # pairs until an empty line is seen.
            while True:
                function_name = self.pipe.stdout.readline().rstrip()
                if not function_name:
                    break
                file_name = self.pipe.stdout.readline().rstrip()
                file_name = fix_filename(file_name)
                if (not function_name.startswith('??') and
                        not file_name.startswith('??')):
                    # Append only valid frames.
                    result.append('%s in %s %s' % (addr, function_name,
                                                   file_name))
        except Exception:
            # Deliberately broad: any failure here just means that the next
            # symbolizer in the chain should be tried.
            result = []
        if not result:
            result = None
        return result


def LLVMSymbolizerFactory(system):
    """Create an LLVMSymbolizer.

    The program path is taken from the LLVM_SYMBOLIZER_PATH environment
    variable; `system` is currently unused.
    """
    symbolizer_path = os.getenv('LLVM_SYMBOLIZER_PATH')
    if not symbolizer_path:
        # Assume llvm-symbolizer is in PATH.
        symbolizer_path = 'llvm-symbolizer'
    return LLVMSymbolizer(symbolizer_path)


class Addr2LineSymbolizer(Symbolizer):
    """Symbolizer backed by an addr2line process bound to one binary."""

    def __init__(self, binary):
        super(Addr2LineSymbolizer, self).__init__()
        self.binary = binary
        self.pipe = self.open_addr2line()

    def open_addr2line(self):
        """Start 'addr2line -f -e <binary>' and return the Popen object."""
        cmd = ['addr2line', '-f', '-e', self.binary]
        if DEBUG:
            print(' '.join(cmd))
        return subprocess.Popen(cmd,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                universal_newlines=True)

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize.

        Always returns a one-element list for the binary this symbolizer was
        created for; function and file names are empty if the pipe fails.
        """
        if self.binary != binary:
            return None
        try:
            _write_line(self.pipe.stdin, offset)
            function_name = self.pipe.stdout.readline().rstrip()
            file_name = self.pipe.stdout.readline().rstrip()
        except Exception:
            # Best effort: still report the frame, just without names.
            function_name = ''
            file_name = ''
        file_name = fix_filename(file_name)
        return ['%s in %s %s' % (addr, function_name, file_name)]


class DarwinSymbolizer(Symbolizer):
    """Symbolizer that runs one atos process per symbolized address."""

    def __init__(self, addr, binary):
        super(DarwinSymbolizer, self).__init__()
        self.binary = binary
        # Guess which arch we're running. 10 = len('0x') + 8 hex digits.
        if len(addr) > 10:
            self.arch = 'x86_64'
        else:
            self.arch = 'i386'
        self.vmaddr = None
        self.pipe = None

    def write_addr_to_pipe(self, offset):
        """Send `offset` (a hex string) to atos as a '0x...' line."""
        _write_line(self.pipe.stdin, '0x%x' % int(offset, 16))

    def open_atos(self):
        """Start atos for self.binary/self.arch and store it in self.pipe."""
        if DEBUG:
            print('atos -o %s -arch %s' % (self.binary, self.arch))
        cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
        self.pipe = subprocess.Popen(cmdline,
                                     stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     universal_newlines=True)

    def _close_atos(self):
        """Close the pipes of the current atos process and reap it."""
        pipe, self.pipe = self.pipe, None
        if pipe is None:
            return
        for stream in (pipe.stdin, pipe.stdout, pipe.stderr):
            try:
                stream.close()
            except (IOError, OSError):
                # The child may already be gone; nothing left to release.
                pass
        pipe.wait()

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize.

        Always returns a one-element list for the binary this symbolizer was
        created for; when the atos answer is not well-formed the raw answer
        is reported instead of function and file names.
        """
        if self.binary != binary:
            return None
        self.open_atos()
        try:
            self.write_addr_to_pipe(offset)
            # Closing stdin tells atos that no further addresses follow.
            self.pipe.stdin.close()
            atos_line = self.pipe.stdout.readline().rstrip()
        finally:
            # A new process is started for every address, so the finished
            # one must be released to avoid leaking descriptors and zombies.
            self._close_atos()
        match = _ATOS_LINE_RE.match(atos_line)
        if DEBUG:
            print('atos_line:  ' + atos_line)
        if match:
            function_name = match.group(1)
            # Drop the argument list from the function name.
            function_name = re.sub(r'\(.*?\)', '', function_name)
            file_name = fix_filename(match.group(3))
            return ['%s in %s %s' % (addr, function_name, file_name)]
        else:
            return ['%s in %s' % (addr, atos_line)]


# Chain several symbolizers so that if one symbolizer fails, we fall back
# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
    """Tries the given symbolizers in order until one yields a result."""

    def __init__(self, symbolizer_list):
        super(ChainSymbolizer, self).__init__()
        self.symbolizer_list = symbolizer_list

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize."""
        for symbolizer in self.symbolizer_list:
            # Entries may be None (e.g. no Breakpad symbol file available).
            if symbolizer:
                result = symbolizer.symbolize(addr, binary, offset)
                if result:
                    return result
        return None

    def append_symbolizer(self, symbolizer):
        """Add `symbolizer` as the last fallback of the chain."""
        self.symbolizer_list.append(symbolizer)


def BreakpadSymbolizerFactory(binary):
    """Return a BreakpadSymbolizer for `binary`, or None.

    The symbol file name is `binary` followed by the value of the
    BREAKPAD_SUFFIX environment variable; None is returned when the variable
    is not set or the file does not exist.
    """
    suffix = os.getenv('BREAKPAD_SUFFIX')
    if suffix:
        filename = binary + suffix
        if os.access(filename, os.F_OK):
            return BreakpadSymbolizer(filename)
    return None


def SystemSymbolizerFactory(system, addr, binary):
    """Return the native symbolizer for `system` ('Darwin' or 'Linux').

    Returns None for any other system name.
    """
    if system == 'Darwin':
        return DarwinSymbolizer(addr, binary)
    elif system == 'Linux':
        return Addr2LineSymbolizer(binary)


class BreakpadSymbolizer(Symbolizer):
    """Symbolizer that looks addresses up in a Breakpad symbol file."""

    def __init__(self, filename):
        super(BreakpadSymbolizer, self).__init__()
        self.filename = filename
        with open(filename) as symbol_file:
            lines = symbol_file.readlines()
        self.files = []          # File names, indexed by file number.
        self.symbols = {}        # Symbol address -> symbol name.
        self.address_list = []   # Sorted start addresses of line records.
        self.addresses = {}      # Start address -> record tuple.
        # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
        fragments = lines[0].rstrip().split()
        self.arch = fragments[2]
        self.debug_id = fragments[3]
        self.binary = ' '.join(fragments[4:])
        self.parse_lines(lines[1:])

    def parse_lines(self, lines):
        """Fill the lookup tables from the records following MODULE."""
        cur_function_addr = ''
        for line in lines:
            fragments = line.split()
            if not fragments:
                # Tolerate blank lines instead of failing on fragments[0].
                continue
            if fragments[0] == 'FILE':
                assert int(fragments[1]) == len(self.files)
                self.files.append(' '.join(fragments[2:]))
            elif fragments[0] == 'PUBLIC':
                self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
            elif fragments[0] in ['CFI', 'STACK']:
                pass
            elif fragments[0] == 'FUNC':
                cur_function_addr = int(fragments[1], 16)
                # A name recorded earlier for this address is kept.
                if cur_function_addr not in self.symbols:
                    self.symbols[cur_function_addr] = ' '.join(fragments[4:])
            else:
                # Line starting with an address.
                addr = int(fragments[0], 16)
                self.address_list.append(addr)
                # Tuple of symbol address, size, line, file number.
                self.addresses[addr] = (cur_function_addr,
                                        int(fragments[1], 16),
                                        int(fragments[2]),
                                        int(fragments[3]))
        self.address_list.sort()

    def get_sym_file_line(self, addr):
        """Look up the line record covering `addr` (an int).

        Returns:
            a (symbol name, file name, line number) tuple, or None when no
            record covers the address.
        """
        if addr in self.addresses:
            key = addr
        else:
            # Closest record starting before addr, if there is one.
            index = bisect.bisect_left(self.address_list, addr)
            if index == 0:
                return None
            key = self.address_list[index - 1]
        sym_id, size, line_no, file_no = self.addresses[key]
        symbol = self.symbols[sym_id]
        filename = self.files[file_no]
        if addr < key + size:
            return symbol, filename, line_no
        else:
            return None

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize."""
        if self.binary != binary:
            return None
        res = self.get_sym_file_line(int(offset, 16))
        if res:
            function_name, file_name, line_no = res
            result = ['%s in %s %s:%d' % (
                addr, function_name, file_name, line_no)]
            if DEBUG:
                # Diagnostic only; the caller prints the actual frame.
                print(result)
            return result
        else:
            return None


class SymbolizationLoop(object):
    """Reads stack traces from stdin and prints them symbolized."""

    def __init__(self, binary_name_filter=None):
        # Used by clients who may want to supply a different binary name.
        # E.g. in Chrome several binaries may share a single .dSYM.
        self.binary_name_filter = binary_name_filter
        self.system = os.uname()[0]
        if self.system in ['Linux', 'Darwin']:
            self.llvm_symbolizer = LLVMSymbolizerFactory(self.system)
        else:
            raise Exception('Unknown system')

    def symbolize_address(self, addr, binary, offset):
        """Symbolize one address and return the list of frame strings."""
        # Use the chain of symbolizers:
        # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
        # (fall back to next symbolizer if the previous one fails).
        if binary not in symbolizers:
            symbolizers[binary] = ChainSymbolizer(
                [BreakpadSymbolizerFactory(binary), self.llvm_symbolizer])
        result = symbolizers[binary].symbolize(addr, binary, offset)
        if result is None:
            # Initialize system symbolizer only if other symbolizers failed.
            symbolizers[binary].append_symbolizer(
                SystemSymbolizerFactory(self.system, addr, binary))
            result = symbolizers[binary].symbolize(addr, binary, offset)
        # The system symbolizer must produce some result.
        assert result
        return result

    def print_symbolized_lines(self, symbolized_lines):
        """Print the frames, or the current input line if there are none."""
        if not symbolized_lines:
            print(self.current_line)
        else:
            for symbolized_frame in symbolized_lines:
                print(' #' + str(self.frame_no) + ' ' +
                      symbolized_frame.rstrip())
                self.frame_no += 1

    def process_stdin(self):
        """Symbolize every stack-trace line of stdin; echo all others."""
        self.frame_no = 0
        for line in sys.stdin:
            self.current_line = line.rstrip()
            match = _STACK_TRACE_LINE_RE.match(line)
            if not match:
                print(self.current_line)
                continue
            if DEBUG:
                print(line)
            _, frameno_str, addr, binary, offset = match.groups()
            if frameno_str == '0':
                # Assume that frame #0 is the first frame of new stack trace.
                self.frame_no = 0
            original_binary = binary
            if self.binary_name_filter:
                binary = self.binary_name_filter(binary)
            symbolized_line = self.symbolize_address(addr, binary, offset)
            if not symbolized_line and original_binary != binary:
                # The filtered name gave nothing: retry with the name that
                # actually appeared in the stack trace.
                symbolized_line = self.symbolize_address(
                    addr, original_binary, offset)
            self.print_symbolized_lines(symbolized_line)


if __name__ == '__main__':
    loop = SymbolizationLoop()
    loop.process_stdin()