1#!/usr/bin/python
2# Copyright 2013 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
"""Symbolize log file produced by cygprofile instrumentation.
7
8Given a log file and the binary being profiled (e.g. executable, shared
9library), the script can produce three different outputs: 1) symbols for the
10addresses, 2) function and line numbers for the addresses, or 3) an order file.
11"""
12
13import optparse
14import os
15import string
16import subprocess
17import sys
18
19
def ParseLogLines(log_file_lines):
  """Parse a log file produced by the profiled run of clank.

  Args:
    log_file_lines: array of lines in log file produced by profiled run

    Below is an example of a small log file:
    5086e000-52e92000 r-xp 00000000 b3:02 51276      libchromeview.so
    secs       usecs      pid:threadid    func
    START
    1314897086 795828     3587:1074648168 0x509e105c
    1314897086 795874     3587:1074648168 0x509e0eb4
    1314897086 796326     3587:1074648168 0x509e0e3c
    1314897086 796552     3587:1074648168 0x509e07bc
    END

  Returns:
    call_info list with list of tuples of the format (sec, usec, call id,
    function address called).  The address is stored as an offset from the
    start of the mapping given on the first line; records whose logged
    address is not strictly above that start are dropped.

  Raises:
    AssertionError: if the first line does not contain an 'r-xp' mapping.
  """
  mapping_line = log_file_lines[0]
  assert 'r-xp' in mapping_line, 'first log line must be the r-xp mapping'
  # The mapping line starts with "<start>-<end>", both in hex.
  vm_start = int(mapping_line[:mapping_line.find('-')], 16)

  call_info = []
  # Skip the mapping line and the column header line.
  for line in log_file_lines[2:]:
    fields = line.split()
    # Call records have exactly four columns; markers such as START and END
    # do not and are ignored.
    if len(fields) != 4:
      continue
    sec_timestamp = int(fields[0])
    usec_timestamp = int(fields[1])
    callee_id = fields[2]
    addr = int(fields[3], 16)
    if vm_start < addr:
      call_info.append(
          (sec_timestamp, usec_timestamp, callee_id, addr - vm_start))

  return call_info
65
66
def _ParseNmOutput(nm_lines):
  """Build the address list and address-to-names map from 'nm -S -n' lines."""
  text_markers = (' t ', ' W ', ' T ')
  entries = [nm_line.split() for nm_line in nm_lines
             if any(marker in nm_line for marker in text_markers)]

  unique_addrs = []
  address_map = {}
  index = 0
  while index < len(entries):
    fields = entries[index]

    # If the length of the split line is not 4, then it does not contain all
    # the information needed to symbolize (i.e. address, size and symbol
    # name).
    if len(fields) != 4:
      index += 1
      continue

    addr = int(fields[0], 16)
    size = int(fields[1], 16)

    # Multiple symbols may be at the same address.  This is due to aliasing
    # done by the compiler.  Since there is no way to be sure which one was
    # called in profiled run, we will symbolize to include all symbol names at
    # a particular address.  The size recorded is that of the first symbol.
    fnames = []
    while index < len(entries) and int(entries[index][0], 16) == addr:
      if len(entries[index]) == 4:
        fnames.append(entries[index][3])
      index += 1
    address_map[addr] = fnames
    unique_addrs.append((addr, size))

  return (unique_addrs, address_map)


def ParseLibSymbols(lib_file):
  """Get output from running nm and greping for text symbols.

  Args:
    lib_file: the library or executable that contains the profiled code

  Returns:
    list of sorted unique addresses and corresponding size of function symbols
    in lib_file and map of addresses to all symbols at a particular address
  """
  cmd = ['nm', '-S', '-n', lib_file]
  # universal_newlines=True makes the pipe yield text rather than bytes, so
  # the string handling below works on both Python 2 and Python 3.
  nm_p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                          universal_newlines=True)
  output = nm_p.communicate()[0]
  return _ParseNmOutput(output.split('\n'))
113
class SymbolNotFoundException(Exception):
  """Raised when an address cannot be matched to any known symbol.

  Attributes:
    value: the value given at construction (the unmatched address where this
      file raises it).
  """

  def __init__(self, value):
    # Forward the value to Exception so that e.args and repr(e) carry it.
    super(SymbolNotFoundException, self).__init__(value)
    self.value = value

  def __str__(self):
    return repr(self.value)
119
def BinarySearchAddresses(addr, start, end, arr):
  """Find starting address of a symbol at a particular address.

  The reason we can not directly use the address provided by the log file is
  that the log file may give an address after the start of the symbol.  The
  logged address is often one byte after the start.  Using this search
  function, rather than just subtracting one from the logged address, allows
  the logging instrumentation to log any address in a function.

  Args:
    addr: the address being searched for
    start: the starting index for the binary search
    end: the ending index (inclusive) for the binary search
    arr: the list being searched containing tuple of address and size,
      sorted by address

  Returns:
    the starting address of the symbol at address addr

  Raises:
    SymbolNotFoundException: if no symbol in arr[start..end] contains addr,
      including when the range is empty.
  """
  # An empty range (e.g. no symbols were parsed) cannot contain the address.
  if start > end:
    raise SymbolNotFoundException(addr)

  low = start
  high = end
  # Narrow the range until at most two candidates remain.
  while high - low > 1:
    # Floor division keeps the index an int on both Python 2 and Python 3.
    middle = (low + high) // 2
    (sym_addr, sym_size) = arr[middle]
    if addr < sym_addr:
      high = middle - 1
    elif addr < sym_addr + sym_size:
      return sym_addr
    else:
      # Condition (addr >= sym_addr + sym_size) must be true.
      low = middle + 1

  # arr[i] is a tuple of address and size.  Check if addr is inside either of
  # the remaining candidates.
  for index in (low, high):
    (sym_addr, sym_size) = arr[index]
    if sym_addr <= addr < sym_addr + sym_size:
      return sym_addr
  raise SymbolNotFoundException(addr)
162
163
def FindFunctions(addr, unique_addrs, address_map):
  """Return the list of symbol names recorded for the symbol containing addr.

  Args:
    addr: the address to look up
    unique_addrs: list of (address, size) tuples to search
    address_map: map from symbol start address to list of symbol names
  """
  last_index = len(unique_addrs) - 1
  symbol_start = BinarySearchAddresses(addr, 0, last_index, unique_addrs)
  return address_map[symbol_start]
168
169
def AddrToLine(addr, lib_file):
  """Use addr2line to determine line info of a particular address.

  Args:
    addr: the address to look up, passed to addr2line in hex
    lib_file: the library or executable handed to addr2line via -e

  Returns:
    the lines printed by 'addr2line -f' joined with ':'.  If the output ends
    with a newline, the returned string ends with a trailing ':'.
  """
  cmd = ['addr2line', '-f', '-e', lib_file, hex(addr)]
  # universal_newlines=True makes the pipe yield text rather than bytes, so
  # the split below works on both Python 2 and Python 3.
  p = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)
  output = p.communicate()[0]
  return ':'.join(output.split('\n'))
181
182
def main():
  """Write output for profiled run to standard out.

  The format of the output depends on the output type selected with the
  -t/--outputType option ('lineize', 'symbolize' or 'orderfile').  The default
  output type is to symbolize the addresses of the functions called; any
  unrecognized value is handled the same way.
  """
  parser = optparse.OptionParser('usage: %prog [options] log_file lib_file')
  parser.add_option('-t', '--outputType', dest='output_type',
                    default='symbolize', type='string',
                    help='lineize or symbolize or orderfile')

  # Option for output type.  The log file and lib file arguments are required
  # by the script and therefore are not options.
  (options, args) = parser.parse_args()
  if len(args) != 2:
    parser.error('expected 2 args: log_file lib_file')

  (log_file, lib_file) = args
  output_type = options.output_type

  # Read the whole log up front and make sure the file handle is closed.
  with open(log_file) as log:
    log_file_lines = [line.rstrip() for line in log]
  call_info = ParseLogLines(log_file_lines)
  (unique_addrs, address_map) = ParseLibSymbols(lib_file)

  # Check for duplicate addresses in the log file, and print a warning if
  # duplicates are found. The instrumentation that produces the log file
  # should only print the first time a function is entered.
  seen_addrs = set()
  for call in call_info:
    addr = call[3]
    if addr in seen_addrs:
      print('WARNING: Address ' + hex(addr) + ' (line= ' +
            AddrToLine(addr, lib_file) + ') already profiled.')
    else:
      seen_addrs.add(addr)

  for call in call_info:
    (sec, usec, callee_id, addr) = call
    prefix = str(sec) + ' ' + str(usec) + '\t' + str(callee_id) + '\t'

    if output_type == 'lineize':
      print(prefix + AddrToLine(addr, lib_file))
      continue

    try:
      symbols = FindFunctions(addr, unique_addrs, address_map)
    except SymbolNotFoundException:
      # Report the address of the call being processed, not a value left
      # over from the duplicate check above.
      sys.stderr.write('WARNING: Did not find function in binary. addr: '
                       + hex(addr) + '\n')
      continue

    if output_type == 'orderfile':
      for symbol in symbols:
        print('.text.' + symbol)
      print('')
    else:
      # The first alias shares a line with the timestamps; any remaining
      # aliases are listed on their own tab-indented lines.
      print(prefix + symbols[0])
      for symbol in symbols[1:]:
        print('\t\t\t\t\t' + symbol)
249
# Run the symbolizer only when executed as a script, not when imported.
if __name__ == '__main__':
  main()
252