1#!/usr/bin/env python
2#
3# Copyright (C) 2016 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
18"""annotate.py: annotate source files based on perf.data.
19"""
20
21
22import argparse
23import os
24import os.path
25import shutil
26import subprocess
27import sys
28
29from simpleperf_report_lib import *
30from utils import *
31
class SourceLine(object):
    """A source location: a file, a function name and a line number.

    Besides the three fields, the class exposes the keys used to group
    locations per file, per function and per line.
    """

    def __init__(self, file, function, line):
        self.file, self.function, self.line = file, function, line

    @property
    def line_key(self):
        """Key of the line: the (file, line) pair."""
        return self.file, self.line

    @property
    def function_key(self):
        """Key of the function: the (file, function) pair."""
        return self.file, self.function

    @property
    def file_key(self):
        """Key of the file: the file field itself."""
        return self.file
49
50
# TODO: addr2line can't reliably convert function_start_address to
# source_file:line for Java code, because in the .debug_line section there
# is some distance between function_start_address and the address of the
# first instruction that can be mapped to a source line.
55class Addr2Line(object):
56    """collect information of how to map [dso_name,vaddr] to [source_file:line].
57    """
58    def __init__(self, addr2line_path, symfs_dir=None):
59        self.dso_dict = dict()
60        if addr2line_path and is_executable_available(addr2line_path):
61            self.addr2line_path = addr2line_path
62        else:
63            self.addr2line_path = find_tool_path('addr2line')
64            if not self.addr2line_path:
65                log_exit("Can't find addr2line.")
66        self.symfs_dir = symfs_dir
67
68
69    def add_addr(self, dso_name, addr):
70        dso = self.dso_dict.get(dso_name)
71        if dso is None:
72            self.dso_dict[dso_name] = dso = dict()
73        if addr not in dso:
74            dso[addr] = None
75
76
77    def convert_addrs_to_lines(self):
78        # store a list of source files
79        self.file_list = []
80        # map from file to id with file_list[id] == file
81        self.file_dict = {}
82        self.file_list.append('')
83        self.file_dict[''] = 0
84
85        for dso_name in self.dso_dict.keys():
86            self._convert_addrs_to_lines(dso_name, self.dso_dict[dso_name])
87        self._combine_source_files()
88
89
90    def _convert_addrs_to_lines(self, dso_name, dso):
91        dso_path = self._find_dso_path(dso_name)
92        if dso_path is None:
93            log_warning("can't find dso '%s'" % dso_name)
94            dso.clear()
95            return
96        addrs = sorted(dso.keys())
97        addr_str = []
98        for addr in addrs:
99            addr_str.append('0x%x' % addr)
100        addr_str = '\n'.join(addr_str)
101        subproc = subprocess.Popen([self.addr2line_path, '-e', dso_path, '-aifC'],
102                                   stdin=subprocess.PIPE, stdout=subprocess.PIPE)
103        (stdoutdata, _) = subproc.communicate(str_to_bytes(addr_str))
104        stdoutdata = bytes_to_str(stdoutdata)
105        stdoutdata = stdoutdata.strip().split('\n')
106        if len(stdoutdata) < len(addrs):
107            log_fatal("addr2line didn't output enough lines")
108        addr_pos = 0
109        out_pos = 0
110        while addr_pos < len(addrs) and out_pos < len(stdoutdata):
111            addr_line = stdoutdata[out_pos]
112            out_pos += 1
113            assert addr_line[:2] == "0x"
114            assert out_pos < len(stdoutdata)
115            source_lines = []
116            while out_pos < len(stdoutdata) and stdoutdata[out_pos][:2] != "0x":
117                function = stdoutdata[out_pos]
118                out_pos += 1
119                assert out_pos < len(stdoutdata)
120                # Handle lines like "C:\Users\...\file:32".
121                items = stdoutdata[out_pos].rsplit(':', 1)
122                if len(items) != 2:
123                    continue
124                (file, line) = items
125                line = line.split()[0]  # Remove comments after line number
126                out_pos += 1
127                if file.find('?') != -1:
128                    file = 0
129                else:
130                    file = self._get_file_id(file)
131                if line.find('?') != -1:
132                    line = 0
133                else:
134                    line = int(line)
135                source_lines.append(SourceLine(file, function, line))
136            dso[addrs[addr_pos]] = source_lines
137            addr_pos += 1
138        assert addr_pos == len(addrs)
139
140
141    def _get_file_id(self, file):
142        id = self.file_dict.get(file)
143        if id is None:
144            id = len(self.file_list)
145            self.file_list.append(file)
146            self.file_dict[file] = id
147        return id
148
149    def _combine_source_files(self):
150        """It is possible that addr2line gives us different names for the same
151           file, like:
152            /usr/local/.../src/main/jni/sudo-game-jni.cpp
153            sudo-game-jni.cpp
154           We'd better combine these two files. We can do it by combining
155           source files with no conflicts in path.
156        """
157        # Collect files having the same filename.
158        filename_dict = dict()
159        for file in self.file_list:
160            index = max(file.rfind('/'), file.rfind(os.sep))
161            filename = file[index+1:]
162            entry = filename_dict.get(filename)
163            if entry is None:
164                filename_dict[filename] = entry = []
165            entry.append(file)
166
167        # Combine files having the same filename and having no conflicts in path.
168        for filename in filename_dict.keys():
169            files = filename_dict[filename]
170            if len(files) == 1:
171                continue
172            for file in files:
173                to_file = file
174                # Test if we can merge files[i] with another file having longer
175                # path.
176                for f in files:
177                    if len(f) > len(to_file) and f.find(file) != -1:
178                        to_file = f
179                if to_file != file:
180                    from_id = self.file_dict[file]
181                    to_id = self.file_dict[to_file]
182                    self.file_list[from_id] = self.file_list[to_id]
183
184
185    def get_sources(self, dso_name, addr):
186        dso = self.dso_dict.get(dso_name)
187        if dso is None:
188            return []
189        item = dso.get(addr, [])
190        source_lines = []
191        for source in item:
192            source_lines.append(SourceLine(self.file_list[source.file],
193                                           source.function, source.line))
194        return source_lines
195
196
197    def _find_dso_path(self, dso):
198        if dso[0] != '/' or dso == '//anon':
199            return None
200        if self.symfs_dir:
201            dso_path = os.path.join(self.symfs_dir, dso[1:])
202            if os.path.isfile(dso_path):
203                return dso_path
204        if os.path.isfile(dso):
205            return dso
206        return None
207
208
class Period(object):
    """Event counts of a line, a function, a source file or a binary.

    Two counts are kept:
      period: events that occurred while running the item itself.
      acc_period: events that occurred while running the item and the
          functions called by it.
    """

    def __init__(self, period=0, acc_period=0):
        self.period, self.acc_period = period, acc_period

    def __iadd__(self, other):
        """Add both counts of other to this object and return this object."""
        self.acc_period += other.acc_period
        self.period += other.period
        return self
227
228
class DsoPeriod(object):
    """Event counts accumulated for one shared library."""

    def __init__(self, dso_name):
        # Running total; starts with both counts at zero.
        self.period = Period()
        self.dso_name = dso_name

    def add_period(self, period):
        """Add the counts of period to the total of this library."""
        self.period += period
238
239
class FilePeriod(object):
    """Event counts accumulated for one source file.

    Keeps a total for the file, a total per line and a total per function.
    """

    def __init__(self, file):
        self.file = file
        self.period = Period()
        # line -> Period
        self.line_dict = {}
        # function name -> [function start line, Period]
        self.function_dict = {}

    def add_period(self, period):
        """Add the counts of period to the total of the file."""
        self.period += period

    def add_line_period(self, line, period):
        """Add the counts of period to the total kept for line."""
        if line not in self.line_dict:
            self.line_dict[line] = Period()
        self.line_dict[line] += period

    def add_function_period(self, function_name, function_start_line, period):
        """Add the counts of period to the total kept for function_name.

        The start line is stored when the function is first seen, with -1
        standing in for None; later calls don't change it.
        """
        if function_name not in self.function_dict:
            start_line = -1 if function_start_line is None else function_start_line
            self.function_dict[function_name] = [start_line, Period()]
        self.function_dict[function_name][1] += period
269
270
271class SourceFileAnnotator(object):
272    """group code for annotating source files"""
273    def __init__(self, config):
274        # check config variables
275        config_names = ['perf_data_list', 'source_dirs', 'comm_filters',
276                        'pid_filters', 'tid_filters', 'dso_filters', 'addr2line_path']
277        for name in config_names:
278            if name not in config:
279                log_exit('config [%s] is missing' % name)
280        symfs_dir = 'binary_cache'
281        if not os.path.isdir(symfs_dir):
282            symfs_dir = None
283        kallsyms = 'binary_cache/kallsyms'
284        if not os.path.isfile(kallsyms):
285            kallsyms = None
286        source_dirs = config['source_dirs']
287        for dir in source_dirs:
288            if not os.path.isdir(dir):
289                log_exit('[source_dirs] "%s" is not a dir' % dir)
290        if not config['source_dirs']:
291            log_exit('Please set source directories.')
292
293        # init member variables
294        self.config = config
295        self.symfs_dir = symfs_dir
296        self.kallsyms = kallsyms
297        self.comm_filter = set(config['comm_filters']) if config.get('comm_filters') else None
298        if config.get('pid_filters'):
299            self.pid_filter = {int(x) for x in config['pid_filters']}
300        else:
301            self.pid_filter = None
302        if config.get('tid_filters'):
303            self.tid_filter = {int(x) for x in config['tid_filters']}
304        else:
305            self.tid_filter = None
306        self.dso_filter = set(config['dso_filters']) if config.get('dso_filters') else None
307
308        config['annotate_dest_dir'] = 'annotated_files'
309        output_dir = config['annotate_dest_dir']
310        if os.path.isdir(output_dir):
311            shutil.rmtree(output_dir)
312        os.makedirs(output_dir)
313
314        self.addr2line = Addr2Line(self.config['addr2line_path'], symfs_dir)
315
316
317    def annotate(self):
318        self._collect_addrs()
319        self._convert_addrs_to_lines()
320        self._generate_periods()
321        self._write_summary()
322        self._collect_source_files()
323        self._annotate_files()
324
325
326    def _collect_addrs(self):
327        """Read perf.data, collect all addresses we need to convert to
328           source file:line.
329        """
330        for perf_data in self.config['perf_data_list']:
331            lib = ReportLib()
332            lib.SetRecordFile(perf_data)
333            if self.symfs_dir:
334                lib.SetSymfs(self.symfs_dir)
335            if self.kallsyms:
336                lib.SetKallsymsFile(self.kallsyms)
337            while True:
338                sample = lib.GetNextSample()
339                if sample is None:
340                    lib.Close()
341                    break
342                if not self._filter_sample(sample):
343                    continue
344                symbols = []
345                symbols.append(lib.GetSymbolOfCurrentSample())
346                callchain = lib.GetCallChainOfCurrentSample()
347                for i in range(callchain.nr):
348                    symbols.append(callchain.entries[i].symbol)
349                for symbol in symbols:
350                    if self._filter_symbol(symbol):
351                        self.addr2line.add_addr(symbol.dso_name, symbol.vaddr_in_file)
352                        self.addr2line.add_addr(symbol.dso_name, symbol.symbol_addr)
353
354
355    def _filter_sample(self, sample):
356        """Return true if the sample can be used."""
357        if self.comm_filter:
358            if sample.thread_comm not in self.comm_filter:
359                return False
360        if self.pid_filter:
361            if sample.pid not in self.pid_filter:
362                return False
363        if self.tid_filter:
364            if sample.tid not in self.tid_filter:
365                return False
366        return True
367
368
369    def _filter_symbol(self, symbol):
370        if not self.dso_filter or symbol.dso_name in self.dso_filter:
371            return True
372        return False
373
374
    def _convert_addrs_to_lines(self):
        """Have self.addr2line convert the addresses registered with it."""
        self.addr2line.convert_addrs_to_lines()
377
378
379    def _generate_periods(self):
380        """read perf.data, collect Period for all types:
381            binaries, source files, functions, lines.
382        """
383        self.period = 0
384        self.dso_periods = dict()
385        self.file_periods = dict()
386        for perf_data in self.config['perf_data_list']:
387            lib = ReportLib()
388            lib.SetRecordFile(perf_data)
389            if self.symfs_dir:
390                lib.SetSymfs(self.symfs_dir)
391            if self.kallsyms:
392                lib.SetKallsymsFile(self.kallsyms)
393            while True:
394                sample = lib.GetNextSample()
395                if sample is None:
396                    lib.Close()
397                    break
398                if not self._filter_sample(sample):
399                    continue
400                symbols = []
401                symbols.append(lib.GetSymbolOfCurrentSample())
402                callchain = lib.GetCallChainOfCurrentSample()
403                for i in range(callchain.nr):
404                    symbols.append(callchain.entries[i].symbol)
405                # Each sample has a callchain, but its period is only used once
406                # to add period for each function/source_line/source_file/binary.
407                # For example, if more than one entry in the callchain hits a
408                # function, the event count of that function is only increased once.
409                # Otherwise, we may get periods > 100%.
410                is_sample_used = False
411                used_dso_dict = dict()
412                used_file_dict = dict()
413                used_function_dict = dict()
414                used_line_dict = dict()
415                period = Period(sample.period, sample.period)
416                for i in range(len(symbols)):
417                    symbol = symbols[i]
418                    if i == 1:
419                        period = Period(0, sample.period)
420                    if not self._filter_symbol(symbol):
421                        continue
422                    is_sample_used = True
423                    # Add period to dso.
424                    self._add_dso_period(symbol.dso_name, period, used_dso_dict)
425                    # Add period to source file.
426                    sources = self.addr2line.get_sources(symbol.dso_name, symbol.vaddr_in_file)
427                    for source in sources:
428                        if source.file:
429                            self._add_file_period(source, period, used_file_dict)
430                            # Add period to line.
431                            if source.line:
432                                self._add_line_period(source, period, used_line_dict)
433                    # Add period to function.
434                    sources = self.addr2line.get_sources(symbol.dso_name, symbol.symbol_addr)
435                    for source in sources:
436                        if source.file:
437                            self._add_file_period(source, period, used_file_dict)
438                            if source.function:
439                                self._add_function_period(source, period, used_function_dict)
440
441                if is_sample_used:
442                    self.period += sample.period
443
444
445    def _add_dso_period(self, dso_name, period, used_dso_dict):
446        if dso_name not in used_dso_dict:
447            used_dso_dict[dso_name] = True
448            dso_period = self.dso_periods.get(dso_name)
449            if dso_period is None:
450                dso_period = self.dso_periods[dso_name] = DsoPeriod(dso_name)
451            dso_period.add_period(period)
452
453
454    def _add_file_period(self, source, period, used_file_dict):
455        if source.file_key not in used_file_dict:
456            used_file_dict[source.file_key] = True
457            file_period = self.file_periods.get(source.file)
458            if file_period is None:
459                file_period = self.file_periods[source.file] = FilePeriod(source.file)
460            file_period.add_period(period)
461
462
463    def _add_line_period(self, source, period, used_line_dict):
464        if source.line_key not in used_line_dict:
465            used_line_dict[source.line_key] = True
466            file_period = self.file_periods[source.file]
467            file_period.add_line_period(source.line, period)
468
469
470    def _add_function_period(self, source, period, used_function_dict):
471        if source.function_key not in used_function_dict:
472            used_function_dict[source.function_key] = True
473            file_period = self.file_periods[source.file]
474            file_period.add_function_period(source.function, source.line, period)
475
476
477    def _write_summary(self):
478        summary = os.path.join(self.config['annotate_dest_dir'], 'summary')
479        with open(summary, 'w') as f:
480            f.write('total period: %d\n\n' % self.period)
481            dso_periods = sorted(self.dso_periods.values(),
482                                 key=lambda x: x.period.acc_period, reverse=True)
483            for dso_period in dso_periods:
484                f.write('dso %s: %s\n' % (dso_period.dso_name,
485                                          self._get_percentage_str(dso_period.period)))
486            f.write('\n')
487
488            file_periods = sorted(self.file_periods.values(),
489                                  key=lambda x: x.period.acc_period, reverse=True)
490            for file_period in file_periods:
491                f.write('file %s: %s\n' % (file_period.file,
492                                           self._get_percentage_str(file_period.period)))
493            for file_period in file_periods:
494                f.write('\n\n%s: %s\n' % (file_period.file,
495                                          self._get_percentage_str(file_period.period)))
496                values = []
497                for func_name in file_period.function_dict.keys():
498                    func_start_line, period = file_period.function_dict[func_name]
499                    values.append((func_name, func_start_line, period))
500                values = sorted(values, key=lambda x: x[2].acc_period, reverse=True)
501                for value in values:
502                    f.write('\tfunction (%s): line %d, %s\n' % (
503                        value[0], value[1], self._get_percentage_str(value[2])))
504                f.write('\n')
505                for line in sorted(file_period.line_dict.keys()):
506                    f.write('\tline %d: %s\n' % (
507                        line, self._get_percentage_str(file_period.line_dict[line])))
508
509
510    def _get_percentage_str(self, period, short=False):
511        s = 'acc_p: %f%%, p: %f%%' if short else 'accumulated_period: %f%%, period: %f%%'
512        return s % self._get_percentage(period)
513
514
515    def _get_percentage(self, period):
516        if self.period == 0:
517            return (0, 0)
518        acc_p = 100.0 * period.acc_period / self.period
519        p = 100.0 * period.period / self.period
520        return (acc_p, p)
521
522
523    def _collect_source_files(self):
524        self.source_file_dict = dict()
525        source_file_suffix = ['h', 'c', 'cpp', 'cc', 'java', 'kt']
526        for source_dir in self.config['source_dirs']:
527            for root, _, files in os.walk(source_dir):
528                for file in files:
529                    if file[file.rfind('.')+1:] in source_file_suffix:
530                        entry = self.source_file_dict.get(file)
531                        if entry is None:
532                            entry = self.source_file_dict[file] = []
533                        entry.append(os.path.join(root, file))
534
535
536    def _find_source_file(self, file):
537        filename = file[file.rfind(os.sep)+1:]
538        source_files = self.source_file_dict.get(filename)
539        if source_files is None:
540            return None
541        match_count = 0
542        result = None
543        for path in source_files:
544            if path.find(file) != -1:
545                match_count += 1
546                result = path
547        if match_count > 1:
548            log_warning('multiple source for %s, select %s' % (file, result))
549        return result
550
551
552    def _annotate_files(self):
553        """Annotate Source files: add acc_period/period for each source file.
554           1. Annotate java source files, which have $JAVA_SRC_ROOT prefix.
555           2. Annotate c++ source files.
556        """
557        dest_dir = self.config['annotate_dest_dir']
558        for key in self.file_periods.keys():
559            is_java = False
560            if key.startswith('$JAVA_SRC_ROOT/'):
561                path = key[len('$JAVA_SRC_ROOT/'):]
562                items = path.split('/')
563                path = os.sep.join(items)
564                from_path = self._find_source_file(path)
565                to_path = os.path.join(dest_dir, 'java', path)
566                is_java = True
567            elif key.startswith('/') and os.path.isfile(key):
568                path = key
569                from_path = path
570                to_path = os.path.join(dest_dir, path[1:])
571            elif is_windows() and key.find(':\\') != -1 and os.path.isfile(key):
572                from_path = key
573                to_path = os.path.join(dest_dir, key.replace(':\\', '\\'))
574            else:
575                path = key[1:] if key.startswith('/') else key
576                # Change path on device to path on host
577                path = os.sep.join(path.split('/'))
578                from_path = self._find_source_file(path)
579                to_path = os.path.join(dest_dir, path)
580            if from_path is None:
581                log_warning("can't find source file for path %s" % key)
582                continue
583            self._annotate_file(from_path, to_path, self.file_periods[key], is_java)
584
585
586    def _annotate_file(self, from_path, to_path, file_period, is_java):
587        """Annotate a source file.
588
589        Annotate a source file in three steps:
590          1. In the first line, show periods of this file.
591          2. For each function, show periods of this function.
592          3. For each line not hitting the same line as functions, show
593             line periods.
594        """
595        log_info('annotate file %s' % from_path)
596        with open(from_path, 'r') as rf:
597            lines = rf.readlines()
598
599        annotates = dict()
600        for line in file_period.line_dict.keys():
601            annotates[line] = self._get_percentage_str(file_period.line_dict[line], True)
602        for func_name in file_period.function_dict.keys():
603            func_start_line, period = file_period.function_dict[func_name]
604            if func_start_line == -1:
605                continue
606            line = func_start_line - 1 if is_java else func_start_line
607            annotates[line] = '[func] ' + self._get_percentage_str(period, True)
608        annotates[1] = '[file] ' + self._get_percentage_str(file_period.period, True)
609
610        max_annotate_cols = 0
611        for key in annotates.keys():
612            max_annotate_cols = max(max_annotate_cols, len(annotates[key]))
613
614        empty_annotate = ' ' * (max_annotate_cols + 6)
615
616        dirname = os.path.dirname(to_path)
617        if not os.path.isdir(dirname):
618            os.makedirs(dirname)
619        with open(to_path, 'w') as wf:
620            for line in range(1, len(lines) + 1):
621                annotate = annotates.get(line)
622                if annotate is None:
623                    if not lines[line-1].strip():
624                        annotate = ''
625                    else:
626                        annotate = empty_annotate
627                else:
628                    annotate = '/* ' + annotate + (
629                        ' ' * (max_annotate_cols - len(annotate))) + ' */'
630                wf.write(annotate)
631                wf.write(lines[line-1])
632
def main():
    """Parse the command line, build the config and run the annotator."""
    parser = argparse.ArgumentParser(description=(
        'Annotate source files based on profiling data. It reads line information from\n'
        'binary_cache generated by app_profiler.py or binary_cache_builder.py, and\n'
        'generate annotated source files in annotated_files directory.'))
    # Options that take one or more values and may be given several times.
    list_options = (
        (('-i', '--perf_data_list'), 'The paths of profiling data. Default is perf.data.'),
        (('-s', '--source_dirs'), 'Directories to find source files.'),
        (('--comm',), 'Use samples only in threads with selected names.'),
        (('--pid',), 'Use samples only in processes with selected process ids.'),
        (('--tid',), 'Use samples only in threads with selected thread ids.'),
        (('--dso',), 'Use samples only in selected binaries.'),
    )
    for flags, help_text in list_options:
        parser.add_argument(*flags, nargs='+', action='append', help=help_text)
    parser.add_argument('--addr2line', help='Set the path of addr2line.')

    args = parser.parse_args()
    perf_data_list = flatten_arg_list(args.perf_data_list)
    if not perf_data_list:
        perf_data_list.append('perf.data')
    config = {
        'perf_data_list': perf_data_list,
        'source_dirs': flatten_arg_list(args.source_dirs),
        'comm_filters': flatten_arg_list(args.comm),
        'pid_filters': flatten_arg_list(args.pid),
        'tid_filters': flatten_arg_list(args.tid),
        'dso_filters': flatten_arg_list(args.dso),
        'addr2line_path': args.addr2line,
    }

    SourceFileAnnotator(config).annotate()
    log_info('annotate finish successfully, please check result in annotated_files/.')
668
# Run the annotator only when this file is executed as a script.
if __name__ == '__main__':
    main()
671