1# Copyright 2014 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""This module classifies NativeHeap objects filtering their allocations.
6
7The only filter currently available is 'stacktrace', which works as follows:
8
9{'name': 'rule-1', 'stacktrace': 'foo' }
10{'name': 'rule-2', 'stacktrace': ['foo', r'bar\s+baz']}
11{'name': 'rule-3', 'source_path': 'sk.*allocator'}
12{'name': 'rule-3', 'source_path': 'sk', 'stacktrace': 'SkAllocator'}
13
14
15rule-1 will match any allocation that has 'foo' in one of its  stack frames.
16rule-2 will match any allocation that has a stack frame matching 'foo' AND a
17followed by a stack frame matching 'bar baz'. Note that order matters, so rule-2
18will not match a stacktrace like ['bar baz', 'foo'].
19rule-3 will match any allocation in which at least one of the source paths in
20its stack frames matches the regex sk.*allocator.
21rule-4 will match any allocation which satisfies both the conditions.
22
23TODO(primiano): introduce more filters after the first prototype with UI, for
24instance, filter by library file name or by allocation size.
25"""
26
27import collections
28import posixpath
29import re
30
31from memory_inspector.classification import results
32from memory_inspector.classification import rules
33from memory_inspector.core import exceptions
34from memory_inspector.core import native_heap
35
36
37_RESULT_KEYS = ['bytes_allocated', 'bytes_resident']
38
39
40def LoadRules(content):
41  """Loads and parses a native-heap rule tree from a content (string).
42
43  Returns:
44    An instance of |rules.Rule|, which nodes are |_NHeapRule| instances.
45  """
46  return rules.Load(content, _NHeapRule)
47
48
49def Classify(nativeheap, rule_tree):
50  """Creates aggregated results of native heaps using the provided rules.
51
52  Args:
53    nativeheap: the heap dump being processed (a |NativeHeap| instance).
54    rule_tree: the user-defined rules that define the filtering categories.
55
56  Returns:
57    An instance of |AggreatedResults|.
58  """
59  assert(isinstance(nativeheap, native_heap.NativeHeap))
60  assert(isinstance(rule_tree, rules.Rule))
61
62  res = results.AggreatedResults(rule_tree, _RESULT_KEYS)
63  for allocation in nativeheap.allocations:
64    res.AddToMatchingNodes(allocation,
65                           [allocation.size, allocation.resident_size])
66  return res
67
68
69def InferHeuristicRulesFromHeap(nheap, max_depth=3, threshold=0.02):
70  """Infers the rules tree from a symbolized heap snapshot.
71
72  In lack of a specific set of rules, this method can be invoked to infer a
73  meaningful rule tree starting from a heap snapshot. It will build a compact
74  radix tree from the source paths of the stack traces, which height is at most
75  |max_depth|, selecting only those nodes which contribute for at least
76  |threshold| (1.0 = 100%) w.r.t. the total allocation of the heap snapshot.
77  """
78  assert(isinstance(nheap, native_heap.NativeHeap))
79
80  def RadixTreeInsert(node, path):
81    """Inserts a string (path) into a radix tree (a python recursive dict).
82
83    e.g.: [/a/b/c, /a/b/d, /z/h] -> {'/a/b/': {'c': {}, 'd': {}}, '/z/h': {}}
84    """
85
86    def GetCommonPrefix(args):
87      """Returns the common prefix between two paths (no partial paths).
88
89      e.g.: /tmp/bar, /tmp/baz will return /tmp/ (and not /tmp/ba as the dumb
90      posixpath.commonprefix implementation would do)
91      """
92      parts = posixpath.commonprefix(args).rpartition(posixpath.sep)[0]
93      return parts + posixpath.sep if parts else ''
94
95    for node_path in node.iterkeys():
96      pfx = GetCommonPrefix([node_path, path])
97      if not pfx:
98        continue
99      if len(pfx) < len(node_path):
100        node[pfx] = {node_path[len(pfx):] : node[node_path]}
101        del node[node_path]
102      if len(path) > len(pfx):
103        RadixTreeInsert(node[pfx], path[len(pfx):])
104      return
105    node[path] = {}  # No common prefix, create new child in current node.
106
107  # Given an allocation of size N and its stack trace, heuristically determines
108  # the source directory to be blamed for the N bytes.
109  # The blamed_dir is the one which appears more times in the top 8 stack frames
110  # (excluding the first 2, which usually are just the (m|c)alloc call sites).
111  # At the end, this will generate a *leaderboard* (|blamed_dirs|) which
112  # associates, to each source path directory, the number of bytes allocated.
113
114  blamed_dirs = collections.Counter()  # '/s/path' : bytes_from_this_path (int)
115  total_allocated = 0
116  for alloc in nheap.allocations:
117    dir_histogram = collections.Counter()
118    for frame in alloc.stack_trace.frames[2:10]:
119      # Compute a histogram (for each allocation) of the top source dirs.
120      if not frame.symbol or not frame.symbol.source_info:
121        continue
122      src_file = frame.symbol.source_info[0].source_file_path
123      src_dir = posixpath.dirname(src_file.replace('\\', '/')) + '/'
124      dir_histogram.update([src_dir])
125    if not dir_histogram:
126      continue
127    # Add the blamed dir to the leaderboard.
128    blamed_dir = dir_histogram.most_common()[0][0]
129    blamed_dirs.update({blamed_dir : alloc.size})
130    total_allocated += alloc.size
131
132  # Select only the top paths from the leaderboard which contribute for more
133  # than |threshold| and make a radix tree out of them.
134  radix_tree = {}
135  for blamed_dir, alloc_size in blamed_dirs.most_common():
136    if (1.0 * alloc_size / total_allocated) < threshold:
137      break
138    RadixTreeInsert(radix_tree, blamed_dir)
139
140  # The final step consists in generating a rule tree from the radix tree. This
141  # is a pretty straightforward tree-clone operation, they have the same shape.
142  def GenRulesFromRadixTree(radix_tree_node, max_depth, parent_path=''):
143    children = []
144    if max_depth > 0:
145      for node_path, node_children in radix_tree_node.iteritems():
146        child_rule = {
147            'name': node_path[-16:],
148            'source_path': '^' + re.escape(parent_path + node_path),
149            'children': GenRulesFromRadixTree(
150                node_children, max_depth - 1, parent_path + node_path)}
151        children += [child_rule]
152    return children
153
154  rules_tree = GenRulesFromRadixTree(radix_tree, max_depth)
155  return LoadRules(str(rules_tree))
156
157
158class _NHeapRule(rules.Rule):
159  def __init__(self, name, filters):
160    super(_NHeapRule, self).__init__(name)
161
162    # The 'stacktrace' filter can be either a string (simple case, one regex) or
163    # a list of strings (complex case, see doc in the header of this file).
164    stacktrace_regexs = filters.get('stacktrace', [])
165    if isinstance(stacktrace_regexs, basestring):
166      stacktrace_regexs = [stacktrace_regexs]
167    self._stacktrace_regexs = []
168    for regex in stacktrace_regexs:
169      try:
170        self._stacktrace_regexs.append(re.compile(regex))
171      except re.error, descr:
172        raise exceptions.MemoryInspectorException(
173            'Stacktrace regex error "%s" : %s' % (regex, descr))
174
175    # The 'source_path' regex, instead, simply matches the source file path.
176    self._path_regex = None
177    path_regex = filters.get('source_path')
178    if path_regex:
179      try:
180        self._path_regex = re.compile(path_regex)
181      except re.error, descr:
182        raise exceptions.MemoryInspectorException(
183            'Path regex error "%s" : %s' % (path_regex, descr))
184
185  def Match(self, allocation):
186    # Match the source file path, if the 'source_path' filter is specified.
187    if self._path_regex:
188      path_matches = False
189      for frame in allocation.stack_trace.frames:
190        if frame.symbol and frame.symbol.source_info:
191          if self._path_regex.search(
192              frame.symbol.source_info[0].source_file_path):
193            path_matches = True
194            break
195      if not path_matches:
196        return False
197
198    # Match the stack traces symbols, if the 'stacktrace' filter is specified.
199    if not self._stacktrace_regexs:
200      return True
201    cur_regex_idx = 0
202    cur_regex = self._stacktrace_regexs[0]
203    for frame in allocation.stack_trace.frames:
204      if frame.symbol and cur_regex.search(frame.symbol.name):
205        # The current regex has been matched.
206        if cur_regex_idx == len(self._stacktrace_regexs) - 1:
207          return True  # All the provided regexs have been matched, we're happy.
208        cur_regex_idx += 1
209        cur_regex = self._stacktrace_regexs[cur_regex_idx]
210
211    return False  # Not all the provided regexs have been matched.