1#!/usr/bin/env python
2# Copyright (c) 2011 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Process a History database and dump a .dot file suitable for GraphViz.
7
8This is useful for debugging history redirect flows.
9
10An example run of this program:
11  python /src/history-viz.py History > foo.dot
12  /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png
13"""
14
15import struct
16import subprocess
17import sys
18import urlparse
19
20
21# Some transition types, copied from page_transition_types.h.
22TRANS_TYPES = {
23  0: 'link',
24  1: 'typed',
25  2: 'most-visited',
26  3: 'auto subframe',
27  7: 'form',
28}
29
30
31class URL(object):
32  """Represents a broken-down URL from our most visited database."""
33
34  def __init__(self, id, url):
35    """Initialize a new URL object.  |id| is the database id of the URL."""
36    self.id = id
37    self.url = url
38    scheme, loc, path, query, fragment = urlparse.urlsplit(url)
39    if scheme == 'http':
40      scheme = ''  # Shorten for display purposes.
41    if len(scheme) > 0:
42      scheme += '://'
43    self.host = scheme + loc
44    self.path = path
45
46    extra = ''
47    if len(query) > 0:
48      extra += '?' + query
49    if len(fragment) > 0 or url.find('#') > 0:
50      extra += '#' + fragment
51    self.extra = extra
52
53  def PrettyPrint(self, include_host=True, include_path=True):
54    """Pretty-print this URL in a form more suitable for the graph.
55
56    This will elide very long paths and potentially puts newlines between parts
57    of long components.  include_host and include_path determine whether to
58    include the host and path in the output.
59
60    Returns: the pretty-printed string."""
61    MAX_LEN = 30  # Maximum length of a line in the output.
62    parts = []
63    if include_host:
64      parts.append(self.host)
65    if include_path:
66      parts.append(self.path)
67    parts.append(self.extra)
68    lines = []
69    line = ''
70    for part in parts:
71      if len(part) > MAX_LEN:
72        part = part[0:MAX_LEN-3] + '...'
73      if len(line)+len(part) > MAX_LEN:
74        lines.append(line)
75        line = ''
76      line += part
77    if len(line) > 0:
78      lines.append(line)
79    return '\n'.join(lines)
80
81
82class Edge(object):
83  """Represents an edge in the history graph, connecting two pages.
84
85  If a link is traversed twice, it is one Edge with two entries in
86  the .transitions array."""
87  def __init__(self, src, dst):
88    self.src = src
89    self.dst = dst
90    self.transitions = []
91
92  def Transitions(self):
93    """Return a dictionary mapping transition type -> occurences."""
94    all = {}
95    for trans in self.transitions:
96      all[trans] = all.get(trans, 0) + 1
97      # We currently don't use the chain type.
98      # TODO(evanm): make this a command-line option.
99      # if trans & 0x30000000 != 0:
100      #   chain = ''
101      #   if trans & 0x10000000:
102      #     chain = 'start'
103      #   if trans & 0x20000000:
104      #     if len(chain) == 0:
105      #       chain = 'end'
106      #     else:
107      #       chain = ''
108      #   if len(chain) > 0:
109      #     edge['chain'] = chain
110    return all
111
112
113def ClusterBy(objs, pred):
114  """Group a list of objects by a predicate.
115
116  Given a list of objects and a predicate over the objects, return a
117  dictionary mapping pred(obj) -> all objs with the same pred(obj)."""
118  clusters = {}
119  for obj in objs:
120    cluster = pred(obj)
121    clusters[cluster] = clusters.get(cluster, [])
122    clusters[cluster].append(obj)
123  return clusters
124
125
126def EscapeDot(string):
127  """Escape a string suitable for embedding in a graphviz graph."""
128  # TODO(evanm): this is likely not sufficient.
129  return string.replace('\n', '\\n')
130
131
132class SQLite(object):
133  """Trivial interface to executing SQLite queries.
134  Spawns a new process with each call."""
135  def __init__(self, file=None):
136    self.file = file
137
138  def Run(self, sql):
139    """Execute |sql|, yielding each row of results as an array."""
140    subproc = subprocess.Popen(['sqlite', self.file],
141                               stdin=subprocess.PIPE,
142                               stdout=subprocess.PIPE)
143    subproc.stdin.write('.mode tabs\n')
144    subproc.stdin.write(sql + ';')
145    subproc.stdin.close()
146    for line in subproc.stdout:
147      row = line.strip().split('\t')
148      yield row
149
150
151def LoadHistory(filename):
152  db = SQLite(filename)
153
154  urls = {}  # Map of urlid => url.
155  urls['0'] = URL('0', 'start')  # Node name '0' is our special 'start' node.
156  for id, url in db.Run('SELECT id, url FROM urls'):
157    urls[id] = URL(id, url)
158
159  visiturlids = {}  # Map of visitid => urlid.
160  visiturlids['0'] = '0'  # '0' is our special 'start' node.
161  edges = {}  # Map of urlid->urlid->Edge.
162  for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition '
163                                     'FROM visits ORDER BY id'):
164    visiturlids[dst] = url
165    src = visiturlids[src]
166    dst = visiturlids[dst]
167    edges[src] = edges.get(src, {})
168    edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst))
169    # SQLite outputs transition values as signed integers, but they're really
170    # a bitfield.  Below does "unsigned trans = static_cast<unsigned>(trans)".
171    trans = struct.unpack('I', struct.pack('i', int(trans)))[0]
172    edge.transitions.append(trans)
173
174  return urls, edges
175
176
177def main():
178  urls, edges = LoadHistory(sys.argv[1])
179  print 'digraph G {'
180  print '  graph [rankdir=LR]'  # Display left to right.
181  print '  node [shape=box]'    # Display nodes as boxes.
182  print '  subgraph { rank=source; 0 [label="start"] }'
183
184  # Output all the nodes within graph clusters.
185  hosts = ClusterBy(urls.values(), lambda url: url.host)
186  for i, (host, urls) in enumerate(hosts.items()):
187    # Cluster all URLs under this host if it has more than one entry.
188    host_clustered = len(urls) > 1
189    if host_clustered:
190      print 'subgraph clusterhost%d {' % i
191      print '  label="%s"' % host
192    paths = ClusterBy(urls, lambda url: url.path)
193    for j, (path, urls) in enumerate(paths.items()):
194      # Cluster all URLs under this host if it has more than one entry.
195      path_clustered = host_clustered and len(urls) > 1
196      if path_clustered:
197        print '  subgraph cluster%d%d {' % (i, j)
198        print '    label="%s"' % path
199      for url in urls:
200        if url.id == '0': continue  # We already output the special start node.
201        pretty = url.PrettyPrint(include_host=not host_clustered,
202                                include_path=not path_clustered)
203        print '    %s [label="%s"]' % (url.id, EscapeDot(pretty))
204      if path_clustered:
205        print '  }'
206    if host_clustered:
207      print '}'
208
209  # Output all the edges between nodes.
210  for src, dsts in edges.items():
211    for dst, edge in dsts.items():
212      # Gather up all the transitions into the label.
213      label = []      # Label for the edge.
214      transitions = edge.Transitions()
215      for trans, count in transitions.items():
216        text = ''
217        if count > 1:
218          text = '%dx ' % count
219        base_type = trans & 0xFF
220        redir = (trans & 0xC0000000) != 0
221        start = (trans & 0x10000000) != 0
222        end = (trans & 0x20000000) != 0
223        if start or end:
224          if start:
225            text += '<'
226          if end:
227            text += '>'
228          text += ' '
229        if redir:
230          text += 'R '
231        text += TRANS_TYPES.get(base_type, 'trans%d' % base_type)
232        label.append(text)
233      if len(label) == 0:
234        continue
235
236      edgeattrs = []  # Graphviz attributes for the edge.
237      # If the edge is from the start and the transitions are fishy, make it
238      # display as a dotted line.
239      if src == '0' and len(transitions.keys()) == 1 and transitions.has_key(0):
240        edgeattrs.append('style=dashed')
241      if len(label) > 0:
242        edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label)))
243
244      out = '%s -> %s' % (src, dst)
245      if len(edgeattrs) > 0:
246        out += ' [%s]' % ','.join(edgeattrs)
247      print out
248  print '}'
249  return 0
250
251
252if __name__ == '__main__':
253  sys.exit(main())
254