1#!/usr/bin/env python 2# Copyright (c) 2011 The Chromium Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6"""Process a History database and dump a .dot file suitable for GraphViz. 7 8This is useful for debugging history redirect flows. 9 10An example run of this program: 11 python /src/history-viz.py History > foo.dot 12 /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png 13""" 14 15import struct 16import subprocess 17import sys 18import urlparse 19 20 21# Some transition types, copied from page_transition_types.h. 22TRANS_TYPES = { 23 0: 'link', 24 1: 'typed', 25 2: 'most-visited', 26 3: 'auto subframe', 27 7: 'form', 28} 29 30 31class URL(object): 32 """Represents a broken-down URL from our most visited database.""" 33 34 def __init__(self, id, url): 35 """Initialize a new URL object. |id| is the database id of the URL.""" 36 self.id = id 37 self.url = url 38 scheme, loc, path, query, fragment = urlparse.urlsplit(url) 39 if scheme == 'http': 40 scheme = '' # Shorten for display purposes. 41 if len(scheme) > 0: 42 scheme += '://' 43 self.host = scheme + loc 44 self.path = path 45 46 extra = '' 47 if len(query) > 0: 48 extra += '?' + query 49 if len(fragment) > 0 or url.find('#') > 0: 50 extra += '#' + fragment 51 self.extra = extra 52 53 def PrettyPrint(self, include_host=True, include_path=True): 54 """Pretty-print this URL in a form more suitable for the graph. 55 56 This will elide very long paths and potentially puts newlines between parts 57 of long components. include_host and include_path determine whether to 58 include the host and path in the output. 59 60 Returns: the pretty-printed string.""" 61 MAX_LEN = 30 # Maximum length of a line in the output. 62 parts = [] 63 if include_host: 64 parts.append(self.host) 65 if include_path: 66 parts.append(self.path) 67 parts.append(self.extra) 68 lines = [] 69 line = '' 70 for part in parts: 71 if len(part) > MAX_LEN: 72 part = part[0:MAX_LEN-3] + '...' 73 if len(line)+len(part) > MAX_LEN: 74 lines.append(line) 75 line = '' 76 line += part 77 if len(line) > 0: 78 lines.append(line) 79 return '\n'.join(lines) 80 81 82class Edge(object): 83 """Represents an edge in the history graph, connecting two pages. 84 85 If a link is traversed twice, it is one Edge with two entries in 86 the .transitions array.""" 87 def __init__(self, src, dst): 88 self.src = src 89 self.dst = dst 90 self.transitions = [] 91 92 def Transitions(self): 93 """Return a dictionary mapping transition type -> occurences.""" 94 all = {} 95 for trans in self.transitions: 96 all[trans] = all.get(trans, 0) + 1 97 # We currently don't use the chain type. 98 # TODO(evanm): make this a command-line option. 99 # if trans & 0x30000000 != 0: 100 # chain = '' 101 # if trans & 0x10000000: 102 # chain = 'start' 103 # if trans & 0x20000000: 104 # if len(chain) == 0: 105 # chain = 'end' 106 # else: 107 # chain = '' 108 # if len(chain) > 0: 109 # edge['chain'] = chain 110 return all 111 112 113def ClusterBy(objs, pred): 114 """Group a list of objects by a predicate. 115 116 Given a list of objects and a predicate over the objects, return a 117 dictionary mapping pred(obj) -> all objs with the same pred(obj).""" 118 clusters = {} 119 for obj in objs: 120 cluster = pred(obj) 121 clusters[cluster] = clusters.get(cluster, []) 122 clusters[cluster].append(obj) 123 return clusters 124 125 126def EscapeDot(string): 127 """Escape a string suitable for embedding in a graphviz graph.""" 128 # TODO(evanm): this is likely not sufficient. 129 return string.replace('\n', '\\n') 130 131 132class SQLite(object): 133 """Trivial interface to executing SQLite queries. 134 Spawns a new process with each call.""" 135 def __init__(self, file=None): 136 self.file = file 137 138 def Run(self, sql): 139 """Execute |sql|, yielding each row of results as an array.""" 140 subproc = subprocess.Popen(['sqlite', self.file], 141 stdin=subprocess.PIPE, 142 stdout=subprocess.PIPE) 143 subproc.stdin.write('.mode tabs\n') 144 subproc.stdin.write(sql + ';') 145 subproc.stdin.close() 146 for line in subproc.stdout: 147 row = line.strip().split('\t') 148 yield row 149 150 151def LoadHistory(filename): 152 db = SQLite(filename) 153 154 urls = {} # Map of urlid => url. 155 urls['0'] = URL('0', 'start') # Node name '0' is our special 'start' node. 156 for id, url in db.Run('SELECT id, url FROM urls'): 157 urls[id] = URL(id, url) 158 159 visiturlids = {} # Map of visitid => urlid. 160 visiturlids['0'] = '0' # '0' is our special 'start' node. 161 edges = {} # Map of urlid->urlid->Edge. 162 for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition ' 163 'FROM visits ORDER BY id'): 164 visiturlids[dst] = url 165 src = visiturlids[src] 166 dst = visiturlids[dst] 167 edges[src] = edges.get(src, {}) 168 edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst)) 169 # SQLite outputs transition values as signed integers, but they're really 170 # a bitfield. Below does "unsigned trans = static_cast<unsigned>(trans)". 171 trans = struct.unpack('I', struct.pack('i', int(trans)))[0] 172 edge.transitions.append(trans) 173 174 return urls, edges 175 176 177def main(): 178 urls, edges = LoadHistory(sys.argv[1]) 179 print 'digraph G {' 180 print ' graph [rankdir=LR]' # Display left to right. 181 print ' node [shape=box]' # Display nodes as boxes. 182 print ' subgraph { rank=source; 0 [label="start"] }' 183 184 # Output all the nodes within graph clusters. 185 hosts = ClusterBy(urls.values(), lambda url: url.host) 186 for i, (host, urls) in enumerate(hosts.items()): 187 # Cluster all URLs under this host if it has more than one entry. 188 host_clustered = len(urls) > 1 189 if host_clustered: 190 print 'subgraph clusterhost%d {' % i 191 print ' label="%s"' % host 192 paths = ClusterBy(urls, lambda url: url.path) 193 for j, (path, urls) in enumerate(paths.items()): 194 # Cluster all URLs under this host if it has more than one entry. 195 path_clustered = host_clustered and len(urls) > 1 196 if path_clustered: 197 print ' subgraph cluster%d%d {' % (i, j) 198 print ' label="%s"' % path 199 for url in urls: 200 if url.id == '0': continue # We already output the special start node. 201 pretty = url.PrettyPrint(include_host=not host_clustered, 202 include_path=not path_clustered) 203 print ' %s [label="%s"]' % (url.id, EscapeDot(pretty)) 204 if path_clustered: 205 print ' }' 206 if host_clustered: 207 print '}' 208 209 # Output all the edges between nodes. 210 for src, dsts in edges.items(): 211 for dst, edge in dsts.items(): 212 # Gather up all the transitions into the label. 213 label = [] # Label for the edge. 214 transitions = edge.Transitions() 215 for trans, count in transitions.items(): 216 text = '' 217 if count > 1: 218 text = '%dx ' % count 219 base_type = trans & 0xFF 220 redir = (trans & 0xC0000000) != 0 221 start = (trans & 0x10000000) != 0 222 end = (trans & 0x20000000) != 0 223 if start or end: 224 if start: 225 text += '<' 226 if end: 227 text += '>' 228 text += ' ' 229 if redir: 230 text += 'R ' 231 text += TRANS_TYPES.get(base_type, 'trans%d' % base_type) 232 label.append(text) 233 if len(label) == 0: 234 continue 235 236 edgeattrs = [] # Graphviz attributes for the edge. 237 # If the edge is from the start and the transitions are fishy, make it 238 # display as a dotted line. 239 if src == '0' and len(transitions.keys()) == 1 and transitions.has_key(0): 240 edgeattrs.append('style=dashed') 241 if len(label) > 0: 242 edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label))) 243 244 out = '%s -> %s' % (src, dst) 245 if len(edgeattrs) > 0: 246 out += ' [%s]' % ','.join(edgeattrs) 247 print out 248 print '}' 249 return 0 250 251 252if __name__ == '__main__': 253 sys.exit(main()) 254