1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
|
#!/usr/bin/python
#
# Copyright 2008 Google Inc. All Rights Reserved.
"""Process a History database and dump a .dot file suitable for GraphViz.
This is useful for debugging history redirect flows.
An example run of this program:
python /src/history-viz.py History > foo.dot
/c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png
"""
import struct
import subprocess
import sys
import urlparse
class URL:
"""Represents a broken-down URL from our most visited database."""
def __init__(self, id, url):
"""Initialize a new URL object. |id| is the database id of the URL."""
self.id = id
self.url = url
scheme, loc, path, query, fragment = urlparse.urlsplit(url)
if scheme == 'http':
scheme = '' # Shorten for display purposes.
if len(scheme) > 0:
scheme += '://'
self.host = scheme + loc
self.path = path
extra = ''
if len(query) > 0:
extra += '?' + query
if len(fragment) > 0 or url.find('#') > 0:
extra += '#' + fragment
self.extra = extra
def PrettyPrint(self, include_host=True, include_path=True):
"""Pretty-print this URL in a form more suitable for the graph.
This will elide very long paths and potentially puts newlines between parts
of long components. include_host and include_path determine whether to
include the host and path in the output.
Returns: the pretty-printed string."""
MAX_LEN = 30 # Maximum length of a line in the output.
parts = []
if include_host:
parts.append(self.host)
if include_path:
parts.append(self.path)
parts.append(self.extra)
lines = []
line = ''
for part in parts:
if len(part) > MAX_LEN:
part = part[0:MAX_LEN-3] + '...'
if len(line)+len(part) > MAX_LEN:
lines.append(line)
line = ''
line += part
if len(line) > 0:
lines.append(line)
return '\n'.join(lines)
class Edge:
"""Represents an edge in the history graph, connecting two pages.
If a link is traversed twice, it is one Edge with two entries in
the .transitions array."""
def __init__(self, src, dst):
self.src = src
self.dst = dst
self.transitions = []
def Transitions(self):
"""Return a dictionary mapping transition type -> occurences."""
all = {}
for trans in self.transitions:
all[trans] = all.get(trans, 0) + 1
# We currently don't use the chain type.
# TODO(evanm): make this a command-line option.
# if trans & 0x30000000 != 0:
# chain = ''
# if trans & 0x10000000:
# chain = 'start'
# if trans & 0x20000000:
# if len(chain) == 0:
# chain = 'end'
# else:
# chain = ''
# if len(chain) > 0:
# edge['chain'] = chain
return all
def ClusterBy(objs, pred):
"""Group a list of objects by a predicate.
Given a list of objects and a predicate over the objects, return a
dictionary mapping pred(obj) -> all objs with the same pred(obj)."""
clusters = {}
for obj in objs:
cluster = pred(obj)
clusters[cluster] = clusters.get(cluster, [])
clusters[cluster].append(obj)
return clusters
def EscapeDot(str):
"""Escape a string suitable for embedding in a graphviz graph."""
# TODO(evanm): this is likely not sufficient.
return str.replace('\n', '\\n')
class SQLite:
"""Trivial interface to executing SQLite queries.
Spawns a new process with each call."""
def __init__(self, file=None):
self.file = file
def Run(self, sql):
"""Execute |sql|, yielding each row of results as an array."""
subproc = subprocess.Popen(['sqlite', self.file],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
subproc.stdin.write('.mode tabs\n')
subproc.stdin.write(sql + ';')
subproc.stdin.close()
for line in subproc.stdout:
row = line.strip().split('\t')
yield row
def LoadHistory(filename):
db = SQLite(filename)
urls = {} # Map of urlid => url.
urls['0'] = URL('0', 'start') # Node name '0' is our special 'start' node.
for id, url in db.Run('SELECT id, url FROM urls'):
urls[id] = URL(id, url)
visiturlids = {} # Map of visitid => urlid.
visiturlids['0'] = '0' # '0' is our special 'start' node.
edges = {} # Map of urlid->urlid->Edge.
for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition '
'FROM visits ORDER BY id'):
visiturlids[dst] = url
src = visiturlids[src]
dst = visiturlids[dst]
edges[src] = edges.get(src, {})
edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst))
# SQLite outputs transition values as signed integers, but they're really
# a bitfield. Below does "unsigned trans = static_cast<unsigned>(trans)".
trans = struct.unpack('I', struct.pack('i', int(trans)))[0]
edge.transitions.append(trans)
return urls, edges
# Some transition types, copied from page_transition_types.h.
TRANS_TYPES = {
0: 'link',
1: 'typed',
2: 'most-visited',
3: 'auto subframe',
7: 'form',
}
urls, edges = LoadHistory(sys.argv[1])
print 'digraph G {'
print ' graph [rankdir=LR]' # Display left to right.
print ' node [shape=box]' # Display nodes as boxes.
print ' subgraph { rank=source; 0 [label="start"] }'
# Output all the nodes within graph clusters.
hosts = ClusterBy(urls.values(), lambda url: url.host)
for i, (host, urls) in enumerate(hosts.items()):
# Cluster all URLs under this host if it has more than one entry.
host_clustered = len(urls) > 1
if host_clustered:
print 'subgraph clusterhost%d {' % i
print ' label="%s"' % host
paths = ClusterBy(urls, lambda url: url.path)
for j, (path, urls) in enumerate(paths.items()):
# Cluster all URLs under this host if it has more than one entry.
path_clustered = host_clustered and len(urls) > 1
if path_clustered:
print ' subgraph cluster%d%d {' % (i, j)
print ' label="%s"' % path
for url in urls:
if url.id == '0': continue # We already output the special start node.
pretty = url.PrettyPrint(include_host=not host_clustered,
include_path=not path_clustered)
print ' %s [label="%s"]' % (url.id, EscapeDot(pretty))
if path_clustered:
print ' }'
if host_clustered:
print '}'
# Output all the edges between nodes.
for src, dsts in edges.items():
for dst, edge in dsts.items():
# Gather up all the transitions into the label.
label = [] # Label for the edge.
transitions = edge.Transitions()
for trans, count in transitions.items():
text = ''
if count > 1:
text = '%dx ' % count
base_type = trans & 0xFF
redir = (trans & 0xC0000000) != 0
start = (trans & 0x10000000) != 0
end = (trans & 0x20000000) != 0
if start or end:
if start:
text += '<'
if end:
text += '>'
text += ' '
if redir:
text += 'R '
text += TRANS_TYPES.get(base_type, 'trans%d' % base_type)
label.append(text)
if len(label) == 0:
continue
edgeattrs = [] # Graphviz attributes for the edge.
# If the edge is from the start and the transitions are fishy, make it
# display as a dotted line.
if src == '0' and len(transitions.keys()) == 1 and transitions.has_key(0):
edgeattrs.append('style=dashed')
if len(label) > 0:
edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label)))
out = '%s -> %s' % (src, dst)
if len(edgeattrs) > 0:
out += ' [%s]' % ','.join(edgeattrs)
print out
print '}'
|