1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
|
#!/usr/bin/python2.4
# Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Component for automatically creating masks of changing areas of a website.
Works by repeated invocation of a browser and scraping of the resulting page.
Areas that differ will be added to the auto-generated mask. The mask generator
considers the mask complete when further scrapes fail to produce any differences
in the mask.
"""
import os # Functions for walking the directory tree
import tempfile # Get a temporary directory to hold intermediates
import time # Used for sleep() and naming masks by time
import command_line
import drivers
from PIL import Image
from PIL import ImageChops
import scrapers
def CreateCommand(cmdline):
    """Inserts the command and arguments into a command line for parsing.

    Registers the "maskmaker" command on cmdline, with ValidateMaskmaker as
    its validator and ExecuteMaskmaker as its executor, then declares every
    argument the command accepts and the constraints between them.

    Args:
      cmdline: command-line object the command is added to; only its
        AddCommand method is used here
    """
    cmd = cmdline.AddCommand(
        ["maskmaker"],
        "Automatically generates a mask from a list of URLs",
        ValidateMaskmaker,
        ExecuteMaskmaker)

    # Which browser to drive.
    cmd.AddArgument(
        ["-bp", "--browserpath"], "Full path to browser's executable",
        type="readfile", metaname="PATH")
    cmd.AddArgument(
        ["-b", "--browser"], "Which browser to use", type="string",
        default="chrome")
    cmd.AddArgument(
        ["-bv", "--browserver"], "Version of the browser",
        metaname="VERSION")

    # Where the generated masks are written.
    cmd.AddArgument(
        ["-o", "--outdir"], "Directory to store generated masks",
        metaname="DIR", required=True)

    # The URL source: --url and --list are declared mutually exclusive here
    # and as a required group below.
    cmd.AddArgument(
        ["-u", "--url"], "URL to compare")
    cmd.AddArgument(
        ["-l", "--list"], "List of URLs to compare", type="readfile")
    cmd.AddMutualExclusion(["--url", "--list"])

    # Selecting a sub-range of the URL list.
    cmd.AddArgument(
        ["-s", "--startline"], "First line of URL list", type="int")
    cmd.AddArgument(
        ["-e", "--endline"], "Last line of URL list (exclusive)", type="int")
    cmd.AddArgument(
        ["-c", "--count"], "Number of lines of URL file to use", type="int")
    cmd.AddDependency("--startline", "--list")
    cmd.AddRequiredGroup(["--url", "--list"])
    cmd.AddDependency("--endline", "--list")
    cmd.AddDependency("--count", "--list")
    cmd.AddMutualExclusion(["--count", "--endline"])
    cmd.AddDependency("--count", "--startline")

    # Timing and termination controls for the scrape loop.
    cmd.AddArgument(
        ["-t", "--timeout"],
        "Amount of time (seconds) to wait for browser to "
        "finish loading",
        type="int", default=60)
    cmd.AddArgument(
        ["-w", "--wait"],
        "Amount of time (in seconds) to wait between successive scrapes",
        type="int", default=60)
    cmd.AddArgument(
        ["-sc", "--scrapes"],
        "Number of successive scrapes which must result in no change to a "
        "mask before mask creation is considered complete",
        type="int", default=10)
    cmd.AddArgument(
        ["-sz", "--size"], "Browser window size", default=(800, 600),
        type="coords")
    cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes")
    cmd.AddArgument(
        ["-gu", "--giveup"],
        "Number of times to scrape before giving up", type="int", default=50)
    # The first literal ends with a space so the two halves of the help text
    # do not run together ("will bediscarded") when concatenated.
    cmd.AddArgument(
        ["-th", "--threshhold"],
        "Percentage of different pixels (0-100) above which the scrape will "
        "be discarded and the mask not updated.", type="int", default=100)
    # NOTE(review): unlike every other short flag, this one is spelled with
    # two dashes ("--er"). Kept unchanged so existing invocations still work.
    cmd.AddArgument(
        ["--er", "--errors"],
        "Number of times a scrape can fail before giving up on the URL.",
        type="int", default=1)
def ValidateMaskmaker(command):
    """Checks the maskmaker arguments; raises ParseError when they are bad.

    Only --browserpath is inspected. When it is supplied, its file extension
    must be .exe, .com or .bat (compared case-insensitively).

    Args:
      command: mapping from argument name to parsed value

    Raises:
      command_line.ParseError: the browser path has another extension
    """
    browser_path = command["--browserpath"]
    if not browser_path:
        # No explicit browser path was given, so there is nothing to check.
        return

    extension = os.path.splitext(browser_path)[1].lower()
    if extension in (".exe", ".com", ".bat"):
        return
    raise command_line.ParseError("Browser filename must be an executable")
def _ReadUrlLines(list_path, startline, endline):
    """Returns the stripped, non-blank lines [startline:endline] of a file.

    Args:
      list_path: path of a text file holding one URL per line
      startline: index of the first line to use, or None for the first line
      endline: index one past the last line to use, or None for the last

    Returns:
      A list of strings; lines that are empty after stripping are dropped.
    """
    url_file = open(list_path, "r")
    try:
        lines = url_file.readlines()
    finally:
        # Close explicitly instead of relying on garbage collection.
        url_file.close()
    stripped = [line.strip() for line in lines[startline:endline]]
    return [line for line in stripped if line]


def _IsAllZero(image):
    """Returns True when every pixel of every band of image is zero."""
    extrema = image.getextrema()
    if isinstance(extrema[0], tuple):
        # Multi-band image: getextrema() gives one (min, max) pair per band.
        return max(extrema) == (0, 0)
    # Single-band image: getextrema() gives one (min, max) pair.
    return extrema == (0, 0)


def _CountZeroPixels(image):
    """Returns how many pixels of image have the value 0.

    The colors are looked up by value because getcolors() returns an
    unsorted list, and value 0 may not be present at all.
    """
    for count, color in image.getcolors():
        if color == 0:
            return count
    return 0


def ExecuteMaskmaker(command):
    """Performs automatic mask generation.

    Every URL is scraped once per pass. Each scrape is compared, through the
    URL's current mask, with the first stored scrape of that URL (the
    baseline); pixels that differ are blacked out in the mask. A URL is
    finished after --scrapes consecutive scrapes that match the baseline, and
    abandoned after --errors failed scrapes. Passes stop when no URL is left
    or after --giveup passes. A summary report is printed at the end.

    print is always called with one parenthesized argument so the output is
    the same under Python 2's print statement.

    Args:
      command: mapping from argument name to parsed value, as declared by
        CreateCommand
    """

    class MaskmakerURL(object):
        """Helper class holding information about a URL passed to maskmaker."""
        __slots__ = ['url', 'consecutive_successes', 'errors']

        def __init__(self, url):
            self.url = url
            self.consecutive_successes = 0
            self.errors = 0

    # Get the list of URLs to generate masks for
    if command["--url"]:
        url_list = [MaskmakerURL(command["--url"])]
    else:
        startline = command["--startline"]
        if command["--count"]:
            endline = startline + command["--count"]
        else:
            endline = command["--endline"]
        url_list = [
            MaskmakerURL(url) for url in
            _ReadUrlLines(command["--list"], startline, endline)]

    complete_list = []
    error_list = []

    outdir = command["--outdir"]
    scrapes = command["--scrapes"]
    errors = command["--errors"]
    size = command["--size"]
    giveup = command["--giveup"]
    scrape_pass = 0

    scrapedir = command["--scrapedir"]
    if not scrapedir:
        scrapedir = tempfile.gettempdir()

    # Get the scraper
    scraper = scrapers.GetScraper(
        (command["--browser"], command["--browserver"]))

    # Repeatedly iterate through the list of URLs until either every URL has
    # a successful mask or too many errors, or we've exceeded the giveup limit
    while url_list and scrape_pass < giveup:
        # Scrape each URL
        for url in url_list:
            print("Processing %r..." % url.url)
            mask_filename = drivers.windowing.URLtoFilename(
                url.url, outdir, ".bmp")

            # Load the existing mask. This is in a loop so we can try to
            # recover from error conditions: a wrong-sized mask is retried
            # under a size-qualified name, and a missing mask is created and
            # then re-opened on the next iteration.
            while True:
                try:
                    mask = Image.open(mask_filename)
                    if mask.size != size:
                        print(" %r already exists and is the wrong size! "
                              "(%r vs %r)" % (mask_filename, mask.size, size))
                        mask_filename = "%s_%r%s" % (
                            mask_filename[:-4], size, mask_filename[-4:])
                        print(" Trying again as %r..." % mask_filename)
                        continue
                    break
                except IOError:
                    print(" %r does not exist, creating" % mask_filename)
                    mask = Image.new("1", size, 1)
                    mask.save(mask_filename)

            # Find the stored scrape path
            mask_scrape_dir = os.path.join(
                scrapedir,
                os.path.splitext(os.path.basename(mask_filename))[0])
            drivers.windowing.PreparePath(mask_scrape_dir)

            # Find the baseline image: the first stored scrape in sort order
            mask_scrapes = os.listdir(mask_scrape_dir)
            mask_scrapes.sort()

            if not mask_scrapes:
                print(" No baseline image found, mask will not be updated")
                baseline = None
            else:
                baseline = Image.open(
                    os.path.join(mask_scrape_dir, mask_scrapes[0]))

            mask_scrape_filename = os.path.join(
                mask_scrape_dir, time.strftime("%y%m%d-%H%M%S.bmp"))

            # Do the scrape
            result = scraper.Scrape(
                [url.url], mask_scrape_dir, size, (0, 0),
                command["--timeout"], path=command["--browserpath"],
                filename=mask_scrape_filename)

            if result:
                # Return value other than None means an error
                print(" Scrape failed with error '%r'" % result)
                url.errors += 1
                if url.errors >= errors:
                    print(" ** Exceeded maximum error count for this URL, "
                          "giving up")
                continue

            # Load the new scrape
            scrape = Image.open(mask_scrape_filename)

            # Calculate the difference between the new scrape and the
            # baseline, subject to the current mask
            if baseline is not None:
                diff = ImageChops.multiply(
                    ImageChops.difference(scrape, baseline),
                    mask.convert(scrape.mode))

                # If the difference is none, there's nothing to update
                if _IsAllZero(diff):
                    print(" Scrape identical to baseline, no change in mask")
                    url.consecutive_successes += 1
                    if url.consecutive_successes >= scrapes:
                        print(" ** No change for %r scrapes, done!" % scrapes)
                else:
                    # Convert the difference to black and white, then change
                    # all black pixels (where the scrape and the baseline
                    # were identical) to white, all others (where the scrape
                    # and the baseline differed) to black.
                    #
                    # 1. convert("L") converts the RGB image to grayscale.
                    #    point() operates on individual channels, so this
                    #    step is necessary.
                    # 2. point() maps value 0 to 255 and every other value
                    #    to 0.
                    # 3. The "1" second parameter to point() outputs the
                    #    result as a monochrome bitmap. If the original RGB
                    #    image were converted directly to monochrome, PIL
                    #    would dither it.
                    diff = diff.convert("L").point([255] + [0] * 255, "1")

                    # Count the number of different (now black) pixels
                    diff_pixels = _CountZeroPixels(diff)

                    # Is this too much?
                    diff_pixel_percent = (
                        diff_pixels * 100.0 / (mask.size[0] * mask.size[1]))
                    if diff_pixel_percent > command["--threshhold"]:
                        print(" Scrape differed from baseline by %.2f "
                              "percent, ignoring" % diff_pixel_percent)
                    else:
                        print(" Scrape differed in %d pixels, updating mask"
                              % diff_pixels)
                        mask = ImageChops.multiply(mask, diff)
                        mask.save(mask_filename)

                        # Reset the number of consecutive "good" scrapes
                        url.consecutive_successes = 0

        # Remove URLs whose mask is deemed done
        complete_list.extend(
            [url for url in url_list
             if url.consecutive_successes >= scrapes])
        error_list.extend(
            [url for url in url_list if url.errors >= errors])
        url_list = [
            url for url in url_list
            if url.consecutive_successes < scrapes and url.errors < errors]

        scrape_pass += 1
        print("**Done with scrape pass %d\n" % scrape_pass)

        if scrape_pass >= giveup:
            print("**Exceeded giveup threshhold. Giving up.")
        elif url_list:
            # Only wait when another pass will actually run.
            print("Waiting %d seconds..." % command["--wait"])
            time.sleep(command["--wait"])

    print("")
    print("*** MASKMAKER COMPLETE ***")
    print("Summary report:")
    print(" %d masks successfully generated" % len(complete_list))
    for url in complete_list:
        print("  %s" % url.url)
    print(" %d masks failed with too many errors" % len(error_list))
    for url in error_list:
        print("  %s" % url.url)
    if scrape_pass >= giveup:
        print(" %d masks were not completed before "
              "reaching the giveup threshhold" % len(url_list))
        for url in url_list:
            print("  %s" % url.url)
|