tools/site_compare/commands/maskmaker.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274

#!/usr/bin/python2.4
# Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Component for automatically creating masks of changing areas of a website.

Works by repeated invocation of a browser and scraping of the resulting page.
Areas that differ will be added to the auto-generated mask. The mask generator
considers the mask complete when further scrapes fail to produce any differences
in the mask.
"""

import os            # Functions for walking the directory tree
import tempfile      # Get a temporary directory to hold intermediates
import time          # Used for sleep() and naming masks by time

import command_line
import drivers
from PIL import Image
from PIL import ImageChops
import scrapers


def CreateCommand(cmdline):
  """Inserts the command and arguments into a command line for parsing.

  Registers the "maskmaker" command, with ValidateMaskmaker as its validator
  and ExecuteMaskmaker as its executor, then declares every option the
  command accepts together with the constraints between those options.

  Args:
    cmdline: the command line object to register with; it must provide
      AddCommand(), which returns the command that the arguments are added to.
  """
  cmd = cmdline.AddCommand(
    ["maskmaker"],
    "Automatically generates a mask from a list of URLs",
    ValidateMaskmaker,
    ExecuteMaskmaker)

  # Which browser to drive
  cmd.AddArgument(
    ["-bp", "--browserpath"], "Full path to browser's executable",
    type="readfile", metaname="PATH")
  cmd.AddArgument(
    ["-b", "--browser"], "Which browser to use", type="string",
    default="chrome")
  cmd.AddArgument(
    ["-bv", "--browserver"], "Version of the browser", metaname="VERSION")

  # Where the generated masks go
  cmd.AddArgument(
    ["-o", "--outdir"], "Directory to store generated masks", metaname="DIR",
    required=True)

  # Which URLs to process: exactly one of --url and --list must be given
  cmd.AddArgument(
    ["-u", "--url"], "URL to compare")
  cmd.AddArgument(
    ["-l", "--list"], "List of URLs to compare", type="readfile")
  cmd.AddMutualExclusion(["--url", "--list"])

  # Selection of a sub-range of the URL list; only meaningful with --list
  cmd.AddArgument(
    ["-s", "--startline"], "First line of URL list", type="int")
  cmd.AddArgument(
    ["-e", "--endline"], "Last line of URL list (exclusive)", type="int")
  cmd.AddArgument(
    ["-c", "--count"], "Number of lines of URL file to use", type="int")
  cmd.AddDependency("--startline", "--list")
  cmd.AddRequiredGroup(["--url", "--list"])
  cmd.AddDependency("--endline", "--list")
  cmd.AddDependency("--count", "--list")
  cmd.AddMutualExclusion(["--count", "--endline"])
  cmd.AddDependency("--count", "--startline")

  # Timing and convergence controls
  cmd.AddArgument(
    ["-t", "--timeout"], "Amount of time (seconds) to wait for browser to "
    "finish loading",
    type="int", default=60)
  cmd.AddArgument(
    ["-w", "--wait"],
    "Amount of time (in seconds) to wait between successive scrapes",
    type="int", default=60)
  cmd.AddArgument(
    ["-sc", "--scrapes"],
    "Number of successive scrapes which must result in no change to a mask "
    "before mask creation is considered complete", type="int", default=10)
  cmd.AddArgument(
    ["-sz", "--size"], "Browser window size", default=(800, 600), type="coords")
  cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes")
  cmd.AddArgument(
    ["-gu", "--giveup"],
    "Number of times to scrape before giving up", type="int", default=50)
  # The first literal ends with a space so that the two adjacent literals do
  # not run together as "bediscarded" in the help text.
  cmd.AddArgument(
    ["-th", "--threshhold"],
    "Percentage of different pixels (0-100) above which the scrape will be "
    "discarded and the mask not updated.", type="int", default=100)
  # NOTE(review): every other short option uses a single dash; "--er" looks
  # like a typo for "-er". Left unchanged so existing command lines keep
  # working -- confirm before renaming.
  cmd.AddArgument(
    ["--er", "--errors"],
    "Number of times a scrape can fail before giving up on the URL.",
    type="int", default=1)


def ValidateMaskmaker(command):
  """Check the maskmaker arguments before the command is executed.

  Only --browserpath is inspected. When it is supplied (truthy), its file
  extension, compared case-insensitively, has to be .exe, .com or .bat.

  Args:
    command: the parsed command, indexed by option name.

  Raises:
    command_line.ParseError: the browser path has some other extension.
  """
  browser_path = command["--browserpath"]
  if not browser_path:
    # Nothing was supplied, so there is nothing to check
    return

  _, extension = os.path.splitext(browser_path)
  if extension.lower() in (".exe", ".com", ".bat"):
    return

  raise command_line.ParseError("Browser filename must be an executable")


def ExecuteMaskmaker(command):
  """Performs automatic mask generation.

  Every URL is scraped repeatedly. Each new scrape is compared with the URL's
  baseline scrape (the first file, in sorted order, found in the URL's scrape
  directory); pixels that differ are blacked out in the URL's mask. A URL is
  finished once --scrapes consecutive scrapes match the baseline under the
  current mask, and abandoned once its scrape has failed --errors times.
  Passes over the remaining URLs continue until none are left or --giveup
  passes have been made, after which a summary report is printed.

  Args:
    command: the parsed command, indexed by option name (e.g. "--outdir").
  """

  # Get the list of URLs to generate masks for
  class MaskmakerURL(object):
    """Helper class for holding information about a URL passed to maskmaker."""
    __slots__ = ['url', 'consecutive_successes', 'errors']
    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline+command["--count"]
    else:
      endline = command["--endline"]
    # Close the list file explicitly instead of leaking the handle.
    # try/finally is used rather than "with" because the file's shebang
    # targets python2.4, which has no "with" statement.
    url_file = open(command["--list"], "r")
    try:
      url_lines = url_file.readlines()[startline:endline]
    finally:
      url_file.close()
    url_list = [MaskmakerURL(line.strip()) for line in url_lines]

  complete_list = []
  error_list = []

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  giveup = command["--giveup"]
  scrape_pass = 0

  scrapedir = command["--scrapedir"]
  if not scrapedir:
    scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Repeatedly iterate through the list of URLs until either every URL has
  # a successful mask or too many errors, or we've exceeded the giveup limit
  while url_list and scrape_pass < giveup:
    # Scrape each URL
    for url in url_list:
      print("Processing %r..." % url.url)
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask. This is in a loop so we can try to recover
      # from error conditions: a mask of the wrong size is sidestepped by
      # retrying under a size-specific filename, and a missing mask is
      # created (all white) and then loaded on the next iteration.
      while True:
        try:
          mask = Image.open(mask_filename)
          if mask.size != size:
            print("  %r already exists and is the wrong size! (%r vs %r)" % (
              mask_filename, mask.size, size))
            mask_filename = "%s_%r%s" % (
              mask_filename[:-4], size, mask_filename[-4:])
            print("  Trying again as %r..." % mask_filename)
            continue
          break
        except IOError:
          print("  %r does not exist, creating" % mask_filename)
          mask = Image.new("1", size, 1)
          mask.save(mask_filename)

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
        scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image: the first stored scrape in sorted order
      mask_scrapes = os.listdir(mask_scrape_dir)
      mask_scrapes.sort()

      if not mask_scrapes:
        print("  No baseline image found, mask will not be updated")
        baseline = None
      else:
        baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))

      mask_scrape_filename = os.path.join(mask_scrape_dir,
                                          time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
        [url.url], mask_scrape_dir, size, (0, 0),
        command["--timeout"], path=command["--browserpath"],
        filename=mask_scrape_filename)

      if result:
        # Return value other than None means an error
        print("  Scrape failed with error '%r'" % result)
        url.errors += 1
        if url.errors >= errors:
          print("  ** Exceeded maximum error count for this URL, giving up")
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))

        # Collapse the difference to a single band holding, for every pixel,
        # the largest difference found in any channel. A grayscale conversion
        # is not used for this step because its weighted average can round a
        # small difference in a single channel down to zero, which would keep
        # that pixel out of the mask no matter how often it changes.
        bands = diff.split()
        strongest = bands[0]
        for band in bands[1:]:
          strongest = ImageChops.lighter(strongest, band)

        # If the difference is none, there's nothing to update
        if strongest.getextrema()[1] == 0:
          print("  Scrape identical to baseline, no change in mask")
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print("  ** No change for %r scrapes, done!" % scrapes)
        else:
          # Turn the difference into a monochrome image that is white where
          # the scrape and the baseline were identical and black where they
          # differed: the lookup table given to point() maps value 0 to 255
          # and every other value to 0, and its "1" mode argument outputs a
          # monochrome bitmap directly.
          diff = strongest.convert("L").point([255]+[0]*255, "1")

          # count the number of different (black) pixels; the colour is
          # matched explicitly rather than relying on the order in which
          # getcolors() happens to list its entries
          diff_pixels = sum([count for count, colour in diff.getcolors()
                             if colour == 0])

          # is this too much?
          diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])
          if diff_pixel_percent > command["--threshhold"]:
            print("  Scrape differed from baseline by %.2f percent, ignoring"
                  % diff_pixel_percent)
          else:
            print("  Scrape differed in %d pixels, updating mask" % diff_pixels)
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Remove URLs whose mask is deemed done
    complete_list.extend(
      [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
      [url for url in url_list if url.errors >= errors])
    url_list = [
      url for url in url_list if
      url.consecutive_successes < scrapes and
      url.errors < errors]

    scrape_pass += 1
    print("**Done with scrape pass %d\n" % scrape_pass)

    # Only give up, or wait for another pass, while URLs are still pending;
    # once the list is empty the loop ends and sleeping would just delay the
    # report.
    if url_list:
      if scrape_pass >= giveup:
        print("**Exceeded giveup threshhold. Giving up.")
      else:
        print("Waiting %d seconds..." % command["--wait"])
        time.sleep(command["--wait"])

  print("")
  print("*** MASKMAKER COMPLETE ***")
  print("Summary report:")
  print("  %d masks successfully generated" % len(complete_list))
  for url in complete_list:
    print("     %s" % url.url)
  print("  %d masks failed with too many errors" % len(error_list))
  for url in error_list:
    print("     %s" % url.url)
  # URLs can only still be pending here if the giveup limit stopped the loop
  if url_list:
    print("  %d masks were not completed before "
          "reaching the giveup threshhold" % len(url_list))
    for url in url_list:
      print("     %s" % url.url)