summaryrefslogtreecommitdiffstats
path: root/tools/site_compare/commands/maskmaker.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools/site_compare/commands/maskmaker.py')
-rw-r--r--tools/site_compare/commands/maskmaker.py298
1 files changed, 298 insertions, 0 deletions
diff --git a/tools/site_compare/commands/maskmaker.py b/tools/site_compare/commands/maskmaker.py
new file mode 100644
index 0000000..95bdeb45
--- /dev/null
+++ b/tools/site_compare/commands/maskmaker.py
@@ -0,0 +1,298 @@
+#!/usr/bin/python2.4
+# Copyright 2008, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Component for automatically creating masks of changing areas of a website.
+
+Works by repeated invocation of a browser and scraping of the resulting page.
+Areas that differ will be added to the auto-generated mask. The mask generator
+considers the mask complete when further scrapes fail to produce any differences
+in the mask.
+"""
+
+import os # Functions for walking the directory tree
+import tempfile # Get a temporary directory to hold intermediates
+import time # Used for sleep() and naming masks by time
+
+import command_line
+import drivers
+from PIL import Image
+from PIL import ImageChops
+import scrapers
+
+
def CreateCommand(cmdline):
  """Inserts the command and arguments into a command line for parsing.

  Registers the "maskmaker" command on cmdline, passing ValidateMaskmaker and
  ExecuteMaskmaker as its callbacks, then declares every option the command
  accepts together with the constraints between those options.

  Args:
    cmdline: object exposing AddCommand(); the command object it returns is
      used to declare arguments, dependencies and exclusions.
  """
  cmd = cmdline.AddCommand(
    ["maskmaker"],
    "Automatically generates a mask from a list of URLs",
    ValidateMaskmaker,
    ExecuteMaskmaker)

  # Browser selection.
  cmd.AddArgument(
    ["-bp", "--browserpath"], "Full path to browser's executable",
    type="readfile", metaname="PATH")
  cmd.AddArgument(
    ["-b", "--browser"], "Which browser to use", type="string",
    default="chrome")
  cmd.AddArgument(
    ["-bv", "--browserver"], "Version of the browser", metaname="VERSION")

  # Where the generated masks are written.
  cmd.AddArgument(
    ["-o", "--outdir"], "Directory to store generated masks", metaname="DIR",
    required=True)

  # URL source: either a single URL or a (slice of a) list file.
  cmd.AddArgument(
    ["-u", "--url"], "URL to compare")
  cmd.AddArgument(
    ["-l", "--list"], "List of URLs to compare", type="readfile")
  cmd.AddMutualExclusion(["--url", "--list"])
  cmd.AddArgument(
    ["-s", "--startline"], "First line of URL list", type="int")
  cmd.AddArgument(
    ["-e", "--endline"], "Last line of URL list (exclusive)", type="int")
  cmd.AddArgument(
    ["-c", "--count"], "Number of lines of URL file to use", type="int")
  cmd.AddDependency("--startline", "--list")
  cmd.AddRequiredGroup(["--url", "--list"])
  cmd.AddDependency("--endline", "--list")
  cmd.AddDependency("--count", "--list")
  cmd.AddMutualExclusion(["--count", "--endline"])
  cmd.AddDependency("--count", "--startline")

  # Scrape timing and termination criteria.
  cmd.AddArgument(
    ["-t", "--timeout"], "Amount of time (seconds) to wait for browser to "
    "finish loading",
    type="int", default=60)
  cmd.AddArgument(
    ["-w", "--wait"],
    "Amount of time (in seconds) to wait between successive scrapes",
    type="int", default=60)
  cmd.AddArgument(
    ["-sc", "--scrapes"],
    "Number of successive scrapes which must result in no change to a mask "
    "before mask creation is considered complete", type="int", default=10)
  cmd.AddArgument(
    ["-sz", "--size"], "Browser window size", default=(800, 600), type="coords")
  cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes")
  cmd.AddArgument(
    ["-gu", "--giveup"],
    "Number of times to scrape before giving up", type="int", default=50)
  # The two literals below are concatenated, so the first must end with a
  # space; without it the help text read "will bediscarded".
  cmd.AddArgument(
    ["-th", "--threshhold"],
    "Percentage of different pixels (0-100) above which the scrape will be "
    "discarded and the mask not updated.", type="int", default=100)
  # NOTE(review): "--er" uses a double dash, unlike every other short flag
  # here. Left unchanged because it is part of the command-line interface;
  # confirm whether "-er" was intended.
  cmd.AddArgument(
    ["--er", "--errors"],
    "Number of times a scrape can fail before giving up on the URL.",
    type="int", default=1)
+
+
def ValidateMaskmaker(command):
  """Checks the maskmaker arguments, raising ParseError when they are invalid.

  Only --browserpath is inspected: when one is supplied, its file extension
  (compared case-insensitively) must be .exe, .com or .bat.

  Args:
    command: parsed command, indexed by option name.

  Raises:
    command_line.ParseError: the browser path does not name an executable.
  """
  browser_path = command["--browserpath"]
  if not browser_path:
    # No explicit browser path was given, so there is nothing to validate.
    return

  extension = os.path.splitext(browser_path)[1].lower()
  if extension in (".exe", ".com", ".bat"):
    return

  raise command_line.ParseError("Browser filename must be an executable")
+
+
def _ReadUrlList(list_path, startline, endline):
  """Returns the non-blank, stripped URLs on lines [startline:endline]."""
  url_file = open(list_path, "r")
  try:
    # The slice is taken before blank lines are dropped, so startline and
    # endline keep referring to physical lines of the file.
    lines = url_file.readlines()[startline:endline]
  finally:
    # try/finally rather than "with": this file targets python2.4.
    url_file.close()
  stripped = [line.strip() for line in lines]
  return [url for url in stripped if url]


def ExecuteMaskmaker(command):
  """Performs automatic mask generation.

  Every URL is scraped once per pass. Each new scrape is compared against the
  URL's baseline (the first entry, in sorted order, of its scrape directory)
  with the current mask applied. Pixels that differ are blacked out in the
  mask, unless the share of differing pixels exceeds --threshhold, in which
  case the scrape is ignored. A URL is complete after --scrapes consecutive
  scrapes that match the baseline, and is abandoned after --errors failed
  scrapes. Passes repeat, --wait seconds apart, until no URLs remain or
  --giveup passes have run. A summary report is printed at the end.

  Args:
    command: parsed command, indexed by option name (e.g. command["--url"]).
  """

  # Get the list of URLs to generate masks for
  class MaskmakerURL(object):
    """Helper class for holding information about a URL passed to maskmaker."""
    __slots__ = ['url', 'consecutive_successes', 'errors']
    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline+command["--count"]
    else:
      endline = command["--endline"]
    url_list = [MaskmakerURL(url) for url in
                _ReadUrlList(command["--list"], startline, endline)]

  complete_list = []
  error_list = []

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  giveup = command["--giveup"]
  scrape_pass = 0

  scrapedir = command["--scrapedir"]
  if not scrapedir:
    scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Repeatedly iterate through the list of URLs until either every URL has
  # a successful mask or too many errors, or we've exceeded the giveup limit
  while url_list and scrape_pass < giveup:
    # Scrape each URL
    for url in url_list:
      print("Processing %r..." % url.url)
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask. This is in a loop so we can try to recover
      # from error conditions
      while True:
        try:
          mask = Image.open(mask_filename)
          if mask.size != size:
            print(" %r already exists and is the wrong size! (%r vs %r)" % (
              mask_filename, mask.size, size))
            # Retry under a name that embeds the requested size.
            mask_filename = "%s_%r%s" % (
              mask_filename[:-4], size, mask_filename[-4:])
            print(" Trying again as %r..." % mask_filename)
            continue
          break
        except IOError:
          # Create an all-white mask on disk; the next iteration of this
          # loop opens it.
          print(" %r does not exist, creating" % mask_filename)
          mask = Image.new("1", size, 1)
          mask.save(mask_filename)

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
        scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image
      mask_scrapes = os.listdir(mask_scrape_dir)
      mask_scrapes.sort()

      if not mask_scrapes:
        print(" No baseline image found, mask will not be updated")
        baseline = None
      else:
        baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))

      mask_scrape_filename = os.path.join(mask_scrape_dir,
                                          time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
        [url.url], mask_scrape_dir, size, (0, 0),
        command["--timeout"], path=command["--browserpath"],
        filename=mask_scrape_filename)

      if result:
        # Return value other than None means an error. The one-element tuple
        # keeps the formatting correct even if result is itself a tuple.
        print(" Scrape failed with error '%r'" % (result,))
        url.errors += 1
        if url.errors >= errors:
          print(" ** Exceeded maximum error count for this URL, giving up")
        # A failed scrape leaves nothing to compare, so skip to the next URL.
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))

        # If the difference is none, there's nothing to update
        if max(diff.getextrema()) == (0, 0):
          print(" Scrape identical to baseline, no change in mask")
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print(" ** No change for %r scrapes, done!" % scrapes)
        else:
          # convert the difference to black and white, then change all
          # black pixels (where the scrape and the baseline were identical)
          # to white, all others (where the scrape and the baseline differed)
          # to black.
          #
          # Since the below command is a little unclear, here's how it works.
          #    1. convert("L") converts the RGB image to grayscale
          #    2. point() maps grayscale values (or the individual channels
          #       of an RGB image) to different ones. Because it operates on
          #       individual channels, the grayscale conversion from step 1
          #       is necessary.
          #    3. The "1" second parameter to point() outputs the result as
          #       a monochrome bitmap. If the original RGB image were converted
          #       directly to monochrome, PIL would dither it.
          diff = diff.convert("L").point([255]+[0]*255, "1")

          # count the number of different pixels
          # NOTE(review): this assumes the first getcolors() entry is the
          # black (differing) pixel count -- confirm against PIL's ordering.
          diff_pixels = diff.getcolors()[0][0]

          # is this too much?
          diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])
          if diff_pixel_percent > command["--threshhold"]:
            print(" Scrape differed from baseline by %.2f percent, ignoring"
                  % diff_pixel_percent)
          else:
            print(" Scrape differed in %d pixels, updating mask" % diff_pixels)
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Remove URLs whose mask is deemed done
    complete_list.extend(
      [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
      [url for url in url_list if url.errors >= errors])
    url_list = [
      url for url in url_list if
      url.consecutive_successes < scrapes and
      url.errors < errors]

    scrape_pass += 1
    print("**Done with scrape pass %d\n" % scrape_pass)

    if not url_list:
      # Nothing left to scrape: the loop ends here, so neither sleep nor
      # report that we gave up.
      pass
    elif scrape_pass >= giveup:
      print("**Exceeded giveup threshhold. Giving up.")
    else:
      print("Waiting %d seconds..." % command["--wait"])
      time.sleep(command["--wait"])

  print("")
  print("*** MASKMAKER COMPLETE ***")
  print("Summary report:")
  print(" %d masks successfully generated" % len(complete_list))
  for url in complete_list:
    print("  %s" % url.url)
  print(" %d masks failed with too many errors" % len(error_list))
  for url in error_list:
    print("  %s" % url.url)
  # URLs can only remain here if the loop stopped at the giveup limit.
  if url_list:
    print(" %d masks were not completed before "
          "reaching the giveup threshhold" % len(url_list))
    for url in url_list:
      print("  %s" % url.url)