Diffstat (limited to 'tools/site_compare/commands/maskmaker.py')
-rw-r--r--  tools/site_compare/commands/maskmaker.py  298
1 files changed, 298 insertions, 0 deletions
diff --git a/tools/site_compare/commands/maskmaker.py b/tools/site_compare/commands/maskmaker.py
new file mode 100644
index 0000000..95bdeb45
--- /dev/null
+++ b/tools/site_compare/commands/maskmaker.py
@@ -0,0 +1,298 @@

#!/usr/bin/python2.4
# Copyright 2008, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Component for automatically creating masks of changing areas of a website.

Works by repeated invocation of a browser and scraping of the resulting page.
Areas that differ will be added to the auto-generated mask. The mask generator
considers the mask complete when further scrapes fail to produce any
differences in the mask.
+""" + +import os # Functions for walking the directory tree +import tempfile # Get a temporary directory to hold intermediates +import time # Used for sleep() and naming masks by time + +import command_line +import drivers +from PIL import Image +from PIL import ImageChops +import scrapers + + +def CreateCommand(cmdline): + """Inserts the command and arguments into a command line for parsing.""" + cmd = cmdline.AddCommand( + ["maskmaker"], + "Automatically generates a mask from a list of URLs", + ValidateMaskmaker, + ExecuteMaskmaker) + + cmd.AddArgument( + ["-bp", "--browserpath"], "Full path to browser's executable", + type="readfile", metaname="PATH") + cmd.AddArgument( + ["-b", "--browser"], "Which browser to use", type="string", + default="chrome") + cmd.AddArgument( + ["-bv", "--browserver"], "Version of the browser", metaname="VERSION") + cmd.AddArgument( + ["-o", "--outdir"], "Directory to store generated masks", metaname="DIR", + required=True) + cmd.AddArgument( + ["-u", "--url"], "URL to compare") + cmd.AddArgument( + ["-l", "--list"], "List of URLs to compare", type="readfile") + cmd.AddMutualExclusion(["--url", "--list"]) + cmd.AddArgument( + ["-s", "--startline"], "First line of URL list", type="int") + cmd.AddArgument( + ["-e", "--endline"], "Last line of URL list (exclusive)", type="int") + cmd.AddArgument( + ["-c", "--count"], "Number of lines of URL file to use", type="int") + cmd.AddDependency("--startline", "--list") + cmd.AddRequiredGroup(["--url", "--list"]) + cmd.AddDependency("--endline", "--list") + cmd.AddDependency("--count", "--list") + cmd.AddMutualExclusion(["--count", "--endline"]) + cmd.AddDependency("--count", "--startline") + cmd.AddArgument( + ["-t", "--timeout"], "Amount of time (seconds) to wait for browser to " + "finish loading", + type="int", default=60) + cmd.AddArgument( + ["-w", "--wait"], + "Amount of time (in seconds) to wait between successive scrapes", + type="int", default=60) + cmd.AddArgument( + ["-sc", "--scrapes"], + "Number of successive scrapes which must result in no change to a mask " + "before mask creation is considered complete", type="int", default=10) + cmd.AddArgument( + ["-sz", "--size"], "Browser window size", default=(800, 600), type="coords") + cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes") + cmd.AddArgument( + ["-gu", "--giveup"], + "Number of times to scrape before giving up", type="int", default=50) + cmd.AddArgument( + ["-th", "--threshhold"], + "Percentage of different pixels (0-100) above which the scrape will be" + "discarded and the mask not updated.", type="int", default=100) + cmd.AddArgument( + ["--er", "--errors"], + "Number of times a scrape can fail before giving up on the URL.", + type="int", default=1) + + +def ValidateMaskmaker(command): + """Validate the arguments to maskmaker. 
  Raises ParseError if failed."""
  executables = [".exe", ".com", ".bat"]
  if command["--browserpath"]:
    if os.path.splitext(command["--browserpath"])[1].lower() not in executables:
      raise command_line.ParseError("Browser filename must be an executable")


def ExecuteMaskmaker(command):
  """Performs automatic mask generation."""

  # Get the list of URLs to generate masks for
  class MaskmakerURL(object):
    """Helper class for holding information about a URL passed to maskmaker."""
    __slots__ = ['url', 'consecutive_successes', 'errors']
    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline+command["--count"]
    else:
      endline = command["--endline"]
    url_list = [MaskmakerURL(url.strip()) for url in
                open(command["--list"], "r").readlines()[startline:endline]]

  complete_list = []
  error_list = []

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  scrape_pass = 0

  scrapedir = command["--scrapedir"]
  if not scrapedir: scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Repeatedly iterate through the list of URLs until either every URL has
  # a successful mask or too many errors, or we've exceeded the giveup limit
  while url_list and scrape_pass < command["--giveup"]:
    # Scrape each URL
    for url in url_list:
      print "Processing %r..." % url.url
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask. This is in a loop so we can try to recover
      # from error conditions
      while True:
        try:
          mask = Image.open(mask_filename)
          if mask.size != size:
            print "  %r already exists and is the wrong size! (%r vs %r)" % (
              mask_filename, mask.size, size)
            mask_filename = "%s_%r%s" % (
              mask_filename[:-4], size, mask_filename[-4:])
            print "  Trying again as %r..." % mask_filename
            continue
          break
        except IOError:
          print "  %r does not exist, creating" % mask_filename
          mask = Image.new("1", size, 1)
          mask.save(mask_filename)

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
        scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image
      mask_scrapes = os.listdir(mask_scrape_dir)
      mask_scrapes.sort()

      if not mask_scrapes:
        print "  No baseline image found, mask will not be updated"
        baseline = None
      else:
        baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))

      mask_scrape_filename = os.path.join(mask_scrape_dir,
                                          time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
        [url.url], mask_scrape_dir, size, (0, 0),
        command["--timeout"], path=command["--browserpath"],
        filename=mask_scrape_filename)

      if result:
        # Return value other than None means an error
        print "  Scrape failed with error '%r'" % result
        url.errors += 1
        if url.errors >= errors:
          print "  ** Exceeded maximum error count for this URL, giving up"
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))

        # If the difference is none, there's nothing to update
        if max(diff.getextrema()) == (0, 0):
          print "  Scrape identical to baseline, no change in mask"
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print "  ** No change for %r scrapes, done!" % scrapes
        else:
          # Convert the difference to black and white, then change all
          # black pixels (where the scrape and the baseline were identical)
          # to white, and all others (where the scrape and the baseline
          # differed) to black.
          #
          # Since the below command is a little unclear, here's how it works.
          #   1. convert("L") converts the RGB image to grayscale
          #   2. point() maps grayscale values (or the individual channels
          #      of an RGB image) to different ones. Because it operates on
          #      individual channels, the grayscale conversion from step 1
          #      is necessary.
          #   3. The "1" second parameter to point() outputs the result as
          #      a monochrome bitmap. If the original RGB image were converted
          #      directly to monochrome, PIL would dither it.
          diff = diff.convert("L").point([255]+[0]*255, "1")

          # Count the number of different pixels
          diff_pixels = diff.getcolors()[0][0]

          # Is this too much?
          diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])
          if diff_pixel_percent > command["--threshhold"]:
            print ("  Scrape differed from baseline by %.2f percent, ignoring"
                   % diff_pixel_percent)
          else:
            print "  Scrape differed in %d pixels, updating mask" % diff_pixels
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # Reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Remove URLs whose mask is deemed done
    complete_list.extend(
      [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
      [url for url in url_list if url.errors >= errors])
    url_list = [
      url for url in url_list if
      url.consecutive_successes < scrapes and
      url.errors < errors]

    scrape_pass += 1
    print "**Done with scrape pass %d\n" % scrape_pass

    if scrape_pass >= command["--giveup"]:
      print "**Exceeded giveup threshhold. Giving up."
    else:
      print "Waiting %d seconds..." % command["--wait"]
      time.sleep(command["--wait"])

  print
  print "*** MASKMAKER COMPLETE ***"
  print "Summary report:"
  print "  %d masks successfully generated" % len(complete_list)
  for url in complete_list:
    print "    ", url.url
  print "  %d masks failed with too many errors" % len(error_list)
  for url in error_list:
    print "    ", url.url
  if scrape_pass >= command["--giveup"]:
    print ("  %d masks were not completed before "
           "reaching the giveup threshhold" % len(url_list))
    for url in url_list:
      print "    ", url.url
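
The mask-update step at the heart of ExecuteMaskmaker (difference the two scrapes, gate the result through the current mask, threshold it to a 1-bit image, then multiply it into the mask) can be exercised in isolation. Below is a minimal sketch of that technique; it is written for modern Python 3 and Pillow rather than the Python 2.4-era PIL above, and the helper name and filenames are hypothetical placeholders, not part of the tool.

    # Minimal sketch of the maskmaker update step, assuming Python 3 + Pillow.
    from PIL import Image, ImageChops

    def update_mask(mask, baseline, scrape):
        """Fold the differences between two same-size scrapes into a 1-bit mask.

        In the mask, white (1) marks pixels that have never differed;
        black (0) marks pixels that differed on some earlier pass.
        """
        # Difference of the two scrapes, kept only where the mask is still white
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))
        # Grayscale, then map 0 -> white and everything else -> black as a
        # 1-bit image (the same [255] + [0]*255 lookup-table trick as above)
        diff = diff.convert("L").point([255] + [0] * 255, "1")
        # Knock the newly-differing pixels out of the mask
        return ImageChops.multiply(mask, diff)

    baseline = Image.open("scrape1.bmp")      # hypothetical filenames
    scrape = Image.open("scrape2.bmp")
    mask = Image.new("1", baseline.size, 1)   # start all-white: nothing masked
    mask = update_mask(mask, baseline, scrape)
    mask.save("mask.bmp")

Since both the mask and the thresholded difference are mode "1", the final multiply acts as a logical AND: once a pixel has differed on any pass it stays black in every later pass, so the mask only ever grows. That is why the tool can declare a mask complete after --scrapes consecutive passes produce no change.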