author | dyu@chromium.org <dyu@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-04-29 19:15:53 +0000
committer | dyu@chromium.org <dyu@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-04-29 19:15:53 +0000
commit | 5883af9856b04d06dc8572ac381d2207b737c78e (patch)
tree | ce48d5f8b47a634d19ac1aa5ef2820335b90a548 /chrome/tools
parent | 70e64f8b4bb55b01555feb4c340943630f19f3ef (diff)
Aggregator script for collecting web pages with fillable forms,
such as registration forms. The script parses a set of links from
a text file, crawls each domain looking for web pages that contain
forms, and downloads each such page to an HTML file.
webforms_aggregator.py
TEST=none
BUG=none
Review URL: http://codereview.chromium.org/6577026
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@83567 0039d316-1c4b-4281-b951-d872f2087c98
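For reference, the module docstring and webforms_aggregator_tests.py in this change show two ways to drive the new script: from the command line with a single URL or a file of URLs, or programmatically through the Crawler and ThreadedCrawler classes. The sketch below is illustrative only; it assumes Python 2 with httplib2, lxml and pycurl installed, is run from chrome/tools/ (so REGISTER_PAGE_DIR resolves), and uses placeholder URLs.

# Command line, per the module docstring:
#   python webforms_aggregator.py -l info http://www.example.com
#   python webforms_aggregator.py -l info weburl_links.txt

# Programmatic use, mirroring webforms_aggregator_tests.py (Python 2):
import logging
import webforms_aggregator

# Crawl one site until a registration form is found or the per-domain
# budget (MAX_TOTAL_URLS_PER_DOMAIN) runs out.
crawler = webforms_aggregator.Crawler('http://www.example.com', logging.INFO)
found_registration_page = crawler.Run()

# Crawl every site listed in a URLs file, up to MAX_ALLOWED_THREADS
# sites at a time.
threaded = webforms_aggregator.ThreadedCrawler('weburl_links.txt', logging.INFO)
threaded.Run()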
Diffstat (limited to 'chrome/tools')
-rw-r--r-- | chrome/tools/webforms_aggregator.py | 826
-rw-r--r-- | chrome/tools/webforms_aggregator_tests.py | 56
-rw-r--r-- | chrome/tools/weburl_links.txt | 66
3 files changed, 948 insertions, 0 deletions
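Before the diff itself, one worked restatement of the crawl budget: the constants at the top of webforms_aggregator.py fix how many URLs are fetched concurrently per domain and how many crawler threads may run at once. The figures below simply evaluate those constants (Python 2 integer division, as in the script); nothing here adds behavior.

# Restating the constants defined in webforms_aggregator.py (Python 2).
MAX_SAME_DOMAIN_URLS_NO = 30   # URLs downloaded concurrently per domain.
MAX_OPEN_FILES_NO = 500

CLUE_SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3 / 10    # 9 https links containing a clue
CLUE_GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3 / 10   # 9 http links containing a clue
SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2 / 10         # 6 remaining https links
GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2 / 10        # 6 remaining http links
# 9 + 9 + 6 + 6 == 30: one full CurlMulti batch; unused quota spills over
# to the other lists in the order csl, cgl, sl, gl.

MAX_ALLOWED_THREADS = MAX_OPEN_FILES_NO / MAX_SAME_DOMAIN_URLS_NO + 1   # 17 concurrent sites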
diff --git a/chrome/tools/webforms_aggregator.py b/chrome/tools/webforms_aggregator.py new file mode 100644 index 0000000..17e30d6 --- /dev/null +++ b/chrome/tools/webforms_aggregator.py @@ -0,0 +1,826 @@ +#!/usr/bin/python +# Copyright (c) 2011 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +"""Downloads web pages with fillable forms after parsing through a set of links. + +Used for collecting web pages with forms. Used as a standalone script. +This script assumes that it's run from within the same directory in which it's +checked into. If this script were to be run elsewhere then the path for +REGISTER_PAGE_DIR needs to be changed. + +This script assumes that third party modules are installed: +httplib2, lxml, pycurl. + +Usage: webforms_aggregator.py [options] [single url or file containing urls] + +Options: + -l LOG_LEVEL, --log_level LOG_LEVEL + LOG_LEVEL: debug, info, warning or error [default: error] + -h, --help show this help message and exit +""" + +import datetime +import errno +import logging +from optparse import OptionParser +import os +import re +import sys +import tempfile +import threading +import time +from urlparse import urlparse, urljoin + +from httplib2 import iri2uri +from lxml import html, etree +import pycurl + +REGISTER_PAGE_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill', + 'heuristics', 'input') +NOT_FOUND_REG_PAGE_SITES_FILENAME = 'notFoundRegPageSites.txt' + +FORM_LOCATION_COMMENT = 'Form Location: %s' +HTML_FILE_PREFIX = 'grabber-' + +MAX_REDIRECTIONS = 10 + +# Strings in a webpage that are indicative of a registration link. +LINK_CLUES = ['regist', 'user', 'sign', 'login', 'account'] + +MAX_SAME_DOMAIN_URLS_NO = 30 +MAX_TOTAL_URLS_PER_DOMAIN = 300 +MAX_OPEN_FILES_NO = 500 + +# URLs are selected for downloading with the following rules from the link +# lists, giving more weight to the links that contain a link clue. +CLUE_SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10 +CLUE_GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10 +SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10 +GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10 + +MAX_ALLOWED_THREADS = MAX_OPEN_FILES_NO / MAX_SAME_DOMAIN_URLS_NO + 1 + + +class Retriever(object): + """Download, parse, and check if the web page contains a registration form. + + If the page does not contain a registration form, the url links are retrieved, + each url is accessed and the HTML content is saved in a string member of the + Retriever object. + """ + logger = logging.getLogger(__name__) + + def __init__(self, url, domain, cookie_file): + """Initializes a Retriever object. + + Args: + url: url to download page from. + domain: only links with this domain will be retrieved. + cookie_file: the name of a cookie file, needed for pages that use session + cookies to change their contents. + """ + self._url = url + self._domain = domain + self._html_content = '' + + # Http links without clues from LINK_CLUES. + self._general_links = [] + # Http links that contain a clue from LINK_CLUES. + self._clues_general_links = [] + # Https links that do not contain any clues from LINK_CLUES. + self._secure_links = [] + # Https links that contain a clue from LINK_CLUES. + self._clues_secure_links = [] + self._cookie_file = cookie_file + self._curl_object = None + + def __del__(self): + """Cleans up before this object is destroyed. + + The function closes the corresponding curl object that does the downloading. 
+ """ + if self._curl_object: + self._curl_object.close() + + def _GetJavaScriptRedirectionURL(self): + """Checks whether the page contains js redirection to another location. + + Returns: + The js redirection URL if one exists, or the empty string otherwise. + """ + try: + tree = html.fromstring(self._html_content, parser=html.HTMLParser()) + except etree.XMLSyntaxError: + return '' + redirect_url = '' + script_elements = tree.iter('script') + for script_elem in script_elements: + str = html.tostring( + script_elem, method='text', encoding='UTF-8').strip() + if re.match(r'^window.location', str): + m = re.search(r'(?P<quote>[\'\"])(?P<redirect_url>.*?)\1', str) + if m: + redirect_url = urljoin(self._url, m.group('redirect_url')) + break + return redirect_url + + def _AddLink(self, link): + """Adds url |link|, if not already present, to the appropriate list. + + The link only gets added to the single list that is appopriate for it: + _secure_links, _general_links, _clues_secure_links or _clues_general_links. + + Args: + link: the url that is inserted to the appropriate links list. + """ + link_parsed = urlparse(link) + link_lists = [self._clues_secure_links, self._secure_links, + self._clues_general_links, self._general_links] + # Checks that the registration page is within the domain. + if ((self._domain in link_parsed[1]) and + all(map(lambda x: link not in x, link_lists))): + for clue in LINK_CLUES: + if clue in link.lower(): + if link_parsed[0].startswith('https'): + self._clues_secure_links.append(link) + return + else: + self._clues_general_links.append(link) + return + if link_parsed[0].startswith('https'): # No clues found in the link. + self._secure_links.append(link) + else: + self._general_links.append(link) + + def ParseAndGetLinks(self): + """Parses downloaded page and gets url link for non registration page. + + Checks if current page contains a registration page and if not it gets + the url links. If it is a registration page, it saves it in a file as + 'grabber-' + domain + '.html' after it has added the FORM_LOCATION_COMMENT + and it returns True. Otherwise it returns False. + + Returns: + True if current page contains a registration form, and False otherwise. + + Raises: + IOError: When can't write to the file. + """ + if not self._domain: + self.logger.error('Error: self._domain was not set') + sys.exit(1) + m = re.findall('(?P<quote>[\'\"])(?P<link>https?://.*?)\1', + self._html_content) + for link in m: + self._AddLink(link[1]) + try: + tree = html.fromstring(self._html_content, parser=html.HTMLParser()) + except etree.XMLSyntaxError: + self.logger.info('\t\tSkipping: %s <<< %s', + 'not valid HTML code in this page', self._url) + return False + try: + body_iterator = tree.iter('body') + body = body_iterator.next() + except StopIteration: + self.logger.info('\t\tSkipping: %s <<< %s', + 'no "BODY" tag in this page', self._url) + return False + + # Get a list of all input elements with attribute type='password' + password_elements = list(body.iterfind('.//input[@type="password"]')) + # Check for multiple password elements to distinguish between a login form + # and a registration form (Password field and Confirm Password field). 
+ if password_elements and len(password_elements) >= 2: + form_elements = [] + for password_elem in password_elements: + form_elem = password_elem.xpath('ancestor::form[1]') + if not form_elem: + continue + if not form_elem[0] in form_elements: + form_elements.append(form_elem[0]) + else: + # Confirms that the page contains a registration form if two passwords + # are contained in the same form for form_elem[0]. + if not os.path.isdir(REGISTER_PAGE_DIR): + os.mkdir(REGISTER_PAGE_DIR) + # Locate the HTML tag and insert the form location comment after it. + html_tag = tree.iter('html').next() + comment = etree.Comment(FORM_LOCATION_COMMENT % self._url) + html_tag.insert(0, comment) + # Create a new file and save the HTML registration page code. + f = open('%s/%s%s.html' % (REGISTER_PAGE_DIR, HTML_FILE_PREFIX, + self._domain), 'wb') + try: + f.write(html.tostring(tree, pretty_print=True)) + except IOError as e: + self.logger.error('Error: %s', e) + raise + finally: + f.close() + return True # Registration page found. + # Indicates page is not a registration page and links must be parsed. + link_elements = list(body.iter('a')) + for link_elem in link_elements: + link = link_elem.get('href') + if not link or '#' == link[0]: + continue + link = urljoin(self._url, link) + link_parsed = urlparse(link) + if not link_parsed[0].startswith('http'): + continue + self._AddLink(link) + return False # Registration page not found. + + def _GetHeaders(self, buff): + """Gets the headers of a url HEAD or GET request. + + The headers are stored into object variables, not returned by this function. + + Args: + buff: each header read. It is needed in case there is a redirection of + the page. If one is found, self._url is changed to this redirected url. + """ + if buff.lower().startswith('location:'): + self._redirect_url = buff[9:].strip() + self._url = urljoin(self._url, self._redirect_url) + + def InitRequestHead(self): + """Initializes curl object for a HEAD request. + + A HEAD request is initiated so that we can check from the headers if this is + a valid HTML file. If it is not a valid HTML file, then we do not initiate a + GET request, saving any unnecessary downloadings. + """ + self._curl_object = pycurl.Curl() + # Handles sites with unicode URLs. + if isinstance(self._url, unicode): + self._url = str(iri2uri(self._url)) + self._curl_object.setopt(pycurl.URL, self._url) + # The following line fixes the GnuTLS package error that pycurl depends + # on for getting https pages. + self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3) + self._curl_object.setopt(pycurl.HEADERFUNCTION, self._GetHeaders) + self._curl_object.setopt(pycurl.FOLLOWLOCATION, True) + self._curl_object.setopt(pycurl.NOBODY, True) + self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False); + self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS) + self._curl_object.setopt(pycurl.FAILONERROR, False) + self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file) + self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file) + self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30) + self._curl_object.setopt(pycurl.TIMEOUT, 300) + self._curl_object.setopt(pycurl.NOSIGNAL, 1) + + def InitRequestGet(self): + """Initializes curl object for a GET request. + + This is called only for valid HTML files. The Pycurl makes a GET request. + The page begins to download, but since not all the data of the pages comes + at once. When some of the data on the page is downloaded Pycurl will put + this data in the buffer. 
The data is appended to the end of the page until + everything is downloaded. + """ + self._curl_object.setopt(pycurl.NOBODY, False) + self._curl_object.setopt(pycurl.HEADERFUNCTION, lambda buff: None) + self._curl_object.setopt( + pycurl.WRITEFUNCTION, lambda buff: setattr( + self, '_html_content', self._html_content + buff)) + + def Download(self): + """Downloads the self._url page. + + It first does a HEAD request and then it proceeds to a GET request. + It uses a curl object for a single download. This function is called only + once for the initial url of a site when we still don't have more urls from a + domain. + + Returns: + True, if the downloaded page is valid HTML code, or False otherwise. + """ + self.InitRequestHead() + try: + self._curl_object.perform() + except pycurl.error as e: + self.logger.error('Error: %s, url: %s', e, self._url) + return False + content_type = self._curl_object.getinfo(pycurl.CONTENT_TYPE) + if content_type and ('text/html' in content_type.lower()): + self.InitRequestGet() + try: + self._curl_object.perform() + except pycurl.error as e: + self.logger.error('Error: %s, url: %s', e, self._url) + return False + return True + else: + self.logger.info('\tSkipping: %s <<< %s', + 'Not an HTML page', self._url) + return False + + def Run(self): + """Called only once for the initial url when we don't have more urls. + + Downloads the originally-specified site url, checks it for redirections, + parses it and gets its links. + + Returns: + True, if a registration page is found, and False otherwise. + """ + redirection_counter = 0 + while True: + is_HTML = self.Download() + if not is_HTML: + break + redirect_url = self._GetJavaScriptRedirectionURL() + if redirect_url: + redirection_counter += 1 + if redirection_counter > MAX_REDIRECTIONS: + return False + self._url = redirect_url + else: + break + if is_HTML: + if not self._domain: + url_parsed = urlparse(self._url) + self._domain = url_parsed[1] + if self._domain.startswith('www.'): + self._domain = self._domain[4:] + if self.ParseAndGetLinks(): + return True + return False + + +class Crawler(object): + """Crawls a site until a registration page is found or max level is reached. + + Creates, uses and destroys Retriever objects. Creates a cookie temp file + needed for session cookies. It keeps track of 'visited links' and + 'links to visit' of the site. To do this it uses the links discovered from + each Retriever object. Use Run() to crawl the site. + """ + try: + # Needed in Linux so that PyCurl does not throw a segmentation fault. + import signal + from signal import SIGPIPE, SIG_IGN + signal.signal(signal.SIGPIPE, signal.SIG_IGN) + except ImportError: + pass + logger = logging.getLogger(__name__) + log_handlers = {'StreamHandler': None} + + def __init__(self, url, logging_level=None): + """Init crawler URL, links lists, logger, and creates a cookie temp file. + + The cookie temp file is needed for session cookies. + + Args: + url: the initial "seed" url of the site. + logging_level: the desired verbosity level, default is None. 
+ """ + if logging_level: + if not self.log_handlers['StreamHandler']: + console = logging.StreamHandler() + console.setLevel(logging.INFO) + self.log_handlers['StreamHandler'] = console + self.logger.addHandler(console) + self.logger.setLevel(logging_level) + else: + if self.log_handlers['StreamHandler']: + self.logger.removeHandler(self.log_handlers['StreamHandler']) + self.log_handlers['StreamHandler'] = None + + self.url_error = False + url_parsed = urlparse(url) + if not url_parsed[0].startswith('http'): + self.logger.error( + 'Error: "%s" does not begin with http:// or https://', url) + self.url_error = True + return + # Example: if url is 'http://www.ebay.com?name=john' then value [1] or + # network location is 'www.ebay.com'. + if not url_parsed[1]: + self.logger.error('Error: "%s" is not a valid url', url) + self.url_error = True + return + self._root_url = url + self._url = url + self._domain = '' + # Http links that contain a clue from LINK_CLUES. + self._clues_general_links = [] + # Http links that do not contain any clue from LINK_CLUES. + self._general_links = [] + # Https links that contain a clue from LINK_CLUES. + self._clues_secure_links = [] + # Https links that do not contain any clue from LINK_CLUES. + self._secure_links = [] + # All links downloaded and parsed so far. + self._links_visited = [] + self._retrievers_list = [] + self._cookie_file = tempfile.NamedTemporaryFile( + suffix='.cookie', delete=False) + self._cookie_file.close() + self._cookie_file = self._cookie_file.name # Keep only the filename. + + def __del__(self): + """Deletes cookie file when Crawler instances are destroyed.""" + if hasattr(self, '_cookie_file'): + self.logger.info('Deleting cookie file %s ...', self._cookie_file) + os.unlink(self._cookie_file) + + def _MultiPerform(self, curl_multi_object): + """Performs concurrent downloads using a CurlMulti object. + + Args: + curl_multi_object: a curl object that downloads multiple pages + concurrently. + """ + while True: + ret, no_handles = curl_multi_object.perform() + # Following code uses the example from section for the CurlMulti object + # at http://pycurl.sourceforge.net/doc/curlmultiobject.html. + if ret != pycurl.E_CALL_MULTI_PERFORM: + break + while no_handles: + curl_multi_object.select(1.0) + while True: + ret, no_handles = curl_multi_object.perform() + if ret != pycurl.E_CALL_MULTI_PERFORM: + break + + def _GetLinksPages(self, curl_multi_object): + """Downloads many pages concurrently using a CurlMulti Object. + + Creates many Retriever objects and adds them to a list. The constant + MAX_SAME_DOMAIN_URLS_NO defines the number of pages that can be downloaded + concurrently from the same domain using the pycurl multi object. It's + currently set to 30 URLs. These URLs are taken from the links lists, which + are from csl, gcl, sl, and gl. The rules define how many URLs are taken from + each list during each iteration. + + Example of the rules: + 3/10 from csl results in 9 URLs + 3/10 from cgl results in 9 URLs + 2/10 from sl results in 6 URLs + 2/10 from gl results in 6 URLs + + Adding up the above URLs gives 30 URLs that can be downloaded concurrently. + If there are fewer items than the defined rules, such as if a site does not + contain any secure links, then csl and sl lists will have 0 length and only + 15 pages will be downloaded concurrently from the same domain. + + Args: + curl_multi_object: Each Retriever object has a curl object which is + added to the CurlMulti Object. 
+ """ + self._retrievers_list = [] + + csl_no = min(CLUE_SECURE_LINKS_NO, len(self._clues_secure_links)) + cgl_no = min(CLUE_GENERAL_LINKS_NO, len(self._clues_general_links)) + sl_no = min(SECURE_LINKS_NO, len(self._secure_links)) + gl_no = min(GENERAL_LINKS_NO, len(self._general_links)) + + # If some links within the list have fewer items than needed, the missing + # links will be taken by the following priority: csl, cgl, sl, gl. + # c: clues, s: secure, g: general, l: list. + spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no) + if spare_links > 0: + csl_no = min(csl_no + spare_links, len(self._clues_secure_links)) + spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no) + if spare_links > 0: + cgl_no = min(cgl_no + spare_links, len(self._clues_general_links)) + spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no) + if spare_links > 0: + sl_no = min(sl_no + spare_links, len(self._secure_links)) + spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no) + if spare_links > 0: + gl_no = min(gl_no + spare_links, len(self._general_links)) + + m = curl_multi_object + for no_of_links, links in [ + (csl_no, self._clues_secure_links), + (sl_no, self._secure_links), + (cgl_no, self._clues_general_links), + (gl_no, self._general_links)]: + for i in xrange(no_of_links): + if not links: + break + url = links.pop(0) + self._links_visited.append(url) + r = Retriever(url, self._domain, self._cookie_file) + r.InitRequestHead() + m.add_handle(r._curl_object) + self._retrievers_list.append(r) + + if self._retrievers_list: + try: + self._MultiPerform(m) + except pycurl.error as e: + self.logger.error('Error: %s, url: %s', e, self._url) + finally: + prRetrList = self._retrievers_list + self._retrievers_list = [] + for r in prRetrList: + m.remove_handle(r._curl_object) + while prRetrList: + r = prRetrList.pop(0) + content_type = r._curl_object.getinfo(pycurl.CONTENT_TYPE) + if content_type and ('text/html' in content_type.lower()): + r.InitRequestGet() + m.add_handle(r._curl_object) + self._retrievers_list.append(r) + else: + self.logger.info('\tSkipping: %s <<< %s', + 'Not an HTML page', r._url) + if self._retrievers_list: + try: + self._MultiPerform(m) + except: + self.logger.error('Error: %s, url: %s', e, self._url) + finally: + for r in self._retrievers_list: + m.remove_handle(r._curl_object) + self.logger.info('Downloaded: %s', r._url) + + def _LogRegPageFound(self, retriever): + """Display logging for registration page found. + + Args: + retriever: The object that has retrieved the page. + """ + self.logger.info('\t##############################################') + self.logger.info('\t### %s ###', retriever._domain) + self.logger.info('\t##############################################') + self.logger.info('\t!!!!!!!!! registration page FOUND !!!!!!!!!!!') + self.logger.info('\t%s', retriever._url) + self.logger.info('\t##############################################') + + def _GetNewLinks(self, retriever): + """Appends new links discovered by each retriever to the appropriate lists. + + Links are copied to the links list of the crawler object, which holds all + the links found from all retrievers that the crawler object created. The + Crawler object exists as far as a specific site is examined and the + Retriever object exists as far as a page of this site is examined. + + Args: + retriever: a temporary object that downloads a specific page, parses the + content and gets the page's href link. 
+ """ + for link in retriever._clues_secure_links: + if (not link in self._clues_secure_links and + not link in self._links_visited): + self._clues_secure_links.append(link) + for link in retriever._secure_links: + if (not link in self._secure_links and + not link in self._links_visited): + self._secure_links.append(link) + for link in retriever._clues_general_links: + if (not link in self._clues_general_links and + not link in self._links_visited): + self._clues_general_links.append(link) + for link in retriever._general_links: + if (not link in self._general_links and + not link in self._links_visited): + self._general_links.append(link) + + def Run(self): + """Runs the Crawler. + + Creates a Retriever object and calls its run method to get the first links, + and then uses CurlMulti object and creates many Retriever objects to get + the subsequent pages. + + The number of pages (=Retriever objs) created each time is restricted by + MAX_SAME_DOMAIN_URLS_NO. After this number of Retriever objects download + and parse their pages, we do the same again. The number of total pages + visited is kept in urls_visited. + If no registration page is found, the Crawler object will give up its try + after MAX_TOTAL_URLS_PER_DOMAIN is reached. + + Returns: + True is returned if registration page is found, or False otherwise. + """ + reg_page_found = False + if self.url_error: + return False + r = Retriever(self._url, self._domain, self._cookie_file) + if r.Run(): + self._LogRegPageFound(r) + reg_page_found = True + else: + self._url = r._url + self._domain = r._domain + self.logger.info('url to crawl: %s', self._url) + self.logger.info('domain: %s', self._domain) + self._links_visited.append(r._url) + self._GetNewLinks(r) + urls_visited = 1 + while True: + if (not (self._clues_secure_links or self._secure_links or + self._clues_general_links or self._general_links or + urls_visited >= MAX_TOTAL_URLS_PER_DOMAIN)): + break # Registration page not found. + m = pycurl.CurlMulti() + self._GetLinksPages(m) + self.logger.info('\t<----- URLs visited for domain "%s": %d ----->', + self._domain, urls_visited) + for r in self._retrievers_list: + if r.ParseAndGetLinks(): + self._LogRegPageFound(r) + reg_page_found = True + break + else: + self.logger.info('parsed: %s', r._url) + self._GetNewLinks(r) + m.close() + if reg_page_found: + break + while self._retrievers_list: + r = self._retrievers_list.pop() + return reg_page_found + + +class WorkerThread(threading.Thread): + """Creates a new thread of execution.""" + def __init__(self, url): + """Creates _url and pageFound attri to populate urls_with_no_reg_page file. + + Used after thread's termination for the creation of a file with a list of + the urls for which a registration page wasn't found. + + Args: + url: will be used as an argument to create a Crawler object later. + """ + threading.Thread.__init__(self) + self._url = url + self.pageFound = False + + def run(self): + """Execution of thread creates a Crawler object and runs it. + + Caution: this function name should not be changed to 'Run' or any other + name becuase it is overriding the 'run' method of the 'threading.Thread' + class. Otherwise it will never be called. + """ + c = Crawler(self._url) + if c.Run(): + self.pageFound = True + + +class ThreadedCrawler(object): + """Calls the Run function of WorkerThread which creates & runs a Crawler obj. + + The crawler object runs concurrently, examining one site each. 
+ """ + logger = logging.getLogger(__name__) + log_handlers = {'StreamHandler': None} + + def __init__(self, urls_file, logging_level=None): + """Creates threaded Crawler objects. + + Args: + urls_file: urls from a file. + logging_level: verbosity level, default is None. + + Raises: + IOError: If cannot find URLs from the list. + """ + if logging_level: + if not self.log_handlers['StreamHandler']: + console = logging.StreamHandler() + console.setLevel(logging.INFO) + self.log_handlers['StreamHandler'] = console + self.logger.addHandler(console) + self.logger.setLevel(logging_level) + else: + if self.log_handlers['StreamHandler']: + self.logger.removeHandler(self.log_handlers['StreamHandler']) + self.log_handlers['StreamHandler'] = None + self._urls_list = [] + f = open(urls_file) + try: + for url in f.readlines(): + url = url.strip() + if not urlparse(url)[0].startswith('http'): + self.logger.info( + '%s: skipping this (does not begin with "http://")', url) + continue + self._urls_list.append(url) + except IOError as e: + self.logger.error('Error: %s', e) + raise + finally: + f.close() + if not self._urls_list: + raise IOError('no URLs were found') + + def Run(self): + """Runs Crawler objects using python threads. + + Number of concurrent threads is restricted to MAX_ALLOWED_THREADS. + + Raises: + OSError: When creating the same directory that already exists. + """ + t0 = datetime.datetime.now() + if self._urls_list: + allThreads = [] + # originalNumThreads is the number of threads just before the + # ThreadedCrawler starts creating new threads. As a standalone script it + # will be 1. + originalNumThreads = threading.active_count() + for url in self._urls_list: + self.logger.info('url fed to a crawler thread: %s', url) + t = WorkerThread(url) + t.start() + allThreads.append(t) + while threading.active_count() >= ( + MAX_ALLOWED_THREADS + originalNumThreads): + time.sleep(.4) + while threading.active_count() > originalNumThreads: + time.sleep(.4) + self.logger.info('----------------') + self.logger.info('--- FINISHED ---') + self.logger.info('----------------') + urls_no = 0 + urls_not_found_no = 0 + not_file_name = os.path.join( + REGISTER_PAGE_DIR, NOT_FOUND_REG_PAGE_SITES_FILENAME) + not_file_dir = os.path.dirname(not_file_name) + try: + os.makedirs(not_file_dir) + except OSError as e: + if e.errno != errno.EEXIST: + raise + fnot = open(not_file_name, 'wb') + try: + for t in sorted(allThreads, key=lambda t: t._url): + urls_no += 1 + if not t.pageFound: + urls_not_found_no += 1 + fnot.write('%s' % t._url) + fnot.write(os.linesep) + except IOError as e: + self.logger.error('Error: %s', e) + finally: + fnot.close() + self.logger.info('Total number of urls given: %d\n', urls_no) + self.logger.info( + 'Registration pages found: %d\n', (urls_no - urls_not_found_no)) + self.logger.info( + 'URLs that did not return a registration page: %d\n', + urls_not_found_no) + t1 = datetime.datetime.now() + delta_t = t1 - t0 + self.logger.info('started at: %s\n', t0) + self.logger.info('ended at: %s\n', t1) + self.logger.info('execution time was: %s\n', delta_t) + else: + self.logger.error('Error: %s', 'no URLs were found') + sys.exit(1) + + +def main(): + # Command line options. 
+ usage = 'usage: %prog [options] single_url_or_urls_filename' + parser = OptionParser(usage) + parser.add_option( + '-l', '--log_level', metavar='LOG_LEVEL', default='error', + help='LOG_LEVEL: debug, info, warning or error [default: %default]') + + (options, args) = parser.parse_args() + options.log_level = getattr(logging, options.log_level.upper(), + default=None) + if not options.log_level: + print 'Wrong log_level argument.' + parser.print_help() + sys.exit(1) + + if len(args) != 1: + print 'Wrong number of arguments.' + parser.print_help() + sys.exit(1) + + if os.path.isfile(args[0]): + c = ThreadedCrawler(args[0], options.log_level) + c.Run() + else: + t0 = datetime.datetime.now() + c = Crawler(args[0], options.log_level) + c.Run() + logger = logging.getLogger(__name__) + if c.url_error: + logger.error( + 'Error: "%s" is neither a valid filename nor a valid url' % args[0]) + t1 = datetime.datetime.now() + delta_t = t1 - t0 + logger.info('started at: %s\n', t0) + logger.info('ended at: %s\n', t1) + logger.info('execution time was: %s\n', delta_t) + + +if __name__ == "__main__": + main() diff --git a/chrome/tools/webforms_aggregator_tests.py b/chrome/tools/webforms_aggregator_tests.py new file mode 100644 index 0000000..fc12dc3 --- /dev/null +++ b/chrome/tools/webforms_aggregator_tests.py @@ -0,0 +1,56 @@ +#!/usr/bin/python +# Copyright (c) 2011 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +import os +import tempfile +import unittest + +import webforms_aggregator + + +class WebformsAggregatorTest(unittest.TestCase): + """Unit tests for the webforms_aggregator module.""" + + def setUp(self): + self.cookie_file = 'test.cookie' + self.url1 = 'http://www.google.com' + self.url2 = 'http://www.macys.com' + self.domain = 'google.com' + self.url_file = tempfile.NamedTemporaryFile(suffix='.txt', delete=False) + self.url_file.file.write( + 'URLs to crawl:\n%s\n%s\n' % (self.url1, self.url2)) + self.url_file.close() + + def tearDown(self): + if os.path.isfile(self.cookie_file): + os.unlink(self.cookie_file) + if os.path.isfile(self.url_file.name): + self.url_file.close() + os.unlink(self.url_file.name) + + def testRetrieverDownloadsPage(self): + """Verify the retriever can download a page.""" + r = webforms_aggregator.Retriever(self.url1, self.domain, self.cookie_file) + self.assertTrue(r.Download(), + msg='Retriever could not download "%s"' % self.url1) + + def testCrawlerFindsRegPageFromUrl(self): + """Verify that the crawler is able to find a reg page from the given URL.""" + c = webforms_aggregator.Crawler(self.url2) + self.assertTrue( + c.Run(), msg='Crawler could not find the reg page of "%s"' % self.url2) + + def testThreadedCrawlerFindsRegPageFromUrlsFile(self): + """Verify the threaded crawler finds reg page from a file of URLs.""" + c = webforms_aggregator.ThreadedCrawler(self.url_file.name) + self.assertNotEqual( + c.Run(), -1, + msg='Threaded crawler could not find the reg page from the URLs file') + + +if __name__ == '__main__': + suite = unittest.TestLoader().loadTestsFromTestCase( + WebformsAggregatorTest) + unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/chrome/tools/weburl_links.txt b/chrome/tools/weburl_links.txt new file mode 100644 index 0000000..6dc642a --- /dev/null +++ b/chrome/tools/weburl_links.txt @@ -0,0 +1,66 @@ +# Web sites retrieved from bugs filed against Autofill and an assortment of well known web sites. 
+ +[Top Level] +http://www.kneedraggers.com +http://www.macys.com +http://www.gmail.com +http://www.mcphee.com +http://www.ebay.com +http://www.rocketlawyer.com +http://www.yahoo.com +http://www.live.com +http://www.myspace.com +http://www.twitter.com +http://www.aol.com +http://www.imdb.com +http://www.photobucket.com +http://www.conduit.com +http://www.linkedin.com +http://www.mediafire.com +http://www.skyrock.com +http://www.livejournal.com +http://www.rediff.com +http://www.deviantart.com +http://www.netlog.com +http://www.valueclickmedia.com +http://www.sourceforge.net +http://www.target.com +http://www.rei.com +http://www.hotels.com +http://www.officedepot.com + + +[Direct Links] +http://www.oases.org/support/donate.php +http://www.getdigital.de/index/shop +https://ecomm.dell.com/dellstore/myaccount/signup.aspx +http://www.aerosoles.com/newuser.asp +https://trueblue.jetblue.com/web/trueblue/register/ +http://www.spar-momsen.dk/shop/order1.html +https://www.pyramidcollection.com/OrderForm.asp?customer=new +http://www.heart.org/HEARTORG/General/Hoops-For-Heart-Registration-Form_UCM_314648_Form.jsp +http://www.gymboree.com/rewards/enroll_now.jsp +https://www.jbox.com/account/register +http://www.uhaul.com/Dealer/ +http://www.facebook.com +https://signup.live.com/signup.aspx +http://www.hi5.com +https://www.adobe.com/cfusion/membership/index.cfm?nl=1&nf=1 +http://www.tagged.com +http://signup.clicksor.com/advertisers_account.php?service=0&aid=&nid= + + +[Crawl Sites] +http://www.supershuttle.com +http://www.continental.com +http://www.epson.com +http://www.gap.com +http://www.groupon.com +http://www.megagear.com +http://www.threadless.com +http://www.bestbuy.com +http://www.youtube.com +http://www.amazon.com +http://www.craigslist.org +http://www.officemax.com +http://www.newegg.com |
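A note on the URL-list format above: ThreadedCrawler reads the file line by line and keeps only lines whose scheme starts with http, so the leading '#' comment and the '[Top Level]', '[Direct Links]' and '[Crawl Sites]' section headers are silently skipped. Below is a minimal sketch of that filter, using the same urlparse check the script uses; the helper name is hypothetical, since the script does this inline in ThreadedCrawler.__init__.

# Python 2 sketch of how ThreadedCrawler filters a URLs file such as
# weburl_links.txt. ReadUrlList is a hypothetical helper for illustration.
from urlparse import urlparse

def ReadUrlList(urls_file):
  urls = []
  with open(urls_file) as f:
    for line in f:
      url = line.strip()
      if not urlparse(url)[0].startswith('http'):
        continue   # Blank lines, comments and '[...]' headers fall out here.
      urls.append(url)
  return urls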