author     dyu@chromium.org <dyu@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2011-04-29 19:15:53 +0000
committer  dyu@chromium.org <dyu@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2011-04-29 19:15:53 +0000
commit     5883af9856b04d06dc8572ac381d2207b737c78e (patch)
tree       ce48d5f8b47a634d19ac1aa5ef2820335b90a548 /chrome/tools
parent     70e64f8b4bb55b01555feb4c340943630f19f3ef (diff)
download   chromium_src-5883af9856b04d06dc8572ac381d2207b737c78e.zip
           chromium_src-5883af9856b04d06dc8572ac381d2207b737c78e.tar.gz
           chromium_src-5883af9856b04d06dc8572ac381d2207b737c78e.tar.bz2
Aggregator script used for collecting web pages with fillable forms,
such as registration forms. The script parses a set of links from a text file and crawls each domain looking for web pages with forms, then downloads each such page to an HTML file. webforms_aggregator.py TEST=none BUG=none Review URL: http://codereview.chromium.org/6577026 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@83567 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/tools')
-rw-r--r--  chrome/tools/webforms_aggregator.py        826
-rw-r--r--  chrome/tools/webforms_aggregator_tests.py    56
-rw-r--r--  chrome/tools/weburl_links.txt                66
3 files changed, 948 insertions, 0 deletions
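
For orientation, the new module can also be driven programmatically in the same
way its main() handles a single url. A minimal sketch (the url below is just a
placeholder; the third-party modules httplib2, lxml and pycurl must be
installed):

    # Sketch: crawl one site for a registration page, mirroring the
    # single-url branch of main() in webforms_aggregator.py.
    import logging
    import webforms_aggregator

    crawler = webforms_aggregator.Crawler('http://www.example.com', logging.INFO)
    found = crawler.Run()  # True if a registration page was saved under
                           # REGISTER_PAGE_DIR, False otherwise.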
diff --git a/chrome/tools/webforms_aggregator.py b/chrome/tools/webforms_aggregator.py
new file mode 100644
index 0000000..17e30d6
--- /dev/null
+++ b/chrome/tools/webforms_aggregator.py
@@ -0,0 +1,826 @@
+#!/usr/bin/python
+# Copyright (c) 2011 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Downloads web pages with fillable forms after parsing through a set of links.
+
+Used for collecting web pages with forms; run as a standalone script.
+This script assumes that it is run from the directory in which it is checked
+in. If the script is run from elsewhere, the path in REGISTER_PAGE_DIR needs
+to be updated accordingly.
+
+This script requires the following third-party modules to be installed:
+httplib2, lxml, pycurl.
+
+Usage: webforms_aggregator.py [options] [single url or file containing urls]
+
+Options:
+ -l LOG_LEVEL, --log_level LOG_LEVEL
+ LOG_LEVEL: debug, info, warning or error [default: error]
+ -h, --help show this help message and exit
+"""
+
+import datetime
+import errno
+import logging
+from optparse import OptionParser
+import os
+import re
+import sys
+import tempfile
+import threading
+import time
+from urlparse import urlparse, urljoin
+
+from httplib2 import iri2uri
+from lxml import html, etree
+import pycurl
+
+REGISTER_PAGE_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
+ 'heuristics', 'input')
+NOT_FOUND_REG_PAGE_SITES_FILENAME = 'notFoundRegPageSites.txt'
+
+FORM_LOCATION_COMMENT = 'Form Location: %s'
+HTML_FILE_PREFIX = 'grabber-'
+
+MAX_REDIRECTIONS = 10
+
+# Strings in a webpage that are indicative of a registration link.
+LINK_CLUES = ['regist', 'user', 'sign', 'login', 'account']
+
+MAX_SAME_DOMAIN_URLS_NO = 30
+MAX_TOTAL_URLS_PER_DOMAIN = 300
+MAX_OPEN_FILES_NO = 500
+
+# URLs are selected for downloading with the following rules from the link
+# lists, giving more weight to the links that contain a link clue.
+CLUE_SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
+CLUE_GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
+SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10
+GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10
+
+MAX_ALLOWED_THREADS = MAX_OPEN_FILES_NO / MAX_SAME_DOMAIN_URLS_NO + 1
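+# With the defaults above (and Python 2 integer division):
+#   CLUE_SECURE_LINKS_NO = CLUE_GENERAL_LINKS_NO = 30 * 3/10 = 9
+#   SECURE_LINKS_NO = GENERAL_LINKS_NO = 30 * 2/10 = 6  (9 + 9 + 6 + 6 = 30)
+#   MAX_ALLOWED_THREADS = 500/30 + 1 = 17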
+
+
+class Retriever(object):
+ """Download, parse, and check if the web page contains a registration form.
+
+  If the page does not contain a registration form, its url links are collected
+  instead, so that each of them can be visited later. The page's HTML content
+  is kept in a string member of the Retriever object.
+ """
+ logger = logging.getLogger(__name__)
+
+ def __init__(self, url, domain, cookie_file):
+ """Initializes a Retriever object.
+
+ Args:
+ url: url to download page from.
+ domain: only links with this domain will be retrieved.
+ cookie_file: the name of a cookie file, needed for pages that use session
+ cookies to change their contents.
+ """
+ self._url = url
+ self._domain = domain
+ self._html_content = ''
+
+ # Http links without clues from LINK_CLUES.
+ self._general_links = []
+ # Http links that contain a clue from LINK_CLUES.
+ self._clues_general_links = []
+ # Https links that do not contain any clues from LINK_CLUES.
+ self._secure_links = []
+ # Https links that contain a clue from LINK_CLUES.
+ self._clues_secure_links = []
+ self._cookie_file = cookie_file
+ self._curl_object = None
+
+ def __del__(self):
+ """Cleans up before this object is destroyed.
+
+ The function closes the corresponding curl object that does the downloading.
+ """
+ if self._curl_object:
+ self._curl_object.close()
+
+ def _GetJavaScriptRedirectionURL(self):
+ """Checks whether the page contains js redirection to another location.
+
+ Returns:
+ The js redirection URL if one exists, or the empty string otherwise.
+ """
+ try:
+ tree = html.fromstring(self._html_content, parser=html.HTMLParser())
+ except etree.XMLSyntaxError:
+ return ''
+ redirect_url = ''
+ script_elements = tree.iter('script')
+ for script_elem in script_elements:
+      script_text = html.tostring(
+          script_elem, method='text', encoding='UTF-8').strip()
+      if re.match(r'^window.location', script_text):
+        m = re.search(r'(?P<quote>[\'\"])(?P<redirect_url>.*?)\1', script_text)
+ if m:
+ redirect_url = urljoin(self._url, m.group('redirect_url'))
+ break
+ return redirect_url
+
+ def _AddLink(self, link):
+ """Adds url |link|, if not already present, to the appropriate list.
+
+    The link only gets added to the single list that is appropriate for it:
+    _secure_links, _general_links, _clues_secure_links or _clues_general_links.
+
+    Args:
+      link: the url that is inserted into the appropriate links list.
+ """
+ link_parsed = urlparse(link)
+ link_lists = [self._clues_secure_links, self._secure_links,
+ self._clues_general_links, self._general_links]
+    # Only keep links that are within the domain and not already collected.
+ if ((self._domain in link_parsed[1]) and
+ all(map(lambda x: link not in x, link_lists))):
+ for clue in LINK_CLUES:
+ if clue in link.lower():
+ if link_parsed[0].startswith('https'):
+ self._clues_secure_links.append(link)
+ return
+ else:
+ self._clues_general_links.append(link)
+ return
+ if link_parsed[0].startswith('https'): # No clues found in the link.
+ self._secure_links.append(link)
+ else:
+ self._general_links.append(link)
+
+  def ParseAndGetLinks(self):
+    """Parses the page; gets its url links if it is not a registration page.
+
+    Checks whether the current page contains a registration form. If it does
+    not, the page's url links are collected. If it does, the page is saved as
+    'grabber-' + domain + '.html' after the FORM_LOCATION_COMMENT has been
+    added to it.
+
+ Returns:
+ True if current page contains a registration form, and False otherwise.
+
+ Raises:
+      IOError: if the file cannot be written.
+ """
+ if not self._domain:
+ self.logger.error('Error: self._domain was not set')
+ sys.exit(1)
+    m = re.findall(r'(?P<quote>[\'\"])(?P<link>https?://.*?)\1',
+ self._html_content)
+ for link in m:
+ self._AddLink(link[1])
+ try:
+ tree = html.fromstring(self._html_content, parser=html.HTMLParser())
+ except etree.XMLSyntaxError:
+ self.logger.info('\t\tSkipping: %s <<< %s',
+ 'not valid HTML code in this page', self._url)
+ return False
+ try:
+ body_iterator = tree.iter('body')
+ body = body_iterator.next()
+ except StopIteration:
+ self.logger.info('\t\tSkipping: %s <<< %s',
+ 'no "BODY" tag in this page', self._url)
+ return False
+
+ # Get a list of all input elements with attribute type='password'
+ password_elements = list(body.iterfind('.//input[@type="password"]'))
+ # Check for multiple password elements to distinguish between a login form
+ # and a registration form (Password field and Confirm Password field).
+ if password_elements and len(password_elements) >= 2:
+ form_elements = []
+ for password_elem in password_elements:
+ form_elem = password_elem.xpath('ancestor::form[1]')
+ if not form_elem:
+ continue
+        if form_elem[0] not in form_elements:
+          form_elements.append(form_elem[0])
+        else:
+          # Two password fields appear in the same form: treat this page as
+          # containing a registration form.
+ if not os.path.isdir(REGISTER_PAGE_DIR):
+ os.mkdir(REGISTER_PAGE_DIR)
+ # Locate the HTML tag and insert the form location comment after it.
+ html_tag = tree.iter('html').next()
+ comment = etree.Comment(FORM_LOCATION_COMMENT % self._url)
+ html_tag.insert(0, comment)
+ # Create a new file and save the HTML registration page code.
+ f = open('%s/%s%s.html' % (REGISTER_PAGE_DIR, HTML_FILE_PREFIX,
+ self._domain), 'wb')
+ try:
+ f.write(html.tostring(tree, pretty_print=True))
+ except IOError as e:
+ self.logger.error('Error: %s', e)
+ raise
+ finally:
+ f.close()
+ return True # Registration page found.
+ # Indicates page is not a registration page and links must be parsed.
+ link_elements = list(body.iter('a'))
+ for link_elem in link_elements:
+ link = link_elem.get('href')
+ if not link or '#' == link[0]:
+ continue
+ link = urljoin(self._url, link)
+ link_parsed = urlparse(link)
+ if not link_parsed[0].startswith('http'):
+ continue
+ self._AddLink(link)
+ return False # Registration page not found.
+
+ def _GetHeaders(self, buff):
+ """Gets the headers of a url HEAD or GET request.
+
+    The headers are stored in object attributes rather than returned by this
+    function.
+
+    Args:
+      buff: a single header line. It is inspected for a 'Location:' header; if
+        one is found, self._url is updated to the redirected url.
+ """
+ if buff.lower().startswith('location:'):
+ self._redirect_url = buff[9:].strip()
+ self._url = urljoin(self._url, self._redirect_url)
+
+ def InitRequestHead(self):
+ """Initializes curl object for a HEAD request.
+
+    A HEAD request is issued first so that the headers can tell us whether this
+    is a valid HTML page. If it is not, no GET request is made, which saves an
+    unnecessary download.
+ """
+ self._curl_object = pycurl.Curl()
+ # Handles sites with unicode URLs.
+ if isinstance(self._url, unicode):
+ self._url = str(iri2uri(self._url))
+ self._curl_object.setopt(pycurl.URL, self._url)
+    # The following line works around an error in the GnuTLS package, which
+    # pycurl depends on for fetching https pages.
+ self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
+ self._curl_object.setopt(pycurl.HEADERFUNCTION, self._GetHeaders)
+ self._curl_object.setopt(pycurl.FOLLOWLOCATION, True)
+ self._curl_object.setopt(pycurl.NOBODY, True)
+    self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False)
+ self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS)
+ self._curl_object.setopt(pycurl.FAILONERROR, False)
+ self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file)
+ self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file)
+ self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30)
+ self._curl_object.setopt(pycurl.TIMEOUT, 300)
+ self._curl_object.setopt(pycurl.NOSIGNAL, 1)
+
+ def InitRequestGet(self):
+ """Initializes curl object for a GET request.
+
+    This is called only for valid HTML pages. pycurl issues a GET request and
+    delivers the page incrementally: each chunk of data that arrives is passed
+    to the write callback, which appends it to self._html_content until the
+    whole page has been downloaded.
+ """
+ self._curl_object.setopt(pycurl.NOBODY, False)
+ self._curl_object.setopt(pycurl.HEADERFUNCTION, lambda buff: None)
+ self._curl_object.setopt(
+ pycurl.WRITEFUNCTION, lambda buff: setattr(
+ self, '_html_content', self._html_content + buff))
+
+ def Download(self):
+ """Downloads the self._url page.
+
+ It first does a HEAD request and then it proceeds to a GET request.
+ It uses a curl object for a single download. This function is called only
+ once for the initial url of a site when we still don't have more urls from a
+ domain.
+
+ Returns:
+ True, if the downloaded page is valid HTML code, or False otherwise.
+ """
+ self.InitRequestHead()
+ try:
+ self._curl_object.perform()
+ except pycurl.error as e:
+ self.logger.error('Error: %s, url: %s', e, self._url)
+ return False
+ content_type = self._curl_object.getinfo(pycurl.CONTENT_TYPE)
+ if content_type and ('text/html' in content_type.lower()):
+ self.InitRequestGet()
+ try:
+ self._curl_object.perform()
+ except pycurl.error as e:
+ self.logger.error('Error: %s, url: %s', e, self._url)
+ return False
+ return True
+ else:
+ self.logger.info('\tSkipping: %s <<< %s',
+ 'Not an HTML page', self._url)
+ return False
+
+ def Run(self):
+ """Called only once for the initial url when we don't have more urls.
+
+ Downloads the originally-specified site url, checks it for redirections,
+ parses it and gets its links.
+
+ Returns:
+ True, if a registration page is found, and False otherwise.
+ """
+ redirection_counter = 0
+ while True:
+ is_HTML = self.Download()
+ if not is_HTML:
+ break
+ redirect_url = self._GetJavaScriptRedirectionURL()
+ if redirect_url:
+ redirection_counter += 1
+ if redirection_counter > MAX_REDIRECTIONS:
+ return False
+ self._url = redirect_url
+ else:
+ break
+ if is_HTML:
+ if not self._domain:
+ url_parsed = urlparse(self._url)
+ self._domain = url_parsed[1]
+ if self._domain.startswith('www.'):
+ self._domain = self._domain[4:]
+ if self.ParseAndGetLinks():
+ return True
+ return False
+
+
+class Crawler(object):
+ """Crawls a site until a registration page is found or max level is reached.
+
+ Creates, uses and destroys Retriever objects. Creates a cookie temp file
+ needed for session cookies. It keeps track of 'visited links' and
+ 'links to visit' of the site. To do this it uses the links discovered from
+ each Retriever object. Use Run() to crawl the site.
+ """
+ try:
+    # Needed on Linux so that pycurl does not crash with a segmentation fault.
+    import signal
+    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
+ except ImportError:
+ pass
+ logger = logging.getLogger(__name__)
+ log_handlers = {'StreamHandler': None}
+
+  def __init__(self, url, logging_level=None):
+    """Initializes the crawler state and creates a temporary cookie file.
+
+    The temporary cookie file is needed for session cookies.
+
+ Args:
+ url: the initial "seed" url of the site.
+ logging_level: the desired verbosity level, default is None.
+ """
+ if logging_level:
+ if not self.log_handlers['StreamHandler']:
+ console = logging.StreamHandler()
+ console.setLevel(logging.INFO)
+ self.log_handlers['StreamHandler'] = console
+ self.logger.addHandler(console)
+ self.logger.setLevel(logging_level)
+ else:
+ if self.log_handlers['StreamHandler']:
+ self.logger.removeHandler(self.log_handlers['StreamHandler'])
+ self.log_handlers['StreamHandler'] = None
+
+ self.url_error = False
+ url_parsed = urlparse(url)
+ if not url_parsed[0].startswith('http'):
+ self.logger.error(
+ 'Error: "%s" does not begin with http:// or https://', url)
+ self.url_error = True
+ return
+ # Example: if url is 'http://www.ebay.com?name=john' then value [1] or
+ # network location is 'www.ebay.com'.
+ if not url_parsed[1]:
+ self.logger.error('Error: "%s" is not a valid url', url)
+ self.url_error = True
+ return
+ self._root_url = url
+ self._url = url
+ self._domain = ''
+ # Http links that contain a clue from LINK_CLUES.
+ self._clues_general_links = []
+ # Http links that do not contain any clue from LINK_CLUES.
+ self._general_links = []
+ # Https links that contain a clue from LINK_CLUES.
+ self._clues_secure_links = []
+ # Https links that do not contain any clue from LINK_CLUES.
+ self._secure_links = []
+ # All links downloaded and parsed so far.
+ self._links_visited = []
+ self._retrievers_list = []
+ self._cookie_file = tempfile.NamedTemporaryFile(
+ suffix='.cookie', delete=False)
+ self._cookie_file.close()
+ self._cookie_file = self._cookie_file.name # Keep only the filename.
+
+ def __del__(self):
+ """Deletes cookie file when Crawler instances are destroyed."""
+ if hasattr(self, '_cookie_file'):
+ self.logger.info('Deleting cookie file %s ...', self._cookie_file)
+ os.unlink(self._cookie_file)
+
+ def _MultiPerform(self, curl_multi_object):
+ """Performs concurrent downloads using a CurlMulti object.
+
+ Args:
+ curl_multi_object: a curl object that downloads multiple pages
+ concurrently.
+ """
+ while True:
+ ret, no_handles = curl_multi_object.perform()
+      # The code below follows the CurlMulti example at
+      # http://pycurl.sourceforge.net/doc/curlmultiobject.html.
+ if ret != pycurl.E_CALL_MULTI_PERFORM:
+ break
+ while no_handles:
+ curl_multi_object.select(1.0)
+ while True:
+ ret, no_handles = curl_multi_object.perform()
+ if ret != pycurl.E_CALL_MULTI_PERFORM:
+ break
+
+ def _GetLinksPages(self, curl_multi_object):
+ """Downloads many pages concurrently using a CurlMulti Object.
+
+ Creates many Retriever objects and adds them to a list. The constant
+ MAX_SAME_DOMAIN_URLS_NO defines the number of pages that can be downloaded
+ concurrently from the same domain using the pycurl multi object. It's
+ currently set to 30 URLs. These URLs are taken from the links lists, which
+ are from csl, gcl, sl, and gl. The rules define how many URLs are taken from
+ each list during each iteration.
+
+ Example of the rules:
+ 3/10 from csl results in 9 URLs
+ 3/10 from cgl results in 9 URLs
+ 2/10 from sl results in 6 URLs
+ 2/10 from gl results in 6 URLs
+
+ Adding up the above URLs gives 30 URLs that can be downloaded concurrently.
+    If some lists hold fewer items than their quota (for example, a site with
+    no secure links leaves csl and sl empty), the shortfall is redistributed to
+    the remaining lists; if those run short as well, fewer than 30 pages are
+    downloaded concurrently from the domain.
+
+ Args:
+ curl_multi_object: Each Retriever object has a curl object which is
+ added to the CurlMulti Object.
+ """
+ self._retrievers_list = []
+
+ csl_no = min(CLUE_SECURE_LINKS_NO, len(self._clues_secure_links))
+ cgl_no = min(CLUE_GENERAL_LINKS_NO, len(self._clues_general_links))
+ sl_no = min(SECURE_LINKS_NO, len(self._secure_links))
+ gl_no = min(GENERAL_LINKS_NO, len(self._general_links))
+
+ # If some links within the list have fewer items than needed, the missing
+ # links will be taken by the following priority: csl, cgl, sl, gl.
+ # c: clues, s: secure, g: general, l: list.
+ spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
+ if spare_links > 0:
+ csl_no = min(csl_no + spare_links, len(self._clues_secure_links))
+ spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
+ if spare_links > 0:
+ cgl_no = min(cgl_no + spare_links, len(self._clues_general_links))
+ spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
+ if spare_links > 0:
+ sl_no = min(sl_no + spare_links, len(self._secure_links))
+ spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
+ if spare_links > 0:
+ gl_no = min(gl_no + spare_links, len(self._general_links))
+
+ m = curl_multi_object
+ for no_of_links, links in [
+ (csl_no, self._clues_secure_links),
+ (sl_no, self._secure_links),
+ (cgl_no, self._clues_general_links),
+ (gl_no, self._general_links)]:
+ for i in xrange(no_of_links):
+ if not links:
+ break
+ url = links.pop(0)
+ self._links_visited.append(url)
+ r = Retriever(url, self._domain, self._cookie_file)
+ r.InitRequestHead()
+ m.add_handle(r._curl_object)
+ self._retrievers_list.append(r)
+
+ if self._retrievers_list:
+ try:
+ self._MultiPerform(m)
+ except pycurl.error as e:
+ self.logger.error('Error: %s, url: %s', e, self._url)
+ finally:
+ prRetrList = self._retrievers_list
+ self._retrievers_list = []
+ for r in prRetrList:
+ m.remove_handle(r._curl_object)
+ while prRetrList:
+ r = prRetrList.pop(0)
+ content_type = r._curl_object.getinfo(pycurl.CONTENT_TYPE)
+ if content_type and ('text/html' in content_type.lower()):
+ r.InitRequestGet()
+ m.add_handle(r._curl_object)
+ self._retrievers_list.append(r)
+ else:
+ self.logger.info('\tSkipping: %s <<< %s',
+ 'Not an HTML page', r._url)
+ if self._retrievers_list:
+ try:
+ self._MultiPerform(m)
+        except pycurl.error as e:
+          self.logger.error('Error: %s, url: %s', e, self._url)
+ finally:
+ for r in self._retrievers_list:
+ m.remove_handle(r._curl_object)
+ self.logger.info('Downloaded: %s', r._url)
+
+  def _LogRegPageFound(self, retriever):
+    """Logs a notice that a registration page has been found.
+
+ Args:
+ retriever: The object that has retrieved the page.
+ """
+ self.logger.info('\t##############################################')
+ self.logger.info('\t### %s ###', retriever._domain)
+ self.logger.info('\t##############################################')
+ self.logger.info('\t!!!!!!!!! registration page FOUND !!!!!!!!!!!')
+ self.logger.info('\t%s', retriever._url)
+ self.logger.info('\t##############################################')
+
+ def _GetNewLinks(self, retriever):
+ """Appends new links discovered by each retriever to the appropriate lists.
+
+    Links are copied to the link lists of the crawler object, which hold all
+    the links found by every retriever that the crawler object created. The
+    Crawler object lives for as long as a specific site is examined, whereas a
+    Retriever object lives only while a single page of that site is examined.
+
+ Args:
+      retriever: a temporary object that downloads a specific page, parses the
+        content and collects the page's href links.
+ """
+    for link in retriever._clues_secure_links:
+      if (link not in self._clues_secure_links and
+          link not in self._links_visited):
+        self._clues_secure_links.append(link)
+    for link in retriever._secure_links:
+      if (link not in self._secure_links and
+          link not in self._links_visited):
+        self._secure_links.append(link)
+    for link in retriever._clues_general_links:
+      if (link not in self._clues_general_links and
+          link not in self._links_visited):
+        self._clues_general_links.append(link)
+    for link in retriever._general_links:
+      if (link not in self._general_links and
+          link not in self._links_visited):
+        self._general_links.append(link)
+
+ def Run(self):
+ """Runs the Crawler.
+
+    Creates a Retriever object and calls its Run method to get the first links,
+    then uses a CurlMulti object and creates many more Retriever objects to
+    fetch the subsequent pages.
+
+    The number of pages (that is, Retriever objects) created in each iteration
+    is restricted by MAX_SAME_DOMAIN_URLS_NO. After each batch of Retriever
+    objects has downloaded and parsed its pages, the process repeats. The total
+    number of pages visited is kept in urls_visited. If no registration page is
+    found, the Crawler gives up once MAX_TOTAL_URLS_PER_DOMAIN pages have been
+    visited.
+
+    Returns:
+      True if a registration page is found, and False otherwise.
+ """
+ reg_page_found = False
+ if self.url_error:
+ return False
+ r = Retriever(self._url, self._domain, self._cookie_file)
+ if r.Run():
+ self._LogRegPageFound(r)
+ reg_page_found = True
+ else:
+ self._url = r._url
+ self._domain = r._domain
+ self.logger.info('url to crawl: %s', self._url)
+ self.logger.info('domain: %s', self._domain)
+ self._links_visited.append(r._url)
+ self._GetNewLinks(r)
+ urls_visited = 1
+      while True:
+        if (not (self._clues_secure_links or self._secure_links or
+                 self._clues_general_links or self._general_links) or
+            urls_visited >= MAX_TOTAL_URLS_PER_DOMAIN):
+          break  # Out of links, or the per-domain URL limit was reached.
+        m = pycurl.CurlMulti()
+        self._GetLinksPages(m)
+        urls_visited += len(self._retrievers_list)  # Pages fetched this round.
+        self.logger.info('\t<----- URLs visited for domain "%s": %d ----->',
+                         self._domain, urls_visited)
+ for r in self._retrievers_list:
+ if r.ParseAndGetLinks():
+ self._LogRegPageFound(r)
+ reg_page_found = True
+ break
+ else:
+ self.logger.info('parsed: %s', r._url)
+ self._GetNewLinks(r)
+ m.close()
+ if reg_page_found:
+ break
+ while self._retrievers_list:
+ r = self._retrievers_list.pop()
+ return reg_page_found
+
+
+class WorkerThread(threading.Thread):
+ """Creates a new thread of execution."""
+  def __init__(self, url):
+    """Creates the _url and pageFound attributes.
+
+    They are read after the thread terminates in order to build the file that
+    lists the urls for which no registration page was found.
+
+ Args:
+ url: will be used as an argument to create a Crawler object later.
+ """
+ threading.Thread.__init__(self)
+ self._url = url
+ self.pageFound = False
+
+  def run(self):
+    """Thread entry point: creates a Crawler object and runs it.
+
+    Caution: this method name must not be changed to 'Run' or any other name
+    because it overrides the 'run' method of the 'threading.Thread' class;
+    otherwise it will never be called.
+ """
+ c = Crawler(self._url)
+ if c.Run():
+ self.pageFound = True
+
+
+class ThreadedCrawler(object):
+  """Runs WorkerThread objects, each of which creates and runs a Crawler.
+
+  The Crawler objects run concurrently, each one examining a single site.
+ """
+ logger = logging.getLogger(__name__)
+ log_handlers = {'StreamHandler': None}
+
+  def __init__(self, urls_file, logging_level=None):
+    """Reads the urls to crawl from a file and sets up logging.
+
+    Args:
+      urls_file: name of a file containing the urls to crawl, one per line.
+      logging_level: verbosity level, default is None.
+
+    Raises:
+      IOError: if no valid urls are found in the file.
+ """
+ if logging_level:
+ if not self.log_handlers['StreamHandler']:
+ console = logging.StreamHandler()
+ console.setLevel(logging.INFO)
+ self.log_handlers['StreamHandler'] = console
+ self.logger.addHandler(console)
+ self.logger.setLevel(logging_level)
+ else:
+ if self.log_handlers['StreamHandler']:
+ self.logger.removeHandler(self.log_handlers['StreamHandler'])
+ self.log_handlers['StreamHandler'] = None
+ self._urls_list = []
+ f = open(urls_file)
+ try:
+ for url in f.readlines():
+ url = url.strip()
+ if not urlparse(url)[0].startswith('http'):
+ self.logger.info(
+ '%s: skipping this (does not begin with "http://")', url)
+ continue
+ self._urls_list.append(url)
+ except IOError as e:
+ self.logger.error('Error: %s', e)
+ raise
+ finally:
+ f.close()
+ if not self._urls_list:
+ raise IOError('no URLs were found')
+
+ def Run(self):
+ """Runs Crawler objects using python threads.
+
+ Number of concurrent threads is restricted to MAX_ALLOWED_THREADS.
+
+    Raises:
+      OSError: if the output directory cannot be created (an already-existing
+        directory is tolerated).
+ """
+ t0 = datetime.datetime.now()
+ if self._urls_list:
+ allThreads = []
+ # originalNumThreads is the number of threads just before the
+ # ThreadedCrawler starts creating new threads. As a standalone script it
+ # will be 1.
+ originalNumThreads = threading.active_count()
+ for url in self._urls_list:
+ self.logger.info('url fed to a crawler thread: %s', url)
+ t = WorkerThread(url)
+ t.start()
+ allThreads.append(t)
+ while threading.active_count() >= (
+ MAX_ALLOWED_THREADS + originalNumThreads):
+ time.sleep(.4)
+ while threading.active_count() > originalNumThreads:
+ time.sleep(.4)
+ self.logger.info('----------------')
+ self.logger.info('--- FINISHED ---')
+ self.logger.info('----------------')
+ urls_no = 0
+ urls_not_found_no = 0
+ not_file_name = os.path.join(
+ REGISTER_PAGE_DIR, NOT_FOUND_REG_PAGE_SITES_FILENAME)
+ not_file_dir = os.path.dirname(not_file_name)
+ try:
+ os.makedirs(not_file_dir)
+ except OSError as e:
+ if e.errno != errno.EEXIST:
+ raise
+ fnot = open(not_file_name, 'wb')
+ try:
+ for t in sorted(allThreads, key=lambda t: t._url):
+ urls_no += 1
+ if not t.pageFound:
+ urls_not_found_no += 1
+ fnot.write('%s' % t._url)
+ fnot.write(os.linesep)
+ except IOError as e:
+ self.logger.error('Error: %s', e)
+ finally:
+ fnot.close()
+ self.logger.info('Total number of urls given: %d\n', urls_no)
+ self.logger.info(
+ 'Registration pages found: %d\n', (urls_no - urls_not_found_no))
+ self.logger.info(
+ 'URLs that did not return a registration page: %d\n',
+ urls_not_found_no)
+ t1 = datetime.datetime.now()
+ delta_t = t1 - t0
+ self.logger.info('started at: %s\n', t0)
+ self.logger.info('ended at: %s\n', t1)
+ self.logger.info('execution time was: %s\n', delta_t)
+ else:
+ self.logger.error('Error: %s', 'no URLs were found')
+ sys.exit(1)
+
+
+def main():
+ # Command line options.
+ usage = 'usage: %prog [options] single_url_or_urls_filename'
+ parser = OptionParser(usage)
+ parser.add_option(
+ '-l', '--log_level', metavar='LOG_LEVEL', default='error',
+ help='LOG_LEVEL: debug, info, warning or error [default: %default]')
+
+ (options, args) = parser.parse_args()
+  options.log_level = getattr(logging, options.log_level.upper(), None)
+ if not options.log_level:
+ print 'Wrong log_level argument.'
+ parser.print_help()
+ sys.exit(1)
+
+ if len(args) != 1:
+ print 'Wrong number of arguments.'
+ parser.print_help()
+ sys.exit(1)
+
+ if os.path.isfile(args[0]):
+ c = ThreadedCrawler(args[0], options.log_level)
+ c.Run()
+ else:
+ t0 = datetime.datetime.now()
+ c = Crawler(args[0], options.log_level)
+ c.Run()
+ logger = logging.getLogger(__name__)
+ if c.url_error:
+    logger.error(
+        'Error: "%s" is neither a valid filename nor a valid url', args[0])
+ t1 = datetime.datetime.now()
+ delta_t = t1 - t0
+ logger.info('started at: %s\n', t0)
+ logger.info('ended at: %s\n', t1)
+ logger.info('execution time was: %s\n', delta_t)
+
+
+if __name__ == "__main__":
+ main()
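
The threaded path works the same way for a file of urls; it is the branch that
main() takes when its single argument names an existing file. A minimal sketch,
assuming webforms_aggregator.py and weburl_links.txt sit in the current
directory:

    # Sketch: crawl every url listed in a file, one crawler thread per site,
    # mirroring the file branch of main() in webforms_aggregator.py.
    import logging
    import webforms_aggregator

    threaded_crawler = webforms_aggregator.ThreadedCrawler(
        'weburl_links.txt', logging.INFO)
    threaded_crawler.Run()  # Also writes notFoundRegPageSites.txt for sites
                            # where no registration page was found.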
diff --git a/chrome/tools/webforms_aggregator_tests.py b/chrome/tools/webforms_aggregator_tests.py
new file mode 100644
index 0000000..fc12dc3
--- /dev/null
+++ b/chrome/tools/webforms_aggregator_tests.py
@@ -0,0 +1,56 @@
+#!/usr/bin/python
+# Copyright (c) 2011 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import os
+import tempfile
+import unittest
+
+import webforms_aggregator
+
+
+class WebformsAggregatorTest(unittest.TestCase):
+ """Unit tests for the webforms_aggregator module."""
+
+ def setUp(self):
+ self.cookie_file = 'test.cookie'
+ self.url1 = 'http://www.google.com'
+ self.url2 = 'http://www.macys.com'
+ self.domain = 'google.com'
+ self.url_file = tempfile.NamedTemporaryFile(suffix='.txt', delete=False)
+ self.url_file.file.write(
+ 'URLs to crawl:\n%s\n%s\n' % (self.url1, self.url2))
+ self.url_file.close()
+
+ def tearDown(self):
+ if os.path.isfile(self.cookie_file):
+ os.unlink(self.cookie_file)
+ if os.path.isfile(self.url_file.name):
+ self.url_file.close()
+ os.unlink(self.url_file.name)
+
+ def testRetrieverDownloadsPage(self):
+ """Verify the retriever can download a page."""
+ r = webforms_aggregator.Retriever(self.url1, self.domain, self.cookie_file)
+ self.assertTrue(r.Download(),
+ msg='Retriever could not download "%s"' % self.url1)
+
+ def testCrawlerFindsRegPageFromUrl(self):
+ """Verify that the crawler is able to find a reg page from the given URL."""
+ c = webforms_aggregator.Crawler(self.url2)
+ self.assertTrue(
+ c.Run(), msg='Crawler could not find the reg page of "%s"' % self.url2)
+
+ def testThreadedCrawlerFindsRegPageFromUrlsFile(self):
+ """Verify the threaded crawler finds reg page from a file of URLs."""
+ c = webforms_aggregator.ThreadedCrawler(self.url_file.name)
+ self.assertNotEqual(
+ c.Run(), -1,
+ msg='Threaded crawler could not find the reg page from the URLs file')
+
+
+if __name__ == '__main__':
+ suite = unittest.TestLoader().loadTestsFromTestCase(
+ WebformsAggregatorTest)
+ unittest.TextTestRunner(verbosity=2).run(suite)
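
The suite above runs itself when the file is executed directly. A single test
case can also be selected by name with the standard unittest loader; a minimal
sketch:

    # Sketch: run one test from the new suite by name.
    import unittest
    import webforms_aggregator_tests

    suite = unittest.TestLoader().loadTestsFromName(
        'WebformsAggregatorTest.testRetrieverDownloadsPage',
        webforms_aggregator_tests)
    unittest.TextTestRunner(verbosity=2).run(suite)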
diff --git a/chrome/tools/weburl_links.txt b/chrome/tools/weburl_links.txt
new file mode 100644
index 0000000..6dc642a
--- /dev/null
+++ b/chrome/tools/weburl_links.txt
@@ -0,0 +1,66 @@
+# Web sites retrieved from bugs filed against Autofill and an assortment of well known web sites.
+
+[Top Level]
+http://www.kneedraggers.com
+http://www.macys.com
+http://www.gmail.com
+http://www.mcphee.com
+http://www.ebay.com
+http://www.rocketlawyer.com
+http://www.yahoo.com
+http://www.live.com
+http://www.myspace.com
+http://www.twitter.com
+http://www.aol.com
+http://www.imdb.com
+http://www.photobucket.com
+http://www.conduit.com
+http://www.linkedin.com
+http://www.mediafire.com
+http://www.skyrock.com
+http://www.livejournal.com
+http://www.rediff.com
+http://www.deviantart.com
+http://www.netlog.com
+http://www.valueclickmedia.com
+http://www.sourceforge.net
+http://www.target.com
+http://www.rei.com
+http://www.hotels.com
+http://www.officedepot.com
+
+
+[Direct Links]
+http://www.oases.org/support/donate.php
+http://www.getdigital.de/index/shop
+https://ecomm.dell.com/dellstore/myaccount/signup.aspx
+http://www.aerosoles.com/newuser.asp
+https://trueblue.jetblue.com/web/trueblue/register/
+http://www.spar-momsen.dk/shop/order1.html
+https://www.pyramidcollection.com/OrderForm.asp?customer=new
+http://www.heart.org/HEARTORG/General/Hoops-For-Heart-Registration-Form_UCM_314648_Form.jsp
+http://www.gymboree.com/rewards/enroll_now.jsp
+https://www.jbox.com/account/register
+http://www.uhaul.com/Dealer/
+http://www.facebook.com
+https://signup.live.com/signup.aspx
+http://www.hi5.com
+https://www.adobe.com/cfusion/membership/index.cfm?nl=1&nf=1
+http://www.tagged.com
+http://signup.clicksor.com/advertisers_account.php?service=0&aid=&nid=
+
+
+[Crawl Sites]
+http://www.supershuttle.com
+http://www.continental.com
+http://www.epson.com
+http://www.gap.com
+http://www.groupon.com
+http://www.megagear.com
+http://www.threadless.com
+http://www.bestbuy.com
+http://www.youtube.com
+http://www.amazon.com
+http://www.craigslist.org
+http://www.officemax.com
+http://www.newegg.com