Improve the webforms extractor script.

Removing duplicate code by adding command line options to either remove js elements or all non form elements. Add regex pattern to extract the form location comment so the extracted from has the location of the originating form. TEST=none BUG=none Review URL: http://codereview.chromium.org/7381005 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@93247 0039d316-1c4b-4281-b951-d872f2087c98
author: dyu@chromium.org <dyu@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-07-20 21:03:46 +0000
committer: dyu@chromium.org <dyu@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-07-20 21:03:46 +0000
commit: 6bec74ff29932a85fa2c62d1225ad09473963d84 (patch)
tree: 4aa8285e951925b366e3dde40e39ac6a188156c9 /chrome/tools/webforms_extractor.py
parent: e1d9ac808e5684756e82b814c2992ab233105015 (diff)
download: chromium_src-6bec74ff29932a85fa2c62d1225ad09473963d84.zip
chromium_src-6bec74ff29932a85fa2c62d1225ad09473963d84.tar.gz
chromium_src-6bec74ff29932a85fa2c62d1225ad09473963d84.tar.bz2
1 files changed, 253 insertions, 0 deletions
diff --git a/chrome/tools/webforms_extractor.py b/chrome/tools/webforms_extractor.py
new file mode 100644
index 0000000..71fed7c
--- /dev/null
+++ b/chrome/tools/webforms_extractor.py
@@ -0,0 +1,253 @@
+#!/usr/bin/python
+# Copyright (c) 2011 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be found
+# in the LICENSE file.
+
+"""Extracts registration forms from the corresponding HTML files.
+
+Used for extracting forms within HTML files. This script is used in
+conjunction with the webforms_aggregator.py script, which aggregates web pages
+with fillable forms (i.e registration forms).
+
+The purpose of this script is to extract out all non-form elements that may be
+causing parsing errors and timeout issues when running browser_tests.
+
+This script extracts all forms from a HTML file.
+If there are multiple forms per downloaded site, multiple files are created
+for each form.
+
+Used as a standalone script but assumes that it is run from the directory in
+which it is checked into.
+
+Usage: forms_extractor.py [options]
+
+Options:
+  -l LOG_LEVEL, --log_level=LOG_LEVEL,
+    LOG_LEVEL: debug, info, warning or error [default: error]
+  -j, --js  extracts javascript elements from web form.
+  -h, --help  show this help message and exit
+"""
+
+import glob
+import logging
+from optparse import OptionParser
+import os
+import re
+import sys
+
+
+class FormsExtractor(object):
+  """Extracts HTML files, leaving only registration forms from the HTML file."""
+  _HTML_FILES_PATTERN = r'*.html'
+  _HTML_FILE_PREFIX = r'grabber-'
+  _FORM_FILE_PREFIX = r'grabber-stripped-'
+
+  _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
+                                         'heuristics', 'input')
+  _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
+                                      'heuristics', 'input')
+
+  logger = logging.getLogger(__name__)
+  log_handlers = {'StreamHandler': None}
+
+  # This pattern is used for retrieving the form location comment located at the
+  # top of each downloaded HTML file indicating where the form originated from.
+  _RE_FORM_LOCATION_PATTERN = re.compile(
+      ur"""
+      <!--Form\s{1}Location:  # Starting of form location comment.
+      .*?                     # Any characters (non-greedy).
+      -->                     # Ending of the form comment.
+      """, re.U | re.S | re.I | re.X)
+
+  # This pattern is used for removing all script code.
+  _RE_SCRIPT_PATTERN = re.compile(
+      ur"""
+      <script       # A new opening '<script' tag.
+      \b            # The end of the word 'script'.
+      .*?           # Any characters (non-greedy).
+      >             # Ending of the (opening) tag: '>'.
+      .*?           # Any characters (non-greedy) between the tags.
+      </script\s*>  # The '</script>' closing tag.
+      """, re.U | re.S | re.I | re.X)
+
+  # This pattern is used for removing all href js code.
+  _RE_HREF_JS_PATTERN = re.compile(
+      ur"""
+      \bhref             # The word href and its beginning.
+      \s*=\s*            # The '=' with all whitespace before and after it.
+      (?P<quote>[\'\"])  # A single or double quote which is captured.
+      \s*javascript\s*:  # The word 'javascript:' with any whitespace possible.
+      .*?                # Any characters (non-greedy) between the quotes.
+      \1                 # The previously captured single or double quote.
+      """, re.U | re.S | re.I | re.X)
+
+  _RE_EVENT_EXPR = (
+      ur"""
+      \b                 # The beginning of a new word.
+      on\w+?             # All words starting with 'on' (non-greedy)
+                         # example: |onmouseover|.
+      \s*=\s*            # The '=' with all whitespace before and after it.
+      (?P<quote>[\'\"])  # A captured single or double quote.
+      .*?                # Any characters (non-greedy) between the quotes.
+      \1                 # The previously captured single or double quote.
+      """)
+
+  # This pattern is used for removing code with js events, such as |onload|.
+  # By adding the leading |ur'<[^<>]*?'| and the trailing |'ur'[^<>]*?>'| the
+  # pattern matches to strings such as '<tr class="nav"
+  # onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
+  _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
+      ur"""
+      <        # Matches character '<'.
+      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).""" +
+      _RE_EVENT_EXPR +
+      ur"""
+      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).
+      >        # Matches character '>'.
+      """, re.U | re.S | re.I | re.X)
+
+  # Adds whitespace chars at the end of the matched event. Also match trailing
+  # whitespaces for JS events. Do not match leading whitespace.
+  # For example: |< /form>| is invalid HTML and does not exist but |</form >| is
+  # considered valid HTML.
+  _RE_EVENT_PATTERN = re.compile(
+      _RE_EVENT_EXPR + ur'\s*', re.U | re.S | re.I | re.X)
+
+  # This pattern is used for finding form elements.
+  _RE_FORM_PATTERN = re.compile(
+      ur"""
+      <form       # A new opening '<form' tag.
+      \b          # The end of the word 'form'.
+      .*?         # Any characters (non-greedy).
+      >           # Ending of the (opening) tag: '>'.
+      .*?         # Any characters (non-greedy) between the tags.
+      </form\s*>  # The '</form>' closing tag.
+      """, re.U | re.S | re.I | re.X)
+
+  def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
+               output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
+    """Creates a FormsExtractor object.
+
+    Args:
+      input_dir: the directory of HTML files.
+      output_dir: the directory where the registration form files will be
+                  saved.
+      logging_level: verbosity level, default is None.
+
+    Raises:
+      IOError exception if input directory doesn't exist.
+    """
+    if logging_level:
+      if not self.log_handlers['StreamHandler']:
+        console = logging.StreamHandler()
+        console.setLevel(logging.DEBUG)
+        self.log_handlers['StreamHandler'] = console
+        self.logger.addHandler(console)
+      self.logger.setLevel(logging_level)
+    else:
+      if self.log_handlers['StreamHandler']:
+        self.logger.removeHandler(self.log_handlers['StreamHandler'])
+        self.log_handlers['StreamHandler'] = None
+
+    self._input_dir = input_dir
+    self._output_dir = output_dir
+    if not os.path.isdir(self._input_dir):
+      error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
+      self.logger.error('Error: %s', error_msg)
+      raise IOError(error_msg)
+    if not os.path.isdir(output_dir):
+      os.makedirs(output_dir)
+    self._form_location_comment = ''
+
+  def _SubstituteAllEvents(self, matchobj):
+    """Remove all js events that are present as attributes within a tag.
+
+    Args:
+      matchobj: A regexp |re.MatchObject| containing text that has at least one
+                event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
+                onmouseout="mOut1(this);">|.
+
+    Returns:
+      The text containing the tag with all the attributes except for the tags
+      with events. Example: |<tr class="nav">|.
+    """
+    tag_with_all_attrs = matchobj.group(0)
+    return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)
+
+  def Extract(self, strip_js_only):
+    """Extracts and saves the extracted registration forms.
+
+    Iterates through all the HTML files.
+
+    Args:
+      strip_js_only: If True, only Javascript is stripped from the HTML content.
+                     Otherwise, all non-form elements are stripped.
+    """
+    pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
+    html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
+    for filename in html_files:
+      self.logger.info('Stripping file "%s" ...', filename)
+      with open(filename, 'U') as f:
+        html_content = self._RE_TAG_WITH_EVENTS_PATTERN.sub(
+            self._SubstituteAllEvents,
+            self._RE_HREF_JS_PATTERN.sub(
+                '', self._RE_SCRIPT_PATTERN.sub('', f.read())))
+
+        form_filename = os.path.split(filename)[1]  # Path dropped.
+        form_filename = form_filename.replace(self._HTML_FILE_PREFIX, '', 1)
+        (form_filename, extension) = os.path.splitext(form_filename)
+        form_filename = (self._FORM_FILE_PREFIX + form_filename +
+                         '%s' + extension)
+        form_filename = os.path.join(self._output_dir, form_filename)
+        if strip_js_only:
+          form_filename = form_filename % ''
+          try:
+            with open(form_filename, 'w') as f:
+              f.write(html_content)
+          except IOError as e:
+            self.logger.error('Error: %s', e)
+            continue
+        else:  # Remove all non form elements.
+          match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
+          if match:
+            form_location_comment = match.group() + os.linesep
+          else:
+            form_location_comment = ''
+          forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
+          for form_number, form_match in enumerate(forms_iterator, start=1):
+            form_content = form_match.group()
+            numbered_form_filename = form_filename % form_number
+            try:
+              with open(numbered_form_filename, 'w') as f:
+                f.write(form_location_comment)
+                f.write(form_content)
+            except IOError as e:
+              self.logger.error('Error: %s', e)
+              continue
+          self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)
+
+
+def main():
+  # Command line options.
+  parser = OptionParser()
+  parser.add_option(
+      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
+      help='LOG_LEVEL: debug, info, warning or error [default: %default]')
+  parser.add_option(
+      '-j', '--js', dest='js', action='store_true', default=False,
+      help='Removes all javascript elements [default: %default]')
+
+  (options, args) = parser.parse_args()
+  options.log_level = options.log_level.upper()
+  if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
+    print 'Wrong log_level argument.'
+    parser.print_help()
+    sys.exit(1)
+
+  options.log_level = getattr(logging, options.log_level)
+  extractor = FormsExtractor(logging_level=options.log_level)
+  extractor.Extract(options.js)
+
+
+if __name__ == '__main__':
+  main()
author	dyu@chromium.org <dyu@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-07-20 21:03:46 +0000
committer	dyu@chromium.org <dyu@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-07-20 21:03:46 +0000
commit	6bec74ff29932a85fa2c62d1225ad09473963d84 (patch)
tree	4aa8285e951925b366e3dde40e39ac6a188156c9 /chrome/tools/webforms_extractor.py
parent	e1d9ac808e5684756e82b814c2992ab233105015 (diff)
download	chromium_src-6bec74ff29932a85fa2c62d1225ad09473963d84.zip chromium_src-6bec74ff29932a85fa2c62d1225ad09473963d84.tar.gz chromium_src-6bec74ff29932a85fa2c62d1225ad09473963d84.tar.bz2