1 files changed, 254 insertions, 253 deletions
diff --git a/chrome/tools/webforms_extractor.py b/chrome/tools/webforms_extractor.py
index 71fed7c..1dd1d95 100644..100755
--- a/chrome/tools/webforms_extractor.py
+++ b/chrome/tools/webforms_extractor.py
@@ -1,253 +1,254 @@
-#!/usr/bin/python
-# Copyright (c) 2011 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be found
-# in the LICENSE file.
-
-"""Extracts registration forms from the corresponding HTML files.
-
-Used for extracting forms within HTML files. This script is used in
-conjunction with the webforms_aggregator.py script, which aggregates web pages
-with fillable forms (i.e registration forms).
-
-The purpose of this script is to extract out all non-form elements that may be
-causing parsing errors and timeout issues when running browser_tests.
-
-This script extracts all forms from a HTML file.
-If there are multiple forms per downloaded site, multiple files are created
-for each form.
-
-Used as a standalone script but assumes that it is run from the directory in
-which it is checked into.
-
-Usage: forms_extractor.py [options]
-
-Options:
-  -l LOG_LEVEL, --log_level=LOG_LEVEL,
-    LOG_LEVEL: debug, info, warning or error [default: error]
-  -j, --js  extracts javascript elements from web form.
-  -h, --help  show this help message and exit
-"""
-
-import glob
-import logging
-from optparse import OptionParser
-import os
-import re
-import sys
-
-
-class FormsExtractor(object):
-  """Extracts HTML files, leaving only registration forms from the HTML file."""
-  _HTML_FILES_PATTERN = r'*.html'
-  _HTML_FILE_PREFIX = r'grabber-'
-  _FORM_FILE_PREFIX = r'grabber-stripped-'
-
-  _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
-                                         'heuristics', 'input')
-  _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
-                                      'heuristics', 'input')
-
-  logger = logging.getLogger(__name__)
-  log_handlers = {'StreamHandler': None}
-
-  # This pattern is used for retrieving the form location comment located at the
-  # top of each downloaded HTML file indicating where the form originated from.
-  _RE_FORM_LOCATION_PATTERN = re.compile(
-      ur"""
-      <!--Form\s{1}Location:  # Starting of form location comment.
-      .*?                     # Any characters (non-greedy).
-      -->                     # Ending of the form comment.
-      """, re.U | re.S | re.I | re.X)
-
-  # This pattern is used for removing all script code.
-  _RE_SCRIPT_PATTERN = re.compile(
-      ur"""
-      <script       # A new opening '<script' tag.
-      \b            # The end of the word 'script'.
-      .*?           # Any characters (non-greedy).
-      >             # Ending of the (opening) tag: '>'.
-      .*?           # Any characters (non-greedy) between the tags.
-      </script\s*>  # The '</script>' closing tag.
-      """, re.U | re.S | re.I | re.X)
-
-  # This pattern is used for removing all href js code.
-  _RE_HREF_JS_PATTERN = re.compile(
-      ur"""
-      \bhref             # The word href and its beginning.
-      \s*=\s*            # The '=' with all whitespace before and after it.
-      (?P<quote>[\'\"])  # A single or double quote which is captured.
-      \s*javascript\s*:  # The word 'javascript:' with any whitespace possible.
-      .*?                # Any characters (non-greedy) between the quotes.
-      \1                 # The previously captured single or double quote.
-      """, re.U | re.S | re.I | re.X)
-
-  _RE_EVENT_EXPR = (
-      ur"""
-      \b                 # The beginning of a new word.
-      on\w+?             # All words starting with 'on' (non-greedy)
-                         # example: |onmouseover|.
-      \s*=\s*            # The '=' with all whitespace before and after it.
-      (?P<quote>[\'\"])  # A captured single or double quote.
-      .*?                # Any characters (non-greedy) between the quotes.
-      \1                 # The previously captured single or double quote.
-      """)
-
-  # This pattern is used for removing code with js events, such as |onload|.
-  # By adding the leading |ur'<[^<>]*?'| and the trailing |'ur'[^<>]*?>'| the
-  # pattern matches to strings such as '<tr class="nav"
-  # onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
-  _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
-      ur"""
-      <        # Matches character '<'.
-      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).""" +
-      _RE_EVENT_EXPR +
-      ur"""
-      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).
-      >        # Matches character '>'.
-      """, re.U | re.S | re.I | re.X)
-
-  # Adds whitespace chars at the end of the matched event. Also match trailing
-  # whitespaces for JS events. Do not match leading whitespace.
-  # For example: |< /form>| is invalid HTML and does not exist but |</form >| is
-  # considered valid HTML.
-  _RE_EVENT_PATTERN = re.compile(
-      _RE_EVENT_EXPR + ur'\s*', re.U | re.S | re.I | re.X)
-
-  # This pattern is used for finding form elements.
-  _RE_FORM_PATTERN = re.compile(
-      ur"""
-      <form       # A new opening '<form' tag.
-      \b          # The end of the word 'form'.
-      .*?         # Any characters (non-greedy).
-      >           # Ending of the (opening) tag: '>'.
-      .*?         # Any characters (non-greedy) between the tags.
-      </form\s*>  # The '</form>' closing tag.
-      """, re.U | re.S | re.I | re.X)
-
-  def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
-               output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
-    """Creates a FormsExtractor object.
-
-    Args:
-      input_dir: the directory of HTML files.
-      output_dir: the directory where the registration form files will be
-                  saved.
-      logging_level: verbosity level, default is None.
-
-    Raises:
-      IOError exception if input directory doesn't exist.
-    """
-    if logging_level:
-      if not self.log_handlers['StreamHandler']:
-        console = logging.StreamHandler()
-        console.setLevel(logging.DEBUG)
-        self.log_handlers['StreamHandler'] = console
-        self.logger.addHandler(console)
-      self.logger.setLevel(logging_level)
-    else:
-      if self.log_handlers['StreamHandler']:
-        self.logger.removeHandler(self.log_handlers['StreamHandler'])
-        self.log_handlers['StreamHandler'] = None
-
-    self._input_dir = input_dir
-    self._output_dir = output_dir
-    if not os.path.isdir(self._input_dir):
-      error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
-      self.logger.error('Error: %s', error_msg)
-      raise IOError(error_msg)
-    if not os.path.isdir(output_dir):
-      os.makedirs(output_dir)
-    self._form_location_comment = ''
-
-  def _SubstituteAllEvents(self, matchobj):
-    """Remove all js events that are present as attributes within a tag.
-
-    Args:
-      matchobj: A regexp |re.MatchObject| containing text that has at least one
-                event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
-                onmouseout="mOut1(this);">|.
-
-    Returns:
-      The text containing the tag with all the attributes except for the tags
-      with events. Example: |<tr class="nav">|.
-    """
-    tag_with_all_attrs = matchobj.group(0)
-    return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)
-
-  def Extract(self, strip_js_only):
-    """Extracts and saves the extracted registration forms.
-
-    Iterates through all the HTML files.
-
-    Args:
-      strip_js_only: If True, only Javascript is stripped from the HTML content.
-                     Otherwise, all non-form elements are stripped.
-    """
-    pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
-    html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
-    for filename in html_files:
-      self.logger.info('Stripping file "%s" ...', filename)
-      with open(filename, 'U') as f:
-        html_content = self._RE_TAG_WITH_EVENTS_PATTERN.sub(
-            self._SubstituteAllEvents,
-            self._RE_HREF_JS_PATTERN.sub(
-                '', self._RE_SCRIPT_PATTERN.sub('', f.read())))
-
-        form_filename = os.path.split(filename)[1]  # Path dropped.
-        form_filename = form_filename.replace(self._HTML_FILE_PREFIX, '', 1)
-        (form_filename, extension) = os.path.splitext(form_filename)
-        form_filename = (self._FORM_FILE_PREFIX + form_filename +
-                         '%s' + extension)
-        form_filename = os.path.join(self._output_dir, form_filename)
-        if strip_js_only:
-          form_filename = form_filename % ''
-          try:
-            with open(form_filename, 'w') as f:
-              f.write(html_content)
-          except IOError as e:
-            self.logger.error('Error: %s', e)
-            continue
-        else:  # Remove all non form elements.
-          match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
-          if match:
-            form_location_comment = match.group() + os.linesep
-          else:
-            form_location_comment = ''
-          forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
-          for form_number, form_match in enumerate(forms_iterator, start=1):
-            form_content = form_match.group()
-            numbered_form_filename = form_filename % form_number
-            try:
-              with open(numbered_form_filename, 'w') as f:
-                f.write(form_location_comment)
-                f.write(form_content)
-            except IOError as e:
-              self.logger.error('Error: %s', e)
-              continue
-          self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)
-
-
-def main():
-  # Command line options.
-  parser = OptionParser()
-  parser.add_option(
-      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
-      help='LOG_LEVEL: debug, info, warning or error [default: %default]')
-  parser.add_option(
-      '-j', '--js', dest='js', action='store_true', default=False,
-      help='Removes all javascript elements [default: %default]')
-
-  (options, args) = parser.parse_args()
-  options.log_level = options.log_level.upper()
-  if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
-    print 'Wrong log_level argument.'
-    parser.print_help()
-    sys.exit(1)
-
-  options.log_level = getattr(logging, options.log_level)
-  extractor = FormsExtractor(logging_level=options.log_level)
-  extractor.Extract(options.js)
-
-
-if __name__ == '__main__':
-  main()
+#!/usr/bin/env python
+# Copyright (c) 2011 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+
+"""Extracts registration forms from the corresponding HTML files.
+
+Used for extracting forms within HTML files. This script is used in
+conjunction with the webforms_aggregator.py script, which aggregates web pages
+with fillable forms (i.e. registration forms).
+
+The purpose of this script is to strip out all non-form elements that may be
+causing parsing errors and timeout issues when running browser_tests.
+
+This script extracts all forms from an HTML file.
+If there are multiple forms per downloaded site, a separate numbered file is
+created for each form.
+
+Used as a standalone script but assumes that it is run from the directory in
+which it is checked in.
+
+Usage: webforms_extractor.py [options]
+
+Options:
+  -l LOG_LEVEL, --log_level=LOG_LEVEL,
+    LOG_LEVEL: debug, info, warning or error [default: error]
+  -j, --js  removes only javascript, keeping all other page content.
+  -h, --help  show this help message and exit
+"""
+
+import glob
+import logging
+from optparse import OptionParser
+import os
+import re
+import sys
+
+
+class FormsExtractor(object):
+  """Extracts HTML files, leaving only registration forms from the HTML file."""
+  _HTML_FILES_PATTERN = r'*.html'
+  _HTML_FILE_PREFIX = r'grabber-'
+  _FORM_FILE_PREFIX = r'grabber-stripped-'
+
+  _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
+                                         'heuristics', 'input')
+  _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
+                                      'heuristics', 'input')
+
+  logger = logging.getLogger(__name__)
+  log_handlers = {'StreamHandler': None}
+
+  # This pattern is used for retrieving the form location comment located at the
+  # top of each downloaded HTML file indicating where the form originated from.
+  _RE_FORM_LOCATION_PATTERN = re.compile(
+      ur"""
+      <!--Form\s{1}Location:  # Starting of form location comment.
+      .*?                     # Any characters (non-greedy).
+      -->                     # Ending of the form comment.
+      """, re.U | re.S | re.I | re.X)
+
+  # This pattern is used for removing all script code.
+  _RE_SCRIPT_PATTERN = re.compile(
+      ur"""
+      <script       # A new opening '<script' tag.
+      \b            # The end of the word 'script'.
+      .*?           # Any characters (non-greedy).
+      >             # Ending of the (opening) tag: '>'.
+      .*?           # Any characters (non-greedy) between the tags.
+      </script\s*>  # The '</script>' closing tag.
+      """, re.U | re.S | re.I | re.X)
+
+  # This pattern is used for removing all href js code.
+  _RE_HREF_JS_PATTERN = re.compile(
+      ur"""
+      \bhref             # The word href and its beginning.
+      \s*=\s*            # The '=' with all whitespace before and after it.
+      (?P<quote>[\'\"])  # A single or double quote which is captured.
+      \s*javascript\s*:  # The word 'javascript:' with any whitespace possible.
+      .*?                # Any characters (non-greedy) between the quotes.
+      \1                 # The previously captured single or double quote.
+      """, re.U | re.S | re.I | re.X)
+
+  _RE_EVENT_EXPR = (
+      ur"""
+      \b                 # The beginning of a new word.
+      on\w+?             # All words starting with 'on' (non-greedy)
+                         # example: |onmouseover|.
+      \s*=\s*            # The '=' with all whitespace before and after it.
+      (?P<quote>[\'\"])  # A captured single or double quote.
+      .*?                # Any characters (non-greedy) between the quotes.
+      \1                 # The previously captured single or double quote.
+      """)
+
+  # This pattern is used for removing code with js events, such as |onload|.
+  # By adding the leading |ur'<[^<>]*?'| and the trailing |ur'[^<>]*?>'| the
+  # pattern matches strings such as '<tr class="nav"
+  # onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
+  _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
+      ur"""
+      <        # Matches character '<'.
+      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).""" +
+      _RE_EVENT_EXPR +
+      ur"""
+      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).
+      >        # Matches character '>'.
+      """, re.U | re.S | re.I | re.X)
+
+  # Matches one js event attribute plus any whitespace that follows it, so that
+  # removing the attribute also removes its trailing whitespace. Leading
+  # whitespace is not matched. For example: |< /form>| is invalid HTML and does
+  # not exist but |</form >| is considered valid HTML.
+  _RE_EVENT_PATTERN = re.compile(
+      _RE_EVENT_EXPR + ur'\s*', re.U | re.S | re.I | re.X)
+
+  # This pattern is used for finding form elements.
+  _RE_FORM_PATTERN = re.compile(
+      ur"""
+      <form       # A new opening '<form' tag.
+      \b          # The end of the word 'form'.
+      .*?         # Any characters (non-greedy).
+      >           # Ending of the (opening) tag: '>'.
+      .*?         # Any characters (non-greedy) between the tags.
+      </form\s*>  # The '</form>' closing tag.
+      """, re.U | re.S | re.I | re.X)
+
+  def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
+               output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
+    """Creates a FormsExtractor object.
+
+    Args:
+      input_dir: the directory of HTML files.
+      output_dir: the directory where the registration form files will be
+                  saved.
+      logging_level: verbosity level, default is None.
+
+    Raises:
+      IOError exception if input directory doesn't exist.
+    """
+    if logging_level:
+      if not self.log_handlers['StreamHandler']:
+        console = logging.StreamHandler()
+        console.setLevel(logging.DEBUG)
+        self.log_handlers['StreamHandler'] = console
+        self.logger.addHandler(console)
+      self.logger.setLevel(logging_level)
+    else:
+      if self.log_handlers['StreamHandler']:
+        self.logger.removeHandler(self.log_handlers['StreamHandler'])
+        self.log_handlers['StreamHandler'] = None
+
+    self._input_dir = input_dir
+    self._output_dir = output_dir
+    if not os.path.isdir(self._input_dir):
+      error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
+      self.logger.error('Error: %s', error_msg)
+      raise IOError(error_msg)
+    if not os.path.isdir(output_dir):
+      os.makedirs(output_dir)
+    self._form_location_comment = ''
+
+  def _SubstituteAllEvents(self, matchobj):
+    """Remove all js events that are present as attributes within a tag.
+
+    Args:
+      matchobj: A regexp |re.MatchObject| containing text that has at least one
+                event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
+                onmouseout="mOut1(this);">|.
+
+    Returns:
+      The text of the tag with all of its event attributes removed; the
+      whitespace before the first event is kept. Example: |<tr class="nav" >|.
+    """
+    tag_with_all_attrs = matchobj.group(0)
+    return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)
+
+  def Extract(self, strip_js_only):
+    """Extracts and saves the extracted registration forms.
+
+    Iterates through all the HTML files.
+
+    Args:
+      strip_js_only: If True, only Javascript is stripped from the HTML content.
+                     Otherwise, all non-form elements are stripped.
+    """
+    pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
+    html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
+    for filename in html_files:
+      self.logger.info('Stripping file "%s" ...', filename)
+      with open(filename, 'U') as f:
+        html_content = self._RE_TAG_WITH_EVENTS_PATTERN.sub(
+            self._SubstituteAllEvents,
+            self._RE_HREF_JS_PATTERN.sub(
+                '', self._RE_SCRIPT_PATTERN.sub('', f.read())))
+
+        form_filename = os.path.split(filename)[1]  # Path dropped.
+        form_filename = form_filename.replace(self._HTML_FILE_PREFIX, '', 1)
+        (form_filename, extension) = os.path.splitext(form_filename)
+        form_filename = (self._FORM_FILE_PREFIX + form_filename +
+                         '%s' + extension)
+        form_filename = os.path.join(self._output_dir, form_filename)
+        if strip_js_only:
+          form_filename = form_filename % ''
+          try:
+            with open(form_filename, 'w') as f:
+              f.write(html_content)
+          except IOError as e:
+            self.logger.error('Error: %s', e)
+            continue
+        else:  # Remove all non form elements.
+          match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
+          if match:
+            form_location_comment = match.group() + os.linesep
+          else:
+            form_location_comment = ''
+          forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
+          for form_number, form_match in enumerate(forms_iterator, start=1):
+            form_content = form_match.group()
+            numbered_form_filename = form_filename % form_number
+            try:
+              with open(numbered_form_filename, 'w') as f:
+                f.write(form_location_comment)
+                f.write(form_content)
+            except IOError as e:
+              self.logger.error('Error: %s', e)
+              continue
+          self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)
+
+
+def main():
+  parser = OptionParser()
+  parser.add_option(
+      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
+      help='LOG_LEVEL: debug, info, warning or error [default: %default]')
+  parser.add_option(
+      '-j', '--js', dest='js', action='store_true', default=False,
+      help='Removes all javascript elements [default: %default]')
+
+  (options, args) = parser.parse_args()
+  options.log_level = options.log_level.upper()
+  if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
+    print 'Wrong log_level argument.'
+    parser.print_help()
+    return 1
+
+  options.log_level = getattr(logging, options.log_level)
+  extractor = FormsExtractor(logging_level=options.log_level)
+  extractor.Extract(options.js)
+  return 0
+
+
+if __name__ == '__main__':
+  sys.exit(main())