summaryrefslogtreecommitdiffstats
path: root/chrome/tools/webforms_extractor.py
diff options
context:
space:
mode:
Diffstat (limited to 'chrome/tools/webforms_extractor.py')
-rwxr-xr-x[-rw-r--r--]chrome/tools/webforms_extractor.py507
1 files changed, 254 insertions, 253 deletions
diff --git a/chrome/tools/webforms_extractor.py b/chrome/tools/webforms_extractor.py
index 71fed7c..1dd1d95 100644..100755
--- a/chrome/tools/webforms_extractor.py
+++ b/chrome/tools/webforms_extractor.py
@@ -1,253 +1,254 @@
-#!/usr/bin/python
-# Copyright (c) 2011 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be found
-# in the LICENSE file.
-
-"""Extracts registration forms from the corresponding HTML files.
-
-Used for extracting forms within HTML files. This script is used in
-conjunction with the webforms_aggregator.py script, which aggregates web pages
-with fillable forms (i.e registration forms).
-
-The purpose of this script is to extract out all non-form elements that may be
-causing parsing errors and timeout issues when running browser_tests.
-
-This script extracts all forms from a HTML file.
-If there are multiple forms per downloaded site, multiple files are created
-for each form.
-
-Used as a standalone script but assumes that it is run from the directory in
-which it is checked into.
-
-Usage: forms_extractor.py [options]
-
-Options:
- -l LOG_LEVEL, --log_level=LOG_LEVEL,
- LOG_LEVEL: debug, info, warning or error [default: error]
- -j, --js extracts javascript elements from web form.
- -h, --help show this help message and exit
-"""
-
-import glob
-import logging
-from optparse import OptionParser
-import os
-import re
-import sys
-
-
-class FormsExtractor(object):
- """Extracts HTML files, leaving only registration forms from the HTML file."""
- _HTML_FILES_PATTERN = r'*.html'
- _HTML_FILE_PREFIX = r'grabber-'
- _FORM_FILE_PREFIX = r'grabber-stripped-'
-
- _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
- 'heuristics', 'input')
- _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
- 'heuristics', 'input')
-
- logger = logging.getLogger(__name__)
- log_handlers = {'StreamHandler': None}
-
- # This pattern is used for retrieving the form location comment located at the
- # top of each downloaded HTML file indicating where the form originated from.
- _RE_FORM_LOCATION_PATTERN = re.compile(
- ur"""
- <!--Form\s{1}Location: # Starting of form location comment.
- .*? # Any characters (non-greedy).
- --> # Ending of the form comment.
- """, re.U | re.S | re.I | re.X)
-
- # This pattern is used for removing all script code.
- _RE_SCRIPT_PATTERN = re.compile(
- ur"""
- <script # A new opening '<script' tag.
- \b # The end of the word 'script'.
- .*? # Any characters (non-greedy).
- > # Ending of the (opening) tag: '>'.
- .*? # Any characters (non-greedy) between the tags.
- </script\s*> # The '</script>' closing tag.
- """, re.U | re.S | re.I | re.X)
-
- # This pattern is used for removing all href js code.
- _RE_HREF_JS_PATTERN = re.compile(
- ur"""
- \bhref # The word href and its beginning.
- \s*=\s* # The '=' with all whitespace before and after it.
- (?P<quote>[\'\"]) # A single or double quote which is captured.
- \s*javascript\s*: # The word 'javascript:' with any whitespace possible.
- .*? # Any characters (non-greedy) between the quotes.
- \1 # The previously captured single or double quote.
- """, re.U | re.S | re.I | re.X)
-
- _RE_EVENT_EXPR = (
- ur"""
- \b # The beginning of a new word.
- on\w+? # All words starting with 'on' (non-greedy)
- # example: |onmouseover|.
- \s*=\s* # The '=' with all whitespace before and after it.
- (?P<quote>[\'\"]) # A captured single or double quote.
- .*? # Any characters (non-greedy) between the quotes.
- \1 # The previously captured single or double quote.
- """)
-
- # This pattern is used for removing code with js events, such as |onload|.
- # By adding the leading |ur'<[^<>]*?'| and the trailing |'ur'[^<>]*?>'| the
- # pattern matches to strings such as '<tr class="nav"
- # onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
- _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
- ur"""
- < # Matches character '<'.
- [^<>]*? # Matches any characters except '<' and '>' (non-greedy).""" +
- _RE_EVENT_EXPR +
- ur"""
- [^<>]*? # Matches any characters except '<' and '>' (non-greedy).
- > # Matches character '>'.
- """, re.U | re.S | re.I | re.X)
-
- # Adds whitespace chars at the end of the matched event. Also match trailing
- # whitespaces for JS events. Do not match leading whitespace.
- # For example: |< /form>| is invalid HTML and does not exist but |</form >| is
- # considered valid HTML.
- _RE_EVENT_PATTERN = re.compile(
- _RE_EVENT_EXPR + ur'\s*', re.U | re.S | re.I | re.X)
-
- # This pattern is used for finding form elements.
- _RE_FORM_PATTERN = re.compile(
- ur"""
- <form # A new opening '<form' tag.
- \b # The end of the word 'form'.
- .*? # Any characters (non-greedy).
- > # Ending of the (opening) tag: '>'.
- .*? # Any characters (non-greedy) between the tags.
- </form\s*> # The '</form>' closing tag.
- """, re.U | re.S | re.I | re.X)
-
- def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
- output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
- """Creates a FormsExtractor object.
-
- Args:
- input_dir: the directory of HTML files.
- output_dir: the directory where the registration form files will be
- saved.
- logging_level: verbosity level, default is None.
-
- Raises:
- IOError exception if input directory doesn't exist.
- """
- if logging_level:
- if not self.log_handlers['StreamHandler']:
- console = logging.StreamHandler()
- console.setLevel(logging.DEBUG)
- self.log_handlers['StreamHandler'] = console
- self.logger.addHandler(console)
- self.logger.setLevel(logging_level)
- else:
- if self.log_handlers['StreamHandler']:
- self.logger.removeHandler(self.log_handlers['StreamHandler'])
- self.log_handlers['StreamHandler'] = None
-
- self._input_dir = input_dir
- self._output_dir = output_dir
- if not os.path.isdir(self._input_dir):
- error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
- self.logger.error('Error: %s', error_msg)
- raise IOError(error_msg)
- if not os.path.isdir(output_dir):
- os.makedirs(output_dir)
- self._form_location_comment = ''
-
- def _SubstituteAllEvents(self, matchobj):
- """Remove all js events that are present as attributes within a tag.
-
- Args:
- matchobj: A regexp |re.MatchObject| containing text that has at least one
- event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
- onmouseout="mOut1(this);">|.
-
- Returns:
- The text containing the tag with all the attributes except for the tags
- with events. Example: |<tr class="nav">|.
- """
- tag_with_all_attrs = matchobj.group(0)
- return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)
-
- def Extract(self, strip_js_only):
- """Extracts and saves the extracted registration forms.
-
- Iterates through all the HTML files.
-
- Args:
- strip_js_only: If True, only Javascript is stripped from the HTML content.
- Otherwise, all non-form elements are stripped.
- """
- pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
- html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
- for filename in html_files:
- self.logger.info('Stripping file "%s" ...', filename)
- with open(filename, 'U') as f:
- html_content = self._RE_TAG_WITH_EVENTS_PATTERN.sub(
- self._SubstituteAllEvents,
- self._RE_HREF_JS_PATTERN.sub(
- '', self._RE_SCRIPT_PATTERN.sub('', f.read())))
-
- form_filename = os.path.split(filename)[1] # Path dropped.
- form_filename = form_filename.replace(self._HTML_FILE_PREFIX, '', 1)
- (form_filename, extension) = os.path.splitext(form_filename)
- form_filename = (self._FORM_FILE_PREFIX + form_filename +
- '%s' + extension)
- form_filename = os.path.join(self._output_dir, form_filename)
- if strip_js_only:
- form_filename = form_filename % ''
- try:
- with open(form_filename, 'w') as f:
- f.write(html_content)
- except IOError as e:
- self.logger.error('Error: %s', e)
- continue
- else: # Remove all non form elements.
- match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
- if match:
- form_location_comment = match.group() + os.linesep
- else:
- form_location_comment = ''
- forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
- for form_number, form_match in enumerate(forms_iterator, start=1):
- form_content = form_match.group()
- numbered_form_filename = form_filename % form_number
- try:
- with open(numbered_form_filename, 'w') as f:
- f.write(form_location_comment)
- f.write(form_content)
- except IOError as e:
- self.logger.error('Error: %s', e)
- continue
- self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)
-
-
-def main():
- # Command line options.
- parser = OptionParser()
- parser.add_option(
- '-l', '--log_level', metavar='LOG_LEVEL', default='error',
- help='LOG_LEVEL: debug, info, warning or error [default: %default]')
- parser.add_option(
- '-j', '--js', dest='js', action='store_true', default=False,
- help='Removes all javascript elements [default: %default]')
-
- (options, args) = parser.parse_args()
- options.log_level = options.log_level.upper()
- if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
- print 'Wrong log_level argument.'
- parser.print_help()
- sys.exit(1)
-
- options.log_level = getattr(logging, options.log_level)
- extractor = FormsExtractor(logging_level=options.log_level)
- extractor.Extract(options.js)
-
-
-if __name__ == '__main__':
- main()
+#!/usr/bin/env python
+# Copyright (c) 2011 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+
+"""Extracts registration forms from the corresponding HTML files.
+
+Used for extracting forms within HTML files. This script is used in
+conjunction with the webforms_aggregator.py script, which aggregates web pages
+with fillable forms (i.e. registration forms).
+
+The purpose of this script is to strip out all non-form elements that may be
+causing parsing errors and timeout issues when running browser_tests.
+
+This script extracts all forms from an HTML file.
+If there are multiple forms per downloaded site, a separate file is created
+for each form.
+
+Used as a standalone script but assumes that it is run from the directory in
+which it is checked into.
+
+Usage: webforms_extractor.py [options]
+
+Options:
+ -l LOG_LEVEL, --log_level=LOG_LEVEL,
+ LOG_LEVEL: debug, info, warning or error [default: error]
+ -j, --js only strips javascript from the page; non-form elements are kept.
+ -h, --help show this help message and exit
+"""
+
+import glob
+import logging
+from optparse import OptionParser
+import os
+import re
+import sys
+
+
+class FormsExtractor(object):
+ """Strips downloaded HTML pages down to their forms, or to JS-free HTML.
+
+ Every '*.html' file of the input directory first has its <script> elements,
+ 'javascript:' hrefs and on* event attributes removed. Depending on the
+ |strip_js_only| argument of Extract(), either the whole cleaned page or each
+ <form> element found in it is then written to the output directory.
+ """
+ # Glob pattern selecting the input files inside the input directory.
+ _HTML_FILES_PATTERN = r'*.html'
+ # Prefix whose first occurrence is removed from an input file name.
+ _HTML_FILE_PREFIX = r'grabber-'
+ # Prefix prepended to every output file name.
+ _FORM_FILE_PREFIX = r'grabber-stripped-'
+
+ # Default input and output directories, relative to the current working
+ # directory. Both defaults name the same directory.
+ _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
+ 'heuristics', 'input')
+ _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
+ 'heuristics', 'input')
+
+ # The logger and the handler registry are class attributes, so they are
+ # shared by all FormsExtractor instances.
+ logger = logging.getLogger(__name__)
+ log_handlers = {'StreamHandler': None}
+
+ # This pattern is used for retrieving the form location comment located at the
+ # top of each downloaded HTML file indicating where the form originated from.
+ _RE_FORM_LOCATION_PATTERN = re.compile(
+ ur"""
+ <!--Form\s{1}Location: # Starting of form location comment.
+ .*? # Any characters (non-greedy).
+ --> # Ending of the form comment.
+ """, re.U | re.S | re.I | re.X)
+
+ # This pattern is used for removing all script code.
+ _RE_SCRIPT_PATTERN = re.compile(
+ ur"""
+ <script # A new opening '<script' tag.
+ \b # The end of the word 'script'.
+ .*? # Any characters (non-greedy).
+ > # Ending of the (opening) tag: '>'.
+ .*? # Any characters (non-greedy) between the tags.
+ </script\s*> # The '</script>' closing tag.
+ """, re.U | re.S | re.I | re.X)
+
+ # This pattern is used for removing all href js code.
+ _RE_HREF_JS_PATTERN = re.compile(
+ ur"""
+ \bhref # The word href and its beginning.
+ \s*=\s* # The '=' with all whitespace before and after it.
+ (?P<quote>[\'\"]) # A single or double quote which is captured.
+ \s*javascript\s*: # The word 'javascript:' with any whitespace possible.
+ .*? # Any characters (non-greedy) between the quotes.
+ \1 # The previously captured single or double quote.
+ """, re.U | re.S | re.I | re.X)
+
+ # Verbose-regex fragment matching a single quoted on* event attribute. It is
+ # not compiled on its own; the two patterns below embed it and compile it
+ # with re.X. Its |\1| back-reference points at the |quote| group, which is the
+ # only group in each of those patterns.
+ _RE_EVENT_EXPR = (
+ ur"""
+ \b # The beginning of a new word.
+ on\w+? # All words starting with 'on' (non-greedy)
+ # example: |onmouseover|.
+ \s*=\s* # The '=' with all whitespace before and after it.
+ (?P<quote>[\'\"]) # A captured single or double quote.
+ .*? # Any characters (non-greedy) between the quotes.
+ \1 # The previously captured single or double quote.
+ """)
+
+ # This pattern is used for finding whole tags that carry at least one js
+ # event attribute, such as |onload|. By adding the leading |ur'<[^<>]*?'| and
+ # the trailing |ur'[^<>]*?>'| around the event expression, the pattern matches
+ # strings such as '<tr class="nav" onmouseover="mOvr1(this);"
+ # onmouseout="mOut1(this);">'. Extract() hands each match to
+ # _SubstituteAllEvents().
+ _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
+ ur"""
+ < # Matches character '<'.
+ [^<>]*? # Matches any characters except '<' and '>' (non-greedy).""" +
+ _RE_EVENT_EXPR +
+ ur"""
+ [^<>]*? # Matches any characters except '<' and '>' (non-greedy).
+ > # Matches character '>'.
+ """, re.U | re.S | re.I | re.X)
+
+ # Matches one event attribute together with any whitespace that follows it.
+ # Whitespace in front of the attribute is deliberately not matched.
+ # For example: |< /form>| is invalid HTML and does not exist but |</form >| is
+ # considered valid HTML.
+ _RE_EVENT_PATTERN = re.compile(
+ _RE_EVENT_EXPR + ur'\s*', re.U | re.S | re.I | re.X)
+
+ # This pattern is used for finding form elements.
+ _RE_FORM_PATTERN = re.compile(
+ ur"""
+ <form # A new opening '<form' tag.
+ \b # The end of the word 'form'.
+ .*? # Any characters (non-greedy).
+ > # Ending of the (opening) tag: '>'.
+ .*? # Any characters (non-greedy) between the tags.
+ </form\s*> # The '</form>' closing tag.
+ """, re.U | re.S | re.I | re.X)
+
+ def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
+ output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
+ """Creates a FormsExtractor object.
+
+ When |logging_level| is truthy, a console StreamHandler is registered on the
+ shared class logger (unless one already is) and the level is applied to the
+ logger. Otherwise a previously registered console handler is removed.
+
+ Args:
+ input_dir: the directory of HTML files.
+ output_dir: the directory where the registration form files will be
+ saved. It is created if it does not exist yet.
+ logging_level: verbosity level, default is None.
+
+ Raises:
+ IOError exception if input directory doesn't exist.
+ """
+ if logging_level:
+ if not self.log_handlers['StreamHandler']:
+ console = logging.StreamHandler()
+ # The handler lets everything through; filtering is done by the logger.
+ console.setLevel(logging.DEBUG)
+ self.log_handlers['StreamHandler'] = console
+ self.logger.addHandler(console)
+ self.logger.setLevel(logging_level)
+ else:
+ if self.log_handlers['StreamHandler']:
+ self.logger.removeHandler(self.log_handlers['StreamHandler'])
+ self.log_handlers['StreamHandler'] = None
+
+ self._input_dir = input_dir
+ self._output_dir = output_dir
+ # A missing input directory is logged and reported to the caller.
+ if not os.path.isdir(self._input_dir):
+ error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
+ self.logger.error('Error: %s', error_msg)
+ raise IOError(error_msg)
+ if not os.path.isdir(output_dir):
+ os.makedirs(output_dir)
+ # NOTE(review): this attribute is not read anywhere in the visible code;
+ # Extract() works with a local |form_location_comment| instead.
+ self._form_location_comment = ''
+
+ def _SubstituteAllEvents(self, matchobj):
+ """Remove all js events that are present as attributes within a tag.
+
+ Used as the replacement callback of |_RE_TAG_WITH_EVENTS_PATTERN|.
+
+ Args:
+ matchobj: A regexp |re.MatchObject| containing text that has at least one
+ event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
+ onmouseout="mOut1(this);">|.
+
+ Returns:
+ The text of the matched tag with every event attribute (and the
+ whitespace following it) removed; all other attributes are kept.
+ """
+ tag_with_all_attrs = matchobj.group(0)
+ return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)
+
+ def Extract(self, strip_js_only):
+ """Strips every HTML file of the input directory and saves the result.
+
+ For each '*.html' file, <script> elements are removed first, then
+ 'javascript:' href attributes, then on* event attributes. The output file
+ name is the input name with the first 'grabber-' removed and
+ 'grabber-stripped-' prepended. IOErrors raised while writing are logged
+ instead of being propagated.
+
+ Args:
+ strip_js_only: If True, only Javascript is stripped from the HTML content
+ and the whole page is written to a single file. Otherwise, each <form>
+ element is written to its own numbered file (numbering starts at 1),
+ preceded by the page's form location comment when one is found.
+ """
+ pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
+ html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
+ for filename in html_files:
+ self.logger.info('Stripping file "%s" ...', filename)
+ # 'U' is Python 2's universal-newlines read mode.
+ with open(filename, 'U') as f:
+ # Innermost call runs first: scripts, then js hrefs, then event attrs.
+ html_content = self._RE_TAG_WITH_EVENTS_PATTERN.sub(
+ self._SubstituteAllEvents,
+ self._RE_HREF_JS_PATTERN.sub(
+ '', self._RE_SCRIPT_PATTERN.sub('', f.read())))
+
+ # Build the output path template; the '%s' placeholder is filled below
+ # with '' (whole page) or with the form number (one file per form).
+ # NOTE(review): the template is the full joined path, so a literal '%' in
+ # the output directory or file name would break the '%' substitution.
+ form_filename = os.path.split(filename)[1] # Path dropped.
+ form_filename = form_filename.replace(self._HTML_FILE_PREFIX, '', 1)
+ (form_filename, extension) = os.path.splitext(form_filename)
+ form_filename = (self._FORM_FILE_PREFIX + form_filename +
+ '%s' + extension)
+ form_filename = os.path.join(self._output_dir, form_filename)
+ if strip_js_only:
+ form_filename = form_filename % ''
+ try:
+ with open(form_filename, 'w') as f:
+ f.write(html_content)
+ except IOError as e:
+ self.logger.error('Error: %s', e)
+ continue
+ else: # Remove all non form elements.
+ # The location comment, if present, is copied in front of every form.
+ match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
+ if match:
+ form_location_comment = match.group() + os.linesep
+ else:
+ form_location_comment = ''
+ forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
+ for form_number, form_match in enumerate(forms_iterator, start=1):
+ form_content = form_match.group()
+ numbered_form_filename = form_filename % form_number
+ try:
+ with open(numbered_form_filename, 'w') as f:
+ f.write(form_location_comment)
+ f.write(form_content)
+ except IOError as e:
+ self.logger.error('Error: %s', e)
+ continue
+ self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)
+
+
+def main():
+ """Parses the command line options and runs the forms extraction.
+
+ Returns:
+ 0 after the extraction has run, 1 if the log_level argument is not one of
+ debug, info, warning or error.
+ """
+ parser = OptionParser()
+ parser.add_option(
+ '-l', '--log_level', metavar='LOG_LEVEL', default='error',
+ help='LOG_LEVEL: debug, info, warning or error [default: %default]')
+ parser.add_option(
+ '-j', '--js', dest='js', action='store_true', default=False,
+ help='Removes all javascript elements [default: %default]')
+
+ # NOTE(review): |args| (the positional arguments) is not used below.
+ (options, args) = parser.parse_args()
+ # The level name is accepted in any letter case.
+ options.log_level = options.log_level.upper()
+ if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
+ print 'Wrong log_level argument.'
+ parser.print_help()
+ return 1
+
+ # Translate the validated level name into the logging module's constant.
+ options.log_level = getattr(logging, options.log_level)
+ extractor = FormsExtractor(logging_level=options.log_level)
+ extractor.Extract(options.js)
+ return 0
+
+
+if __name__ == '__main__':
+ # Use main()'s return value as the process exit status.
+ sys.exit(main())