summaryrefslogtreecommitdiffstats
path: root/chrome/tools/webforms_extractor.py
diff options
context:
space:
mode:
authordyu@chromium.org <dyu@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-06-07 22:15:32 +0000
committerdyu@chromium.org <dyu@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-06-07 22:15:32 +0000
commit79051b766a63dd48e35d2e75b89a0233975187fe (patch)
tree52d8a4cc8e5bb21c6b0dcd697ce9213c6cc9dbbf /chrome/tools/webforms_extractor.py
parent8d1fa70549a38d51c5233b813eaca18483997d2c (diff)
downloadchromium_src-79051b766a63dd48e35d2e75b89a0233975187fe.zip
chromium_src-79051b766a63dd48e35d2e75b89a0233975187fe.tar.gz
chromium_src-79051b766a63dd48e35d2e75b89a0233975187fe.tar.bz2
Minor improvments in regex and added Verbose mode for expressing the regex.
BUG=none TEST=none Review URL: http://codereview.chromium.org/7044036 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@88217 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/tools/webforms_extractor.py')
-rw-r--r--chrome/tools/webforms_extractor.py65
1 files changed, 50 insertions, 15 deletions
diff --git a/chrome/tools/webforms_extractor.py b/chrome/tools/webforms_extractor.py
index 2256ad1..765c4e5 100644
--- a/chrome/tools/webforms_extractor.py
+++ b/chrome/tools/webforms_extractor.py
@@ -45,29 +45,62 @@ class FormsExtractor(object):
log_handlers = {'StreamHandler': None}
# This pattern is used for removing all |<script>| code.
- re_script_pattern = re.compile(ur'<\bscript\b.*?>.*?</\bscript\b>',
- re.U | re.S | re.I)
-
- # This pattern is used for capturing js event |'onmouseover="mOvr1(this);"'|
- # \bon\w+ # Matches 'onmouseover'
- # ? # Makes |\w+| non-greedy.
- # \s*=\s* # Match any spaces before and after the '=' symbol.
- # (?P<quote>[\'\"]) # Adds the single or double quote in the matching
- # pattern.
- # .*? # Adds the non-greedy to any chars.
- # \1 # Matches the previously captured quote.
- re_event = ur'\bon\w+?\s*=\s*(?P<quote>[\'\"]).*?\1'
+ re_script_pattern = re.compile(
+ ur"""
+ <script # A new opening 'script' tag.
+ \b # The end of the word 'script'.
+ .*? # Any characters (non-greedy).
+ > # Ending of the opening tag '>'.
+ .*? # Any characters (non-greedy) between the tags.
+ </script\s*> # The '<script>' closing tag.
+ """, re.U | re.S | re.I | re.X)
+
+ # This pattern is used to remove all href js code.
+ re_href_js_pattern = re.compile(
+ ur"""
+ \bhref # The beginning of a new word.
+ \s*=\s* # Matches any spaces before and after the '=' symbol.
+ (?P<quote>[\'\"]) # Captures the single or double quote in the
+ # matching pattern.
+ \s*javascript\s* # Matches anything before and after 'javascript'.
+ .*? # Matches any characters (non-greedy) between the
+ # quotes.
+ \1 # Matches the previously captured single or double
+ # quotes.
+ """, re.U | re.S | re.I | re.X)
+
+ # This pattern is used for capturing js event |'onmouseover="..."'|
+ re_event = (
+ ur"""
+ \b # The beginning of a new word.
+ on\w+? # Matches all words starting with 'on' (non-greedy)
+ # |onmouseover|.
+ \s*=\s* # Matches any spaces before and after the '=' symbol.
+ (?P<quote>[\'\"]) # Captures the single or double quote in the matching
+ # pattern.
+ .*? # Matches any characters (non-greedy) between the
+ # quotes.
+ \1 # Matches the previously captured single or double
+ # quotes.
+ """)
# This pattern is used for removing code with js events, such as |onload|.
# By adding the leading |ur'<[^<>]*?'| and the trailing |'ur'[^<>]*?>'| the
# pattern matches to strings such as '<tr class="nav"
# onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
re_tag_with_events_pattern = re.compile(
- ur'<[^<>]*?' + re_event + ur'[^<>]*?>', re.U | re.S | re.I)
+ ur"""
+ < # Matches character '<'.
+ [^<>]*? # Matches any characters except '<' and '>' (non-greedy).""" +
+ re_event +
+ ur"""
+ [^<>]*? # Matches any characters except '<' and '>' (non-greedy)
+ > # Matches character '>'.
+ """, re.U | re.S | re.I | re.X)
# Add white space characters at the end of the matched event. Leaves only the
# leading white space when substituted with empty string.
- re_event_pattern = re.compile(re_event + ur'\s*', re.U | re.S | re.I)
+ re_event_pattern = re.compile(re_event + ur'\s*', re.U | re.S | re.I | re.X)
def __init__(self, input_dir=REGISTRATION_PAGES_DIR,
@@ -132,7 +165,9 @@ class FormsExtractor(object):
self.logger.info('extracting file "%s" ...', filename)
with open(filename) as f:
html_content = self.re_tag_with_events_pattern.sub(
- self._SubstituteAllEvents, self.re_script_pattern.sub('', f.read()))
+ self._SubstituteAllEvents,
+ self.re_href_js_pattern.sub(
+ '', self.re_script_pattern.sub('', f.read())))
try:
with open(
os.path.join(self._output_dir, '%s%s' % (