#!/usr/bin/python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be found
# in the LICENSE file.
"""Extracts registration forms from the corresponding HTML files.
Used for extracting forms within HTML files. This script is used in
conjunction with the webforms_aggregator.py script, which aggregates web pages
with fillable forms (i.e., registration forms).
The purpose of this script is to strip out JavaScript elements that may be
causing parsing errors and timeout issues when running browser_tests.
Used as a standalone script but assumes that it is run from the directory in
which it is checked in.
Usage: forms_extractor.py [options]
Options:
-l LOG_LEVEL, --log_level=LOG_LEVEL,
LOG_LEVEL: debug, info, warning or error [default: error]
-h, --help show this help message and exit
"""
import glob
import logging
from optparse import OptionParser
import os
import re
import sys
class FormsExtractor(object):
"""Processes HTML files, leaving only the registration forms in each file."""
# Glob-style wildcard matching HTML file names.
# NOTE(review): presumably combined with the prefixes below and passed to
# glob (imported at the top of the file) -- confirm at the use site.
HTML_FILES_PATTERN = r'*.html'
# File-name prefixes.
# NOTE(review): presumably 'grabber-' marks the aggregated input pages and
# 'top100_' the extracted-form output files -- confirm against the code that
# uses them.
HTML_FILE_PREFIX = r'grabber-'
FORM_FILE_PREFIX = r'top100_'
# Both directories are built from os.pardir, so they are relative paths that
# start one level above the directory they are resolved against.
# NOTE(review): the two constants currently have the same value
# (<pardir>/test/data/autofill/heuristics/input) -- confirm this is intended.
REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
'heuristics', 'input')
EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
'heuristics', 'input')
# Logger named after this module.
logger = logging.getLogger(__name__)
# Keyed by handler name; the 'StreamHandler' entry starts out as None.
# NOTE(review): presumably replaced with a real logging handler once logging
# is configured -- confirm in the code that sets it.
log_handlers = {'StreamHandler': None}
# This pattern is used for removing all | # The '