#!/usr/bin/python # Copyright (c) 2011 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be found # in the LICENSE file. """Extracts registration forms from the corresponding HTML files. Used for extracting forms within HTML files. This script is used in conjunction with the webforms_aggregator.py script, which aggregates web pages with fillable forms (i.e., registration forms). The purpose of this script is to extract out JavaScript elements that may be causing parsing errors and timeout issues when running browser_tests. Used as a standalone script but assumes that it is run from the directory in which it is checked in. Usage: forms_extractor.py [options] Options: -l LOG_LEVEL, --log_level=LOG_LEVEL, LOG_LEVEL: debug, info, warning or error [default: error] -h, --help show this help message and exit """ import glob import logging from optparse import OptionParser import os import re import sys class FormsExtractor(object): """Extracts HTML files, leaving only registration forms from the HTML file.""" HTML_FILES_PATTERN = r'*.html' HTML_FILE_PREFIX = r'grabber-' FORM_FILE_PREFIX = r'top100_' REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill', 'heuristics', 'input') EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill', 'heuristics', 'input') logger = logging.getLogger(__name__) log_handlers = {'StreamHandler': None} # This pattern is used for removing all |