// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "chrome/renderer/autofill/form_manager.h" #include "base/logging.h" #include "base/scoped_vector.h" #include "base/string_util.h" #include "base/stl_util-inl.h" #include "base/utf_string_conversions.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebFormControlElement.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebInputElement.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebLabelElement.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebNode.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeList.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebOptionElement.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebSelectElement.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebVector.h" #include "webkit/glue/form_data.h" #include "webkit/glue/form_field.h" #include "webkit/glue/web_io_operators.h" using webkit_glue::FormData; using webkit_glue::FormField; using WebKit::WebDocument; using WebKit::WebElement; using WebKit::WebFormControlElement; using WebKit::WebFormElement; using WebKit::WebFrame; using WebKit::WebInputElement; using WebKit::WebLabelElement; using WebKit::WebNode; using WebKit::WebNodeList; using WebKit::WebOptionElement; using WebKit::WebSelectElement; using WebKit::WebString; using WebKit::WebVector; namespace { // The number of fields required by AutoFill. Ideally we could send the forms // to AutoFill no matter how many fields are in the forms; however, finding the // label for each field is a costly operation and we can't spare the cycles if // it's not necessary. const size_t kRequiredAutoFillFields = 3; // The maximum length allowed for form data. const size_t kMaxDataLength = 1024; // In HTML5, all text fields except password are text input fields to // autocomplete. bool IsTextInput(const WebInputElement* element) { if (!element) return false; return element->isTextField() && !element->isPasswordField(); } bool IsSelectElement(const WebFormControlElement& element) { return element.formControlType() == ASCIIToUTF16("select-one"); } bool IsOptionElement(const WebElement& element) { return element.hasTagName("option"); } bool IsAutoFillableElement(const WebFormControlElement& element) { const WebInputElement* input_element = toWebInputElement(&element); return IsTextInput(input_element) || IsSelectElement(element); } // This is a helper function for the FindChildText() function (see below). // Search depth is limited with the |depth| parameter. string16 FindChildTextInner(const WebNode& node, int depth) { string16 element_text; if (depth <= 0 || node.isNull()) return element_text; string16 node_text = node.nodeValue(); TrimWhitespace(node_text, TRIM_ALL, &node_text); if (!node_text.empty()) element_text = node_text; string16 child_text = FindChildTextInner(node.firstChild(), depth-1); if (!child_text.empty()) element_text = element_text + child_text; string16 sibling_text = FindChildTextInner(node.nextSibling(), depth-1); if (!sibling_text.empty()) element_text = element_text + sibling_text; return element_text; } // Returns the aggregated values of the descendants or siblings of |node| that // are non-empty text nodes. This is a faster alternative to |innerText()| for // performance critical operations. It does a full depth-first search so can be // used when the structure is not directly known. Whitespace is trimmed from // text accumulated at descendant and sibling. Search is limited to within 10 // siblings and/or descendants. string16 FindChildText(const WebElement& element) { WebNode child = element.firstChild(); const int kChildSearchDepth = 10; return FindChildTextInner(child, kChildSearchDepth); } // Helper for |InferLabelForElement()| that infers a label, if possible, from // a previous node of |element|. string16 InferLabelFromPrevious(const WebFormControlElement& element) { string16 inferred_label; WebNode previous = element.previousSibling(); if (previous.isNull()) return string16(); if (previous.isTextNode()) { inferred_label = previous.nodeValue(); TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label); } // If we didn't find text, check for previous paragraph. // Eg.

Some Text

// Note the lack of whitespace between

and elements. if (inferred_label.empty() && previous.isElementNode()) { WebElement element = previous.to(); if (element.hasTagName("p")) { inferred_label = FindChildText(element); } } // If we didn't find paragraph, check for previous paragraph to this. // Eg.

Some Text

// Note the whitespace between

and elements. if (inferred_label.empty()) { WebNode sibling = previous.previousSibling(); if (!sibling.isNull() && sibling.isElementNode()) { WebElement element = sibling.to(); if (element.hasTagName("p")) { inferred_label = FindChildText(element); } } } // Look for text node prior to tag. // Eg. Some Text if (inferred_label.empty()) { while (inferred_label.empty() && !previous.isNull()) { if (previous.isTextNode()) { inferred_label = previous.nodeValue(); TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label); } else if (previous.isElementNode()) { WebElement element = previous.to(); if (!element.hasTagName("img")) break; } else { break; } previous = previous.previousSibling(); } } // Look for label node prior to tag. // Eg. if (inferred_label.empty()) { while (inferred_label.empty() && !previous.isNull()) { if (previous.isTextNode()) { inferred_label = previous.nodeValue(); TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label); } else if (previous.isElementNode()) { WebElement element = previous.to(); if (element.hasTagName("label")) { inferred_label = FindChildText(element); } else { break; } } else { break; } previous = previous.previousSibling(); } } return inferred_label; } // Helper for |InferLabelForElement()| that infers a label, if possible, from // surrounding table structure. // Eg. Some Text // Eg. Some Text string16 InferLabelFromTable(const WebFormControlElement& element) { string16 inferred_label; WebNode parent = element.parentNode(); while (!parent.isNull() && parent.isElementNode() && !parent.to().hasTagName("td")) parent = parent.parentNode(); // Check all previous siblings, skipping non-element nodes, until we find a // non-empty text block. WebNode previous = parent; while (!previous.isNull()) { if (previous.isElementNode()) { WebElement e = previous.to(); if (e.hasTagName("td")) { inferred_label = FindChildText(e); if (!inferred_label.empty()) break; } } previous = previous.previousSibling(); } return inferred_label; } // Helper for |InferLabelForElement()| that infers a label, if possible, from // a surrounding div table. // Eg.

Some Text
string16 InferLabelFromDivTable(const WebFormControlElement& element) { WebNode parent = element.parentNode(); while (!parent.isNull() && parent.isElementNode() && !parent.to().hasTagName("div")) parent = parent.parentNode(); if (parent.isNull() || !parent.isElementNode()) return string16(); WebElement e = parent.to(); if (e.isNull() || !e.hasTagName("div")) return string16(); return FindChildText(e); } // Helper for |InferLabelForElement()| that infers a label, if possible, from // a surrounding definition list. // Eg.
Some Text
// Eg.
Some Text
string16 InferLabelFromDefinitionList(const WebFormControlElement& element) { string16 inferred_label; WebNode parent = element.parentNode(); while (!parent.isNull() && parent.isElementNode() && !parent.to().hasTagName("dd")) parent = parent.parentNode(); if (!parent.isNull() && parent.isElementNode()) { WebElement element = parent.to(); if (element.hasTagName("dd")) { WebNode previous = parent.previousSibling(); // Skip by any intervening text nodes. while (!previous.isNull() && previous.isTextNode()) previous = previous.previousSibling(); if (!previous.isNull() && previous.isElementNode()) { element = previous.to(); if (element.hasTagName("dt")) { inferred_label = FindChildText(element); } } } } return inferred_label; } // Infers corresponding label for |element| from surrounding context in the DOM. // Contents of preceding

tag or preceding text element found in the form. string16 InferLabelForElement(const WebFormControlElement& element) { string16 inferred_label = InferLabelFromPrevious(element); // If we didn't find a label, check for table cell case. if (inferred_label.empty()) inferred_label = InferLabelFromTable(element); // If we didn't find a label, check for div table case. if (inferred_label.empty()) inferred_label = InferLabelFromDivTable(element); // If we didn't find a label, check for definition list case. if (inferred_label.empty()) inferred_label = InferLabelFromDefinitionList(element); return inferred_label; } void GetOptionStringsFromElement(const WebSelectElement& select_element, std::vector* option_strings) { DCHECK(!select_element.isNull()); DCHECK(option_strings); option_strings->clear(); WebVector list_items = select_element.listItems(); option_strings->reserve(list_items.size()); for (size_t i = 0; i < list_items.size(); ++i) { if (IsOptionElement(list_items[i])) { option_strings->push_back( list_items[i].toConst().value()); } } } } // namespace namespace autofill { struct FormManager::FormElement { WebKit::WebFormElement form_element; std::vector control_elements; std::vector control_values; }; FormManager::FormManager() { } FormManager::~FormManager() { Reset(); } // static void FormManager::WebFormControlElementToFormField( const WebFormControlElement& element, ExtractMask extract_mask, FormField* field) { DCHECK(field); DCHECK(!element.isNull()); // The label is not officially part of a WebFormControlElement; however, the // labels for all form control elements are scraped from the DOM and set in // WebFormElementToFormData. field->set_name(element.nameForAutofill()); field->set_form_control_type(element.formControlType()); if (!IsAutoFillableElement(element)) return; const WebInputElement* input_element = toWebInputElement(&element); if (IsTextInput(input_element)) { field->set_max_length(input_element->maxLength()); field->set_autofilled(input_element->isAutofilled()); } else if (extract_mask & EXTRACT_OPTIONS) { // Set option strings on the field if available. DCHECK(IsSelectElement(element)); const WebSelectElement select_element = element.toConst(); std::vector option_strings; GetOptionStringsFromElement(select_element, &option_strings); field->set_option_strings(option_strings); } if (!(extract_mask & EXTRACT_VALUE)) return; string16 value; if (IsTextInput(input_element)) { value = input_element->value(); } else { DCHECK(IsSelectElement(element)); const WebSelectElement select_element = element.toConst(); value = select_element.value(); // Convert the |select_element| value to text if requested. if (extract_mask & EXTRACT_OPTION_TEXT) { WebVector list_items = select_element.listItems(); for (size_t i = 0; i < list_items.size(); ++i) { if (IsOptionElement(list_items[i])) { const WebOptionElement option_element = list_items[i].toConst(); if (option_element.value() == value) { value = option_element.text(); break; } } } } } // TODO(jhawkins): This is a temporary stop-gap measure designed to prevent // a malicious site from DOS'ing the browser with extremely large profile // data. The correct solution is to parse this data asynchronously. // See http://crbug.com/49332. if (value.size() > kMaxDataLength) value = value.substr(0, kMaxDataLength); field->set_value(value); } // static string16 FormManager::LabelForElement(const WebFormControlElement& element) { // Don't scrape labels for elements we can't possibly autofill anyway. if (!IsAutoFillableElement(element)) return string16(); WebNodeList labels = element.document().getElementsByTagName("label"); for (unsigned i = 0; i < labels.length(); ++i) { WebLabelElement label = labels.item(i).to(); DCHECK(label.hasTagName("label")); if (label.correspondingControl() == element) return FindChildText(label); } // Infer the label from context if not found in label element. return InferLabelForElement(element); } // static bool FormManager::WebFormElementToFormData(const WebFormElement& element, RequirementsMask requirements, ExtractMask extract_mask, FormData* form) { DCHECK(form); const WebFrame* frame = element.document().frame(); if (!frame) return false; if (requirements & REQUIRE_AUTOCOMPLETE && !element.autoComplete()) return false; form->name = element.name(); form->method = element.method(); form->origin = frame->url(); form->action = frame->document().completeURL(element.action()); form->user_submitted = element.wasUserSubmitted(); // If the completed URL is not valid, just use the action we get from // WebKit. if (!form->action.is_valid()) form->action = GURL(element.action()); // A map from a FormField's name to the FormField itself. std::map name_map; // The extracted FormFields. We use pointers so we can store them in // |name_map|. ScopedVector form_fields; WebVector control_elements; element.getFormControlElements(control_elements); // A vector of bools that indicate whether each field in the form meets the // requirements and thus will be in the resulting |form|. std::vector fields_extracted(control_elements.size(), false); for (size_t i = 0; i < control_elements.size(); ++i) { const WebFormControlElement& control_element = control_elements[i]; if (!IsAutoFillableElement(control_element)) continue; const WebInputElement* input_element = toWebInputElement(&control_element); if (requirements & REQUIRE_AUTOCOMPLETE && IsTextInput(input_element) && !input_element->autoComplete()) continue; if (requirements & REQUIRE_ENABLED && !control_element.isEnabled()) continue; // Create a new FormField, fill it out and map it to the field's name. FormField* field = new FormField; WebFormControlElementToFormField(control_element, extract_mask, field); form_fields.push_back(field); // TODO(jhawkins): A label element is mapped to a form control element's id. // field->name() will contain the id only if the name does not exist. Add // an id() method to WebFormControlElement and use that here. name_map[field->name()] = field; fields_extracted[i] = true; } // Don't extract field labels if we have no fields. if (form_fields.empty()) return false; // Loop through the label elements inside the form element. For each label // element, get the corresponding form control element, use the form control // element's name as a key into the map to find the // previously created FormField and set the FormField's label to the // label.firstChild().nodeValue() of the label element. WebNodeList labels = element.getElementsByTagName("label"); for (unsigned i = 0; i < labels.length(); ++i) { WebLabelElement label = labels.item(i).to(); WebFormControlElement field_element = label.correspondingControl().to(); if (field_element.isNull() || !field_element.isFormControlElement() || field_element.formControlType() == WebString::fromUTF8("hidden")) continue; std::map::iterator iter = name_map.find(field_element.nameForAutofill()); if (iter != name_map.end()) iter->second->set_label(FindChildText(label)); } // Loop through the form control elements, extracting the label text from the // DOM. We use the |fields_extracted| vector to make sure we assign the // extracted label to the correct field, as it's possible |form_fields| will // not contain all of the elements in |control_elements|. for (size_t i = 0, field_idx = 0; i < control_elements.size() && field_idx < form_fields.size(); ++i) { // This field didn't meet the requirements, so don't try to find a label for // it. if (!fields_extracted[i]) continue; const WebFormControlElement& control_element = control_elements[i]; if (form_fields[field_idx]->label().empty()) form_fields[field_idx]->set_label(InferLabelForElement(control_element)); ++field_idx; } // Copy the created FormFields into the resulting FormData object. for (ScopedVector::const_iterator iter = form_fields.begin(); iter != form_fields.end(); ++iter) { form->fields.push_back(**iter); } return true; } void FormManager::ExtractForms(const WebFrame* frame) { DCHECK(frame); // Reset the vector of FormElements for this frame. ResetFrame(frame); WebVector web_forms; frame->forms(web_forms); for (size_t i = 0; i < web_forms.size(); ++i) { // Owned by |form_elements_|. FormElement* form_element = new FormElement; form_element->form_element = web_forms[i]; WebVector control_elements; form_element->form_element.getFormControlElements(control_elements); for (size_t j = 0; j < control_elements.size(); ++j) { WebFormControlElement element = control_elements[j]; if (!IsAutoFillableElement(element)) continue; form_element->control_elements.push_back(element); // Save original values of