// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/renderer/autofill/form_manager.h"
#include "base/logging.h"
#include "base/scoped_vector.h"
#include "base/string_util.h"
#include "base/stl_util-inl.h"
#include "base/utf_string_conversions.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebFormControlElement.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebInputElement.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebLabelElement.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebNode.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeList.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebOptionElement.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebSelectElement.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebVector.h"
#include "webkit/glue/form_data.h"
#include "webkit/glue/form_field.h"
#include "webkit/glue/web_io_operators.h"
using webkit_glue::FormData;
using webkit_glue::FormField;
using WebKit::WebDocument;
using WebKit::WebElement;
using WebKit::WebFormControlElement;
using WebKit::WebFormElement;
using WebKit::WebFrame;
using WebKit::WebInputElement;
using WebKit::WebLabelElement;
using WebKit::WebNode;
using WebKit::WebNodeList;
using WebKit::WebOptionElement;
using WebKit::WebSelectElement;
using WebKit::WebString;
using WebKit::WebVector;
namespace {
// The number of fields required by AutoFill. Ideally we could send the forms
// to AutoFill no matter how many fields are in the forms; however, finding the
// label for each field is a costly operation and we can't spare the cycles if
// it's not necessary.
// NOTE(review): this constant is not referenced in the visible part of the
// file; confirm where the threshold is enforced.
const size_t kRequiredAutoFillFields = 3;
// The maximum length allowed for form data. Field values longer than this
// many characters are truncated by WebFormControlElementToFormField() before
// being stored on the FormField.
const size_t kMaxDataLength = 1024;
// Returns true when |element| is a non-null text field that is not a password
// field. In HTML5, every text field except the password field is a text input
// eligible for autocomplete. A null |element| yields false.
bool IsTextInput(const WebInputElement* element) {
  return element && element->isTextField() && !element->isPasswordField();
}
bool IsSelectElement(const WebFormControlElement& element) {
return element.formControlType() == ASCIIToUTF16("select-one");
}
// Returns true when |element| carries the "option" tag name.
bool IsOptionElement(const WebElement& element) {
  const bool is_option = element.hasTagName("option");
  return is_option;
}
bool IsAutoFillableElement(const WebFormControlElement& element) {
const WebInputElement* input_element = toWebInputElement(&element);
return IsTextInput(input_element) || IsSelectElement(element);
}
// Recursive worker behind FindChildText() (see below).
// Returns the whitespace-trimmed value of |node|, followed by the text
// gathered from its first child and then from its next sibling, concatenated
// with no separator. Each recursive step consumes one unit of |depth|; an
// empty string is returned once |depth| is exhausted or |node| is null.
string16 FindChildTextInner(const WebNode& node, int depth) {
  // Out of search budget, or walked off the end of a child/sibling chain.
  if (node.isNull() || depth <= 0)
    return string16();

  // Start from this node's own value with surrounding whitespace removed.
  string16 collected = node.nodeValue();
  TrimWhitespace(collected, TRIM_ALL, &collected);

  // Descendants first, then the following siblings.
  const int remaining_depth = depth - 1;
  collected += FindChildTextInner(node.firstChild(), remaining_depth);
  collected += FindChildTextInner(node.nextSibling(), remaining_depth);
  return collected;
}
// Returns the concatenated, whitespace-trimmed values of the nodes reachable
// from the first child of |element| by following child and sibling links.
// This is a faster alternative to |innerText()| for performance critical
// operations, and can be used when the structure below |element| is not
// directly known. The walk is delegated to FindChildTextInner() with a depth
// budget of 10, so nodes more than 10 child/sibling steps away are ignored.
string16 FindChildText(const WebElement& element) {
  const int kMaxSearchDepth = 10;
  return FindChildTextInner(element.firstChild(), kMaxSearchDepth);
}
// Helper for |InferLabelForElement()| that infers a label, if possible, from
// a previous node of |element|.
// The following sources are tried in order, stopping at the first one that
// yields non-empty text:
//   1. the immediately preceding text node,
//   2. a <p> element directly before |element|,
//   3. a <p> element one sibling further back,
//   4. a text node reached by walking back over text nodes and <img> elements,
//   5. a text node or <label> element reached by continuing that walk.
// Returns an empty string when |element| has no previous sibling or none of
// the sources produce text.
string16 InferLabelFromPrevious(const WebFormControlElement& element) {
  string16 inferred_label;
  WebNode previous = element.previousSibling();
  if (previous.isNull())
    return string16();

  if (previous.isTextNode()) {
    inferred_label = previous.nodeValue();
    TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label);
  }

  // If we didn't find text, check for previous paragraph.
  // Eg. <p>Some Text</p><input ...>
  // Note the lack of whitespace between <p> and <input> elements.
  if (inferred_label.empty() && previous.isElementNode()) {
    const WebElement paragraph = previous.to<WebElement>();
    if (paragraph.hasTagName("p"))
      inferred_label = FindChildText(paragraph);
  }

  // If we didn't find paragraph, check for previous paragraph to this.
  // Eg. <p>Some Text</p>   <input ...>
  // Note the whitespace between <p> and <input> elements.
  if (inferred_label.empty()) {
    WebNode sibling = previous.previousSibling();
    if (!sibling.isNull() && sibling.isElementNode()) {
      const WebElement paragraph = sibling.to<WebElement>();
      if (paragraph.hasTagName("p"))
        inferred_label = FindChildText(paragraph);
    }
  }

  // Look for text node prior to <img> tag.
  // Eg. Some Text<img/><input ...>
  // Whitespace-only text nodes and <img> elements are stepped over; any other
  // node ends the walk and leaves |previous| positioned on it.
  while (inferred_label.empty() && !previous.isNull()) {
    if (previous.isTextNode()) {
      inferred_label = previous.nodeValue();
      TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label);
    } else if (previous.isElementNode()) {
      if (!previous.to<WebElement>().hasTagName("img"))
        break;
    } else {
      break;
    }
    previous = previous.previousSibling();
  }

  // Look for label node prior to <input> tag.
  // Eg. <label>Some Text</label><input ...>
  // Continues from wherever the walk above stopped.
  while (inferred_label.empty() && !previous.isNull()) {
    if (previous.isTextNode()) {
      inferred_label = previous.nodeValue();
      TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label);
    } else if (previous.isElementNode()) {
      const WebElement candidate = previous.to<WebElement>();
      if (!candidate.hasTagName("label"))
        break;
      inferred_label = FindChildText(candidate);
    } else {
      break;
    }
    previous = previous.previousSibling();
  }

  return inferred_label;
}
// Helper for |InferLabelForElement()| that infers a label, if possible, from
// surrounding table structure.
// Eg.
Some Text
// Eg.
Some Text
string16 InferLabelFromTable(const WebFormControlElement& element) {
string16 inferred_label;
WebNode parent = element.parentNode();
while (!parent.isNull() && parent.isElementNode() &&
!parent.to().hasTagName("td"))
parent = parent.parentNode();
// Check all previous siblings, skipping non-element nodes, until we find a
// non-empty text block.
WebNode previous = parent;
while (!previous.isNull()) {
if (previous.isElementNode()) {
WebElement e = previous.to();
if (e.hasTagName("td")) {
inferred_label = FindChildText(e);
if (!inferred_label.empty())
break;
}
}
previous = previous.previousSibling();
}
return inferred_label;
}
// Helper for |InferLabelForElement()| that infers a label, if possible, from
// a surrounding div table.
// Eg.
Some Text
string16 InferLabelFromDivTable(const WebFormControlElement& element) {
WebNode parent = element.parentNode();
while (!parent.isNull() && parent.isElementNode() &&
!parent.to().hasTagName("div"))
parent = parent.parentNode();
if (parent.isNull() || !parent.isElementNode())
return string16();
WebElement e = parent.to();
if (e.isNull() || !e.hasTagName("div"))
return string16();
return FindChildText(e);
}
// Helper for |InferLabelForElement()| that infers a label, if possible, from
// a surrounding definition list.
// Eg.
Some Text
// Eg.
Some Text
string16 InferLabelFromDefinitionList(const WebFormControlElement& element) {
string16 inferred_label;
WebNode parent = element.parentNode();
while (!parent.isNull() && parent.isElementNode() &&
!parent.to().hasTagName("dd"))
parent = parent.parentNode();
if (!parent.isNull() && parent.isElementNode()) {
WebElement element = parent.to();
if (element.hasTagName("dd")) {
WebNode previous = parent.previousSibling();
// Skip by any intervening text nodes.
while (!previous.isNull() && previous.isTextNode())
previous = previous.previousSibling();
if (!previous.isNull() && previous.isElementNode()) {
element = previous.to();
if (element.hasTagName("dt")) {
inferred_label = FindChildText(element);
}
}
}
}
return inferred_label;
}
// Infers corresponding label for |element| from surrounding context in the DOM.
// Contents of preceding
tag or preceding text element found in the form.
string16 InferLabelForElement(const WebFormControlElement& element) {
string16 inferred_label = InferLabelFromPrevious(element);
// If we didn't find a label, check for table cell case.
if (inferred_label.empty())
inferred_label = InferLabelFromTable(element);
// If we didn't find a label, check for div table case.
if (inferred_label.empty())
inferred_label = InferLabelFromDivTable(element);
// If we didn't find a label, check for definition list case.
if (inferred_label.empty())
inferred_label = InferLabelFromDefinitionList(element);
return inferred_label;
}
void GetOptionStringsFromElement(const WebSelectElement& select_element,
std::vector* option_strings) {
DCHECK(!select_element.isNull());
DCHECK(option_strings);
option_strings->clear();
WebVector list_items = select_element.listItems();
option_strings->reserve(list_items.size());
for (size_t i = 0; i < list_items.size(); ++i) {
if (IsOptionElement(list_items[i])) {
option_strings->push_back(
list_items[i].toConst().value());
}
}
}
} // namespace
namespace autofill {
// Bookkeeping record for one form: the form element itself plus the
// auto-fillable controls collected from it by ExtractForms().
struct FormManager::FormElement {
  // The form this record describes.
  WebKit::WebFormElement form_element;
  // The controls of |form_element| that passed IsAutoFillableElement().
  std::vector<WebKit::WebFormControlElement> control_elements;
  // Saved control values.
  // NOTE(review): the code that fills this vector lies outside the visible
  // part of the file; the string16 element type is assumed -- confirm.
  std::vector<string16> control_values;
};
// Constructs a FormManager; the constructor body has no work to do.
FormManager::FormManager() {}
// Calls Reset() when the manager is destroyed.
// NOTE(review): Reset() is defined outside the visible part of the file;
// presumably it releases the stored form elements -- confirm.
FormManager::~FormManager() {
Reset();
}
// static
void FormManager::WebFormControlElementToFormField(
const WebFormControlElement& element,
ExtractMask extract_mask,
FormField* field) {
DCHECK(field);
DCHECK(!element.isNull());
// The label is not officially part of a WebFormControlElement; however, the
// labels for all form control elements are scraped from the DOM and set in
// WebFormElementToFormData.
field->set_name(element.nameForAutofill());
field->set_form_control_type(element.formControlType());
if (!IsAutoFillableElement(element))
return;
const WebInputElement* input_element = toWebInputElement(&element);
if (IsTextInput(input_element)) {
field->set_max_length(input_element->maxLength());
field->set_autofilled(input_element->isAutofilled());
} else if (extract_mask & EXTRACT_OPTIONS) {
// Set option strings on the field if available.
DCHECK(IsSelectElement(element));
const WebSelectElement select_element = element.toConst();
std::vector option_strings;
GetOptionStringsFromElement(select_element, &option_strings);
field->set_option_strings(option_strings);
}
if (!(extract_mask & EXTRACT_VALUE))
return;
string16 value;
if (IsTextInput(input_element)) {
value = input_element->value();
} else {
DCHECK(IsSelectElement(element));
const WebSelectElement select_element = element.toConst();
value = select_element.value();
// Convert the |select_element| value to text if requested.
if (extract_mask & EXTRACT_OPTION_TEXT) {
WebVector list_items = select_element.listItems();
for (size_t i = 0; i < list_items.size(); ++i) {
if (IsOptionElement(list_items[i])) {
const WebOptionElement option_element =
list_items[i].toConst();
if (option_element.value() == value) {
value = option_element.text();
break;
}
}
}
}
}
// TODO(jhawkins): This is a temporary stop-gap measure designed to prevent
// a malicious site from DOS'ing the browser with extremely large profile
// data. The correct solution is to parse this data asynchronously.
// See http://crbug.com/49332.
if (value.size() > kMaxDataLength)
value = value.substr(0, kMaxDataLength);
field->set_value(value);
}
// static
// Returns the label text for |element|.
// Elements that are not auto-fillable yield an empty string. Otherwise the
// <label> elements of the document are scanned for one whose corresponding
// control is |element|, and the text of the first match is returned. When no
// <label> matches, the label is inferred from the surrounding DOM via
// InferLabelForElement().
string16 FormManager::LabelForElement(const WebFormControlElement& element) {
  // Don't scrape labels for elements we can't possibly autofill anyway.
  if (!IsAutoFillableElement(element))
    return string16();

  WebNodeList labels = element.document().getElementsByTagName("label");
  for (unsigned i = 0; i < labels.length(); ++i) {
    WebLabelElement label = labels.item(i).to<WebLabelElement>();
    DCHECK(label.hasTagName("label"));
    if (label.correspondingControl() == element)
      return FindChildText(label);
  }

  // Infer the label from context if not found in label element.
  return InferLabelForElement(element);
}
// static
bool FormManager::WebFormElementToFormData(const WebFormElement& element,
RequirementsMask requirements,
ExtractMask extract_mask,
FormData* form) {
DCHECK(form);
const WebFrame* frame = element.document().frame();
if (!frame)
return false;
if (requirements & REQUIRE_AUTOCOMPLETE && !element.autoComplete())
return false;
form->name = element.name();
form->method = element.method();
form->origin = frame->url();
form->action = frame->document().completeURL(element.action());
form->user_submitted = element.wasUserSubmitted();
// If the completed URL is not valid, just use the action we get from
// WebKit.
if (!form->action.is_valid())
form->action = GURL(element.action());
// A map from a FormField's name to the FormField itself.
std::map name_map;
// The extracted FormFields. We use pointers so we can store them in
// |name_map|.
ScopedVector form_fields;
WebVector control_elements;
element.getFormControlElements(control_elements);
// A vector of bools that indicate whether each field in the form meets the
// requirements and thus will be in the resulting |form|.
std::vector fields_extracted(control_elements.size(), false);
for (size_t i = 0; i < control_elements.size(); ++i) {
const WebFormControlElement& control_element = control_elements[i];
if (!IsAutoFillableElement(control_element))
continue;
const WebInputElement* input_element = toWebInputElement(&control_element);
if (requirements & REQUIRE_AUTOCOMPLETE && IsTextInput(input_element) &&
!input_element->autoComplete())
continue;
if (requirements & REQUIRE_ENABLED && !control_element.isEnabled())
continue;
// Create a new FormField, fill it out and map it to the field's name.
FormField* field = new FormField;
WebFormControlElementToFormField(control_element, extract_mask, field);
form_fields.push_back(field);
// TODO(jhawkins): A label element is mapped to a form control element's id.
// field->name() will contain the id only if the name does not exist. Add
// an id() method to WebFormControlElement and use that here.
name_map[field->name()] = field;
fields_extracted[i] = true;
}
// Don't extract field labels if we have no fields.
if (form_fields.empty())
return false;
// Loop through the label elements inside the form element. For each label
// element, get the corresponding form control element, use the form control
// element's name as a key into the map to find the
// previously created FormField and set the FormField's label to the
// label.firstChild().nodeValue() of the label element.
WebNodeList labels = element.getElementsByTagName("label");
for (unsigned i = 0; i < labels.length(); ++i) {
WebLabelElement label = labels.item(i).to();
WebFormControlElement field_element =
label.correspondingControl().to();
if (field_element.isNull() ||
!field_element.isFormControlElement() ||
field_element.formControlType() == WebString::fromUTF8("hidden"))
continue;
std::map::iterator iter =
name_map.find(field_element.nameForAutofill());
if (iter != name_map.end())
iter->second->set_label(FindChildText(label));
}
// Loop through the form control elements, extracting the label text from the
// DOM. We use the |fields_extracted| vector to make sure we assign the
// extracted label to the correct field, as it's possible |form_fields| will
// not contain all of the elements in |control_elements|.
for (size_t i = 0, field_idx = 0;
i < control_elements.size() && field_idx < form_fields.size(); ++i) {
// This field didn't meet the requirements, so don't try to find a label for
// it.
if (!fields_extracted[i])
continue;
const WebFormControlElement& control_element = control_elements[i];
if (form_fields[field_idx]->label().empty())
form_fields[field_idx]->set_label(InferLabelForElement(control_element));
++field_idx;
}
// Copy the created FormFields into the resulting FormData object.
for (ScopedVector::const_iterator iter = form_fields.begin();
iter != form_fields.end(); ++iter) {
form->fields.push_back(**iter);
}
return true;
}
void FormManager::ExtractForms(const WebFrame* frame) {
DCHECK(frame);
// Reset the vector of FormElements for this frame.
ResetFrame(frame);
WebVector web_forms;
frame->forms(web_forms);
for (size_t i = 0; i < web_forms.size(); ++i) {
// Owned by |form_elements_|.
FormElement* form_element = new FormElement;
form_element->form_element = web_forms[i];
WebVector control_elements;
form_element->form_element.getFormControlElements(control_elements);
for (size_t j = 0; j < control_elements.size(); ++j) {
WebFormControlElement element = control_elements[j];
if (!IsAutoFillableElement(element))
continue;
form_element->control_elements.push_back(element);
// Save original values of