// Copyright 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "components/autofill/content/renderer/form_autofill_util.h" #include #include #include "base/command_line.h" #include "base/logging.h" #include "base/memory/scoped_vector.h" #include "base/strings/string_number_conversions.h" #include "base/strings/string_util.h" #include "base/strings/utf_string_conversions.h" #include "build/build_config.h" #include "components/autofill/core/common/autofill_data_validation.h" #include "components/autofill/core/common/autofill_regexes.h" #include "components/autofill/core/common/autofill_switches.h" #include "components/autofill/core/common/autofill_util.h" #include "components/autofill/core/common/form_data.h" #include "components/autofill/core/common/form_field_data.h" #include "third_party/WebKit/public/platform/WebString.h" #include "third_party/WebKit/public/platform/WebVector.h" #include "third_party/WebKit/public/web/WebDocument.h" #include "third_party/WebKit/public/web/WebElement.h" #include "third_party/WebKit/public/web/WebElementCollection.h" #include "third_party/WebKit/public/web/WebFormControlElement.h" #include "third_party/WebKit/public/web/WebFormElement.h" #include "third_party/WebKit/public/web/WebInputElement.h" #include "third_party/WebKit/public/web/WebLabelElement.h" #include "third_party/WebKit/public/web/WebLocalFrame.h" #include "third_party/WebKit/public/web/WebNode.h" #include "third_party/WebKit/public/web/WebOptionElement.h" #include "third_party/WebKit/public/web/WebSelectElement.h" using blink::WebDocument; using blink::WebElement; using blink::WebElementCollection; using blink::WebFormControlElement; using blink::WebFormElement; using blink::WebFrame; using blink::WebInputElement; using blink::WebLabelElement; using blink::WebNode; using blink::WebOptionElement; using blink::WebSelectElement; using blink::WebString; using 
blink::WebVector; namespace autofill { namespace form_util { const size_t kMaxParseableFields = 200; namespace { // A bit field mask for FillForm functions to not fill some fields. enum FieldFilterMask { FILTER_NONE = 0, FILTER_DISABLED_ELEMENTS = 1 << 0, FILTER_READONLY_ELEMENTS = 1 << 1, FILTER_NON_FOCUSABLE_ELEMENTS = 1 << 2, FILTER_ALL_NON_EDITABLE_ELEMENTS = FILTER_DISABLED_ELEMENTS | FILTER_READONLY_ELEMENTS | FILTER_NON_FOCUSABLE_ELEMENTS, }; void TruncateString(base::string16* str, size_t max_length) { if (str->length() > max_length) str->resize(max_length); } bool IsOptionElement(const WebElement& element) { CR_DEFINE_STATIC_LOCAL(WebString, kOption, ("option")); return element.hasHTMLTagName(kOption); } bool IsScriptElement(const WebElement& element) { CR_DEFINE_STATIC_LOCAL(WebString, kScript, ("script")); return element.hasHTMLTagName(kScript); } bool IsNoScriptElement(const WebElement& element) { CR_DEFINE_STATIC_LOCAL(WebString, kNoScript, ("noscript")); return element.hasHTMLTagName(kNoScript); } bool HasTagName(const WebNode& node, const blink::WebString& tag) { return node.isElementNode() && node.toConst().hasHTMLTagName(tag); } bool IsAutofillableElement(const WebFormControlElement& element) { const WebInputElement* input_element = toWebInputElement(&element); return IsAutofillableInputElement(input_element) || IsSelectElement(element) || IsTextAreaElement(element); } bool IsElementInControlElementSet( const WebElement& element, const std::vector& control_elements) { if (!element.isFormControlElement()) return false; const WebFormControlElement form_control_element = element.toConst(); return std::find(control_elements.begin(), control_elements.end(), form_control_element) != control_elements.end(); } bool IsElementInsideFormOrFieldSet(const WebElement& element) { for (WebNode parent_node = element.parentNode(); !parent_node.isNull(); parent_node = parent_node.parentNode()) { if (!parent_node.isElementNode()) continue; WebElement cur_element = 
parent_node.to(); if (cur_element.hasHTMLTagName("form") || cur_element.hasHTMLTagName("fieldset")) { return true; } } return false; } // Returns true if |node| is an element and it is a container type that // InferLabelForElement() can traverse. bool IsTraversableContainerElement(const WebNode& node) { if (!node.isElementNode()) return false; const WebElement element = node.toConst(); return element.hasHTMLTagName("dd") || element.hasHTMLTagName("div") || element.hasHTMLTagName("fieldset") || element.hasHTMLTagName("li") || element.hasHTMLTagName("td") || element.hasHTMLTagName("table"); } // Returns the colspan for a / . Defaults to 1. size_t CalculateTableCellColumnSpan(const WebElement& element) { DCHECK(element.hasHTMLTagName("td") || element.hasHTMLTagName("th")); size_t span = 1; if (element.hasAttribute("colspan")) { base::string16 colspan = element.getAttribute("colspan"); // Do not check return value to accept imperfect conversions. base::StringToSizeT(colspan, &span); // Handle overflow. if (span == std::numeric_limits::max()) span = 1; span = std::max(span, static_cast(1)); } return span; } // Appends |suffix| to |prefix| so that any intermediary whitespace is collapsed // to a single space. If |force_whitespace| is true, then the resulting string // is guaranteed to have a space between |prefix| and |suffix|. Otherwise, the // result includes a space only if |prefix| has trailing whitespace or |suffix| // has leading whitespace. 
// A few examples: // * CombineAndCollapseWhitespace("foo", "bar", false) -> "foobar" // * CombineAndCollapseWhitespace("foo", "bar", true) -> "foo bar" // * CombineAndCollapseWhitespace("foo ", "bar", false) -> "foo bar" // * CombineAndCollapseWhitespace("foo", " bar", false) -> "foo bar" // * CombineAndCollapseWhitespace("foo", " bar", true) -> "foo bar" // * CombineAndCollapseWhitespace("foo ", " bar", false) -> "foo bar" // * CombineAndCollapseWhitespace(" foo", "bar ", false) -> " foobar " // * CombineAndCollapseWhitespace(" foo", "bar ", true) -> " foo bar " const base::string16 CombineAndCollapseWhitespace( const base::string16& prefix, const base::string16& suffix, bool force_whitespace) { base::string16 prefix_trimmed; base::TrimPositions prefix_trailing_whitespace = base::TrimWhitespace(prefix, base::TRIM_TRAILING, &prefix_trimmed); // Recursively compute the children's text. base::string16 suffix_trimmed; base::TrimPositions suffix_leading_whitespace = base::TrimWhitespace(suffix, base::TRIM_LEADING, &suffix_trimmed); if (prefix_trailing_whitespace || suffix_leading_whitespace || force_whitespace) { return prefix_trimmed + base::ASCIIToUTF16(" ") + suffix_trimmed; } else { return prefix_trimmed + suffix_trimmed; } } // This is a helper function for the FindChildText() function (see below). // Search depth is limited with the |depth| parameter. // |divs_to_skip| is a list of
tags to ignore if encountered. base::string16 FindChildTextInner(const WebNode& node, int depth, const std::set& divs_to_skip) { if (depth <= 0 || node.isNull()) return base::string16(); // Skip over comments. if (node.isCommentNode()) return FindChildTextInner(node.nextSibling(), depth - 1, divs_to_skip); if (!node.isElementNode() && !node.isTextNode()) return base::string16(); // Ignore elements known not to contain inferable labels. if (node.isElementNode()) { const WebElement element = node.toConst(); if (IsOptionElement(element) || IsScriptElement(element) || IsNoScriptElement(element) || (element.isFormControlElement() && IsAutofillableElement(element.toConst()))) { return base::string16(); } if (element.hasHTMLTagName("div") && ContainsKey(divs_to_skip, node)) return base::string16(); } // Extract the text exactly at this node. base::string16 node_text = node.nodeValue(); // Recursively compute the children's text. // Preserve inter-element whitespace separation. base::string16 child_text = FindChildTextInner(node.firstChild(), depth - 1, divs_to_skip); bool add_space = node.isTextNode() && node_text.empty(); node_text = CombineAndCollapseWhitespace(node_text, child_text, add_space); // Recursively compute the siblings' text. // Again, preserve inter-element whitespace separation. base::string16 sibling_text = FindChildTextInner(node.nextSibling(), depth - 1, divs_to_skip); add_space = node.isTextNode() && node_text.empty(); node_text = CombineAndCollapseWhitespace(node_text, sibling_text, add_space); return node_text; } // Same as FindChildText() below, but with a list of div nodes to skip. // TODO(thestig): See if other FindChildText() callers can benefit from this. 
base::string16 FindChildTextWithIgnoreList(
    const WebNode& node,
    const std::set<WebNode>& divs_to_skip) {
  // A text node has no children to search; its own value is the answer.
  if (node.isTextNode())
    return node.nodeValue();

  WebNode child = node.firstChild();

  const int kChildSearchDepth = 10;
  base::string16 node_text =
      FindChildTextInner(child, kChildSearchDepth, divs_to_skip);
  base::TrimWhitespace(node_text, base::TRIM_ALL, &node_text);
  return node_text;
}

// Returns the aggregated values of the descendants of |element| that are
// non-empty text nodes.  This is a faster alternative to |innerText()| for
// performance critical operations.  It does a full depth-first search so can be
// used when the structure is not directly known.  However, unlike with
// |innerText()|, the search depth and breadth are limited to a fixed threshold.
// Whitespace is trimmed from text accumulated at descendant nodes.
base::string16 FindChildText(const WebNode& node) {
  return FindChildTextWithIgnoreList(node, std::set<WebNode>());
}

// Shared function for InferLabelFromPrevious() and InferLabelFromNext().
// Walks the siblings of |element| forwards when |forward| is true and
// backwards otherwise, and returns the whitespace-trimmed label text found,
// or an empty string.
base::string16 InferLabelFromSibling(const WebFormControlElement& element,
                                     bool forward) {
  base::string16 inferred_label;
  WebNode sibling = element;
  while (true) {
    sibling = forward ? sibling.nextSibling() : sibling.previousSibling();
    if (sibling.isNull())
      break;

    // Skip over comments.
    if (sibling.isCommentNode())
      continue;

    // Otherwise, only consider normal HTML elements and their contents.
    if (!sibling.isElementNode() && !sibling.isTextNode())
      break;

    // A label might be split across multiple "lightweight" nodes.
    // Coalesce any text contained in multiple consecutive
    //  (a) plain text nodes or
    //  (b) inline HTML elements that are essentially equivalent to text nodes.
    CR_DEFINE_STATIC_LOCAL(WebString, kBold, ("b"));
    CR_DEFINE_STATIC_LOCAL(WebString, kStrong, ("strong"));
    CR_DEFINE_STATIC_LOCAL(WebString, kSpan, ("span"));
    CR_DEFINE_STATIC_LOCAL(WebString, kFont, ("font"));
    if (sibling.isTextNode() ||
        HasTagName(sibling, kBold) || HasTagName(sibling, kStrong) ||
        HasTagName(sibling, kSpan) || HasTagName(sibling, kFont)) {
      base::string16 value = FindChildText(sibling);
      // A text node's value will be empty if it is for a line break.
      bool add_space = sibling.isTextNode() && value.empty();
      inferred_label =
          CombineAndCollapseWhitespace(value, inferred_label, add_space);
      continue;
    }

    // If we have identified a partial label and have reached a non-lightweight
    // element, consider the label to be complete.
    base::string16 trimmed_label;
    base::TrimWhitespace(inferred_label, base::TRIM_ALL, &trimmed_label);
    if (!trimmed_label.empty())
      break;

    // <img> and <br> tags often appear between the input element and its
    // label text, so skip over them.
    CR_DEFINE_STATIC_LOCAL(WebString, kImage, ("img"));
    CR_DEFINE_STATIC_LOCAL(WebString, kBreak, ("br"));
    if (HasTagName(sibling, kImage) || HasTagName(sibling, kBreak))
      continue;

    // NOTE(review): everything from here to the end of this function was
    // missing from the copy under review. It is rebuilt on the assumption
    // that the two expected tags are <p> and <label> -- confirm against
    // upstream before landing.
    // We only expect <p> and <label> tags to contain the full label text.
    CR_DEFINE_STATIC_LOCAL(WebString, kPage, ("p"));
    CR_DEFINE_STATIC_LOCAL(WebString, kLabel, ("label"));
    if (HasTagName(sibling, kPage) || HasTagName(sibling, kLabel))
      inferred_label = FindChildText(sibling);

    break;
  }

  base::TrimWhitespace(inferred_label, base::TRIM_ALL, &inferred_label);
  return inferred_label;
}

// Helper for |InferLabelForElement()| that infers a label, if possible, from
// a previous sibling of |element|,
// e.g. Some Text <input ...>
// or   Some <span>Text</span> <input ...>
// or   <p>Some Text</p><input ...>
// or   <label>Some Text</label> <input ...>
// or   Some Text <img><input ...>
// or   <b>Some Text</b><br/> <input ...>.
base::string16 InferLabelFromPrevious(const WebFormControlElement& element) {
  return InferLabelFromSibling(element, false /* forward? */);
}

// Same as InferLabelFromPrevious(), but in the other direction.
// Useful for cases like: <span><input type="checkbox">Label For Checkbox</span>
base::string16 InferLabelFromNext(const WebFormControlElement& element) {
  return InferLabelFromSibling(element, true /* forward? */);
}

// Helper for |InferLabelForElement()| that infers a label, if possible, from
// the placeholder text. e.g. <input placeholder="foo">
base::string16 InferLabelFromPlaceholder(const WebFormControlElement& element) {
  CR_DEFINE_STATIC_LOCAL(WebString, kPlaceholder, ("placeholder"));
  if (element.hasAttribute(kPlaceholder))
    return element.getAttribute(kPlaceholder);

  return base::string16();
}

// Helper for |InferLabelForElement()| that infers a label, from
// the value attribute when it is present and user has not typed in (if
// element's value attribute is same as the element's value).
base::string16 InferLabelFromValueAttr(const WebFormControlElement& element) {
  CR_DEFINE_STATIC_LOCAL(WebString, kValue, ("value"));
  if (element.hasAttribute(kValue) &&
      element.getAttribute(kValue) == element.value()) {
    return element.getAttribute(kValue);
  }

  return base::string16();
}

// Helper for |InferLabelForElement()| that infers a label, if possible, from
// enclosing list item,
// e.g.
  //      <li>Some Text<input ...><input ...><input ...></li>
base::string16 InferLabelFromListItem(const WebFormControlElement& element) {
  WebNode parent = element.parentNode();
  CR_DEFINE_STATIC_LOCAL(WebString, kListItem, ("li"));
  // Climb through element ancestors until an <li> is reached.
  while (!parent.isNull() && parent.isElementNode() &&
         !parent.to<WebElement>().hasHTMLTagName(kListItem)) {
    parent = parent.parentNode();
  }

  // The climb also stops on a non-element ancestor, so re-check the tag.
  if (!parent.isNull() && HasTagName(parent, kListItem))
    return FindChildText(parent);

  return base::string16();
}

// Helper for |InferLabelForElement()| that infers a label, if possible, from
// enclosing label,
// e.g. <label>Some Text<span><input ...></span></label>
base::string16 InferLabelFromEnclosingLabel(
    const WebFormControlElement& element) {
  WebNode parent = element.parentNode();
  CR_DEFINE_STATIC_LOCAL(WebString, kLabel, ("label"));
  // Climb through element ancestors until a <label> is reached.
  while (!parent.isNull() && parent.isElementNode() &&
         !parent.to<WebElement>().hasHTMLTagName(kLabel)) {
    parent = parent.parentNode();
  }

  // The climb also stops on a non-element ancestor, so re-check the tag.
  if (!parent.isNull() && HasTagName(parent, kLabel))
    return FindChildText(parent);

  return base::string16();
}

// Helper for |InferLabelForElement()| that infers a label, if possible, from
// surrounding table structure,
// e.g. <tr><td>Some Text</td><td><input ...></td></tr>
// or   <tr><th>Some Text</th><td><input ...></td></tr>
// or   <tr><td><b>Some Text</b></td><td><b><input ...></b></td></tr>
// or   <tr><th><b>Some Text</b></th><td><b><input ...></b></td></tr>
base::string16 InferLabelFromTableColumn(const WebFormControlElement& element) {
  CR_DEFINE_STATIC_LOCAL(WebString, kTableCell, ("td"));
  WebNode parent = element.parentNode();
  while (!parent.isNull() && parent.isElementNode() &&
         !parent.to<WebElement>().hasHTMLTagName(kTableCell)) {
    parent = parent.parentNode();
  }

  if (parent.isNull())
    return base::string16();

  // Check all previous siblings, skipping non-element nodes, until we find a
  // non-empty text block.
  base::string16 inferred_label;
  WebNode previous = parent.previousSibling();
  CR_DEFINE_STATIC_LOCAL(WebString, kTableHeader, ("th"));
  while (inferred_label.empty() && !previous.isNull()) {
    if (HasTagName(previous, kTableCell) || HasTagName(previous, kTableHeader))
      inferred_label = FindChildText(previous);

    previous = previous.previousSibling();
  }

  return inferred_label;
}

// Helper for |InferLabelForElement()| that infers a label, if possible, from
// surrounding table structure,
//
// If there are multiple cells and the row with the input matches up with the
// previous row, then look for a specific cell within the previous row.
// e.g. <tr><td>Input 1 label</td><td>Input 2 label</td></tr>
//      <tr><td><input name="input 1"></td><td><input name="input2"></td></tr>
//
// Otherwise, just look in the entire previous row.
// e.g. <tr><td>Some Text</td></tr><tr><td><input ...></td></tr>
base::string16 InferLabelFromTableRow(const WebFormControlElement& element) {
  CR_DEFINE_STATIC_LOCAL(WebString, kTableCell, ("td"));
  base::string16 inferred_label;

  // First find the <td> that contains |element|.
  WebNode cell = element.parentNode();
  while (!cell.isNull()) {
    if (cell.isElementNode() &&
        cell.to<WebElement>().hasHTMLTagName(kTableCell)) {
      break;
    }
    cell = cell.parentNode();
  }

  // Not in a cell - bail out.
  if (cell.isNull())
    return inferred_label;

  // Count the cell holding |element|.
  size_t cell_count = CalculateTableCellColumnSpan(cell.to<WebElement>());
  size_t cell_position = 0;
  size_t cell_position_end = cell_count - 1;

  // Count cells to the left to figure out |element|'s cell's position.
  for (WebNode cell_it = cell.previousSibling(); !cell_it.isNull();
       cell_it = cell_it.previousSibling()) {
    if (cell_it.isElementNode() &&
        cell_it.to<WebElement>().hasHTMLTagName(kTableCell)) {
      cell_position +=
          CalculateTableCellColumnSpan(cell_it.to<WebElement>());
    }
  }

  // Count cells to the right.
  for (WebNode cell_it = cell.nextSibling(); !cell_it.isNull();
       cell_it = cell_it.nextSibling()) {
    if (cell_it.isElementNode() &&
        cell_it.to<WebElement>().hasHTMLTagName(kTableCell)) {
      cell_count += CalculateTableCellColumnSpan(cell_it.to<WebElement>());
    }
  }

  // Combine left + right.
  cell_count += cell_position;
  cell_position_end += cell_position;

  // Find the current row.
  CR_DEFINE_STATIC_LOCAL(WebString, kTableRow, ("tr"));
  WebNode parent = element.parentNode();
  while (!parent.isNull() && parent.isElementNode() &&
         !parent.to<WebElement>().hasHTMLTagName(kTableRow)) {
    parent = parent.parentNode();
  }

  if (parent.isNull())
    return inferred_label;

  // Now find the previous row.
  WebNode row_it = parent.previousSibling();
  while (!row_it.isNull()) {
    if (row_it.isElementNode() &&
        row_it.to<WebElement>().hasHTMLTagName(kTableRow)) {
      break;
    }
    row_it = row_it.previousSibling();
  }

  // If there exists a previous row, check its cells and size. If they align
  // with the current row, infer the label from the cell above.
  if (!row_it.isNull()) {
    WebNode matching_cell;
    size_t prev_row_count = 0;
    WebNode prev_row_it = row_it.firstChild();
    CR_DEFINE_STATIC_LOCAL(WebString, kTableHeader, ("th"));
    while (!prev_row_it.isNull()) {
      if (prev_row_it.isElementNode()) {
        WebElement prev_row_element = prev_row_it.to<WebElement>();
        if (prev_row_element.hasHTMLTagName(kTableCell) ||
            prev_row_element.hasHTMLTagName(kTableHeader)) {
          size_t span = CalculateTableCellColumnSpan(prev_row_element);
          size_t prev_row_count_end = prev_row_count + span - 1;
          // A cell matches only if it covers exactly the same columns as the
          // cell holding |element|.
          if (prev_row_count == cell_position &&
              prev_row_count_end == cell_position_end) {
            matching_cell = prev_row_it;
          }
          prev_row_count += span;
        }
      }
      prev_row_it = prev_row_it.nextSibling();
    }
    if ((cell_count == prev_row_count) && !matching_cell.isNull()) {
      inferred_label = FindChildText(matching_cell);
      if (!inferred_label.empty())
        return inferred_label;
    }
  }

  // If there is no previous row, or if the previous row and current row do not
  // align, check all previous siblings, skipping non-element nodes, until we
  // find a non-empty text block.
  WebNode previous = parent.previousSibling();
  while (inferred_label.empty() && !previous.isNull()) {
    if (HasTagName(previous, kTableRow))
      inferred_label = FindChildText(previous);

    previous = previous.previousSibling();
  }

  return inferred_label;
}

// Helper for |InferLabelForElement()| that infers a label, if possible, from
// a surrounding div table,
// e.g.
//      <div>Some Text<span><input ...></span></div>
// e.g. <div>Some Text</div><div><input ...></div>
//
// Because this is already traversing the <div> structure, if it finds a