diff options
author | dhollowa@chromium.org <dhollowa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-05-13 19:27:44 +0000 |
---|---|---|
committer | dhollowa@chromium.org <dhollowa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-05-13 19:27:44 +0000 |
commit | b9214d4348aed078b9c4ee4660183bb7a3f3ba36 (patch) | |
tree | 99dc0ecc106ccbe6973dcb694b8fd4f4eeb329a7 | |
parent | d91061d4e4bb1ce5c2ff597902daf68b4872cdc8 (diff) | |
download | chromium_src-b9214d4348aed078b9c4ee4660183bb7a3f3ba36.zip chromium_src-b9214d4348aed078b9c4ee4660183bb7a3f3ba36.tar.gz chromium_src-b9214d4348aed078b9c4ee4660183bb7a3f3ba36.tar.bz2 |
AutoFill: scrape labels from nested table contents.
Label scraping in forms is now extended to include text nested within formatting (and other) elements inside a table cell. For example:
<TR>
<TD>
<FONT>
Last name:
</FONT>
</TD>
<TD>
<FONT>
<INPUT type="text" id="lastname" value="Smith"/>
</FONT>
</TD>
</TR>
The "Last name:" text is now correctly scraped.
BUG=38269
TEST=Manual testing of forms: \
http://www.mycontactform.com/samples/rental.php \
http://www.mycontactform.com/samples/real_estate.php \
http://www.mycontactform.com/samples/jobapp.php \
http://www.mycontactform.com/samples/employee_eval.php \
http://www.mycontactform.com/samples/customer_complaint.php \
Unit test: FormManagerTest.LabelsInferredFromTableCellNested.
Review URL: http://codereview.chromium.org/2061008
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@47176 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | chrome/renderer/form_manager.cc | 63 | ||||
-rw-r--r-- | chrome/renderer/form_manager_unittest.cc | 73 |
2 files changed, 116 insertions, 20 deletions
diff --git a/chrome/renderer/form_manager.cc b/chrome/renderer/form_manager.cc index f81d6df..ebdccbc 100644 --- a/chrome/renderer/form_manager.cc +++ b/chrome/renderer/form_manager.cc @@ -45,22 +45,43 @@ namespace { // it's not necessary. const size_t kRequiredAutoFillFields = 3; -// Returns the node value of the first offspring of |element| that is a text -// node. This is a faster alternative to |innerText()| for performance -// critical operations when the child structure of |element| is known. -string16 GetChildText(const WebElement& element) { +// This is a helper function for the FindChildText() function. +// Returns the node value of the descendant or sibling of |node| that is a +// non-empty text node. This is a faster alternative to |innerText()| for +// performance critical operations. It does a full depth-first search so +// can be used when the structure is not directly known. It does not aggregate +// the text of multiple nodes, it just returns the value of the first found. +// "Non-empty" in this case means non-empty after the whitespace has been +// stripped. +string16 FindChildTextInner(const WebNode& node) { string16 element_text; - WebNode child = element.firstChild(); - // Find the text node. - while (!child.isNull() && !child.isTextNode()) - child = child.firstChild(); - if (!child.isNull()) { - element_text = child.nodeValue(); - TrimWhitespace(element_text, TRIM_ALL, &element_text); - } + if (node.isNull()) + return element_text; + + element_text = node.nodeValue(); + TrimWhitespace(element_text, TRIM_ALL, &element_text); + if (!element_text.empty()) + return element_text; + + element_text = FindChildTextInner(node.firstChild()); + if (!element_text.empty()) + return element_text; + + element_text = FindChildTextInner(node.nextSibling()); + if (!element_text.empty()) + return element_text; + return element_text; } +// Returns the node value of the first decendant of |element| that is a +// non-empty text node. 
"Non-empty" in this case means non-empty after the +// whitespace has been stripped. +string16 FindChildText(const WebElement& element) { + WebNode child = element.firstChild(); + return FindChildTextInner(child); +} + } // namespace FormManager::FormManager() { @@ -109,7 +130,7 @@ string16 FormManager::LabelForElement(const WebFormControlElement& element) { if (e.hasTagName("label")) { WebLabelElement label = e.to<WebLabelElement>(); if (label.correspondingControl() == element) - return GetChildText(label); + return FindChildText(label); } } @@ -200,7 +221,7 @@ bool FormManager::WebFormElementToFormData(const WebFormElement& element, std::map<string16, FormField*>::iterator iter = name_map.find(field_element.nameForAutofill()); if (iter != name_map.end()) - iter->second->set_label(GetChildText(label)); + iter->second->set_label(FindChildText(label)); } // Loop through the form control elements, extracting the label text from the @@ -548,8 +569,7 @@ string16 FormManager::InferLabelForElement( if (previous.isElementNode()) { WebElement element = previous.to<WebElement>(); if (element.hasTagName("p")) { - inferred_label = GetChildText(element); - TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label); + inferred_label = FindChildText(element); } } } @@ -562,8 +582,7 @@ string16 FormManager::InferLabelForElement( if (!previous.isNull() && previous.isElementNode()) { WebElement element = previous.to<WebElement>(); if (element.hasTagName("p")) { - inferred_label = GetChildText(element); - TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label); + inferred_label = FindChildText(element); } } } @@ -571,8 +590,13 @@ string16 FormManager::InferLabelForElement( // If we didn't find paragraph, check for table cell case. // Eg. <tr><td>Some Text</td><td><input ...></td></tr> + // Eg. 
<tr><td><b>Some Text</b></td><td><b><input ...></b></td></tr> if (inferred_label.empty()) { WebNode parent = element.parentNode(); + while (!parent.isNull() && parent.isElementNode() && + !parent.to<WebElement>().hasTagName("td")) + parent = parent.parentNode(); + if (!parent.isNull() && parent.isElementNode()) { WebElement element = parent.to<WebElement>(); if (element.hasTagName("td")) { @@ -585,8 +609,7 @@ string16 FormManager::InferLabelForElement( if (!previous.isNull() && previous.isElementNode()) { element = previous.to<WebElement>(); if (element.hasTagName("td")) { - inferred_label = GetChildText(element); - TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label); + inferred_label = FindChildText(element); } } } diff --git a/chrome/renderer/form_manager_unittest.cc b/chrome/renderer/form_manager_unittest.cc index c7f55d4..2a5ca77 100644 --- a/chrome/renderer/form_manager_unittest.cc +++ b/chrome/renderer/form_manager_unittest.cc @@ -742,6 +742,79 @@ TEST_F(FormManagerTest, LabelsInferredFromTableCell) { fields[2]); } +TEST_F(FormManagerTest, LabelsInferredFromTableCellNested) { + LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">" + "<TABLE>" + " <TR>" + " <TD>" + " <FONT>" + " First name:" + " </FONT>" + " <FONT>" + " Bogus" + " </FONT>" + " </TD>" + " <TD>" + " <FONT>" + " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>" + " </FONT>" + " </TD>" + " </TR>" + " <TR>" + " <TD>" + " <FONT>" + " Last name:" + " </FONT>" + " </TD>" + " <TD>" + " <FONT>" + " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>" + " </FONT>" + " </TD>" + " </TR>" + " <TR>" + " <TD></TD>" + " <TD>" + " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>" + " </TD>" + " </TR>" + "</TABLE>" + "</FORM>"); + + WebFrame* web_frame = GetMainFrame(); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); + + FormManager form_manager; + form_manager.ExtractForms(web_frame); + + std::vector<FormData> forms; + 
form_manager.GetForms(FormManager::REQUIRE_NONE, &forms); + ASSERT_EQ(1U, forms.size()); + + const FormData& form = forms[0]; + EXPECT_EQ(ASCIIToUTF16("TestForm"), form.name); + EXPECT_EQ(GURL(web_frame->url()), form.origin); + EXPECT_EQ(GURL("http://cnn.com"), form.action); + + const std::vector<FormField>& fields = form.fields; + ASSERT_EQ(3U, fields.size()); + EXPECT_EQ(FormField(ASCIIToUTF16("First name:"), + ASCIIToUTF16("firstname"), + ASCIIToUTF16("John"), + ASCIIToUTF16("text")), + fields[0]); + EXPECT_EQ(FormField(ASCIIToUTF16("Last name:"), + ASCIIToUTF16("lastname"), + ASCIIToUTF16("Smith"), + ASCIIToUTF16("text")), + fields[1]); + EXPECT_EQ(FormField(string16(), + ASCIIToUTF16("reply-send"), + ASCIIToUTF16("Send"), + ASCIIToUTF16("submit")), + fields[2]); +} + TEST_F(FormManagerTest, InferredLabelsWithSameName) { LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">" " Address Line 1:" |