diff options
author | dhollowa@chromium.org <dhollowa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-03-26 01:58:09 +0000 |
---|---|---|
committer | dhollowa@chromium.org <dhollowa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-03-26 01:58:09 +0000 |
commit | 6a3d7b998c691aa54b0f024a2a17bcf6ea2e0c57 (patch) | |
tree | ce14eccb00c57730e1c0067e55d49c96e7567dd5 /chrome | |
parent | 18506ac1d93c3ac122f7ae435b25431c580e9317 (diff) | |
download | chromium_src-6a3d7b998c691aa54b0f024a2a17bcf6ea2e0c57.zip chromium_src-6a3d7b998c691aa54b0f024a2a17bcf6ea2e0c57.tar.gz chromium_src-6a3d7b998c691aa54b0f024a2a17bcf6ea2e0c57.tar.bz2 |
Label scraping for AutoFill.
Adds label scraping to AutoFill. Infers labels from surrounding context of
input fields in the WebKit DOM. Specific cases added are:
- Text element immediately preceding INPUT element.
E.g. First name:<INPUT type="text" id="firstname" value="John"/>
- Paragraph element containing text immediately preceding INPUT element.
E.g. <P>First name:</P><INPUT type="text" id="firstname" value="John"/>
BUG=33031
TEST=FormManagerTest.LabelsFromInferredParagraph,
FormManagerTest.LabelsFromInferredText,
FormStructureTest.HeuristicsLabelsOnly
Review URL: http://codereview.chromium.org/1380002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@42720 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome')
-rw-r--r-- | chrome/browser/autofill/form_structure_unittest.cc | 98 | ||||
-rw-r--r-- | chrome/renderer/form_manager.cc | 53 | ||||
-rw-r--r-- | chrome/renderer/form_manager.h | 5 | ||||
-rw-r--r-- | chrome/renderer/form_manager_unittest.cc | 165 |
4 files changed, 277 insertions, 44 deletions
diff --git a/chrome/browser/autofill/form_structure_unittest.cc b/chrome/browser/autofill/form_structure_unittest.cc index 1245229..a8283a5 100644 --- a/chrome/browser/autofill/form_structure_unittest.cc +++ b/chrome/browser/autofill/form_structure_unittest.cc @@ -384,4 +384,102 @@ TEST(FormStructureTest, HeuristicsSample6) { EXPECT_EQ(ADDRESS_HOME_ZIP, form_structure->field(5)->heuristic_type()); } +// Tests a sequence of FormFields where only labels are supplied to heuristics +// for matching. This works because FormField labels are matched in the case +// that input element ids (or |name| fields) are missing. +TEST(FormStructureTest, HeuristicsLabelsOnly) { + scoped_ptr<FormStructure> form_structure; + webkit_glue::FormFieldValues values; + + values.method = ASCIIToUTF16("post"); + values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("First Name"), + string16(), + string16(), + ASCIIToUTF16("text"), + WebInputElement::Text)); + values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Last Name"), + string16(), + string16(), + ASCIIToUTF16("text"), + WebInputElement::Text)); + values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("EMail"), + string16(), + string16(), + ASCIIToUTF16("text"), + WebInputElement::Text)); + values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Phone"), + string16(), + string16(), + ASCIIToUTF16("text"), + WebInputElement::Text)); + values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Fax"), + string16(), + string16(), + ASCIIToUTF16("text"), + WebInputElement::Text)); + values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Address"), + string16(), + string16(), + ASCIIToUTF16("text"), + WebInputElement::Text)); + values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Address"), + string16(), + string16(), + ASCIIToUTF16("text"), + WebInputElement::Text)); + values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Zip code"), + string16(), + string16(), + 
ASCIIToUTF16("text"), + WebInputElement::Text)); + values.elements.push_back(webkit_glue::FormField(string16(), + ASCIIToUTF16("Submit"), + string16(), + ASCIIToUTF16("submit"), + WebInputElement::Submit)); + form_structure.reset(new FormStructure(values)); + EXPECT_TRUE(form_structure->IsAutoFillable()); + + // Expect the correct number of fields. + ASSERT_EQ(8UL, form_structure->field_count()); + + // Check that heuristics are initialized as UNKNOWN_TYPE. + std::vector<AutoFillField*>::const_iterator iter; + size_t i; + for (iter = form_structure->begin(), i = 0; + iter != form_structure->end(); + ++iter, ++i) { + // Expect last element to be NULL. + if (i == form_structure->field_count()) { + ASSERT_EQ(static_cast<AutoFillField*>(NULL), *iter); + } else { + ASSERT_NE(static_cast<AutoFillField*>(NULL), *iter); + EXPECT_EQ(UNKNOWN_TYPE, (*iter)->heuristic_type()); + } + } + + // Compute heuristic types. + form_structure->GetHeuristicAutoFillTypes(); + ASSERT_EQ(8U, form_structure->field_count()); + + // Check that heuristics are no longer UNKNOWN_TYPE. + // First name. + EXPECT_EQ(NAME_FIRST, form_structure->field(0)->heuristic_type()); + // Last name. + EXPECT_EQ(NAME_LAST, form_structure->field(1)->heuristic_type()); + // Email. + EXPECT_EQ(EMAIL_ADDRESS, form_structure->field(2)->heuristic_type()); + // Phone. + EXPECT_EQ(PHONE_HOME_WHOLE_NUMBER, + form_structure->field(3)->heuristic_type()); + // Fax. Note, we don't currently match fax. + EXPECT_EQ(UNKNOWN_TYPE, form_structure->field(4)->heuristic_type()); + // Address. + EXPECT_EQ(ADDRESS_HOME_LINE1, form_structure->field(5)->heuristic_type()); + // Address Line 2. + EXPECT_EQ(ADDRESS_HOME_LINE2, form_structure->field(6)->heuristic_type()); + // Zip. 
+ EXPECT_EQ(ADDRESS_HOME_ZIP, form_structure->field(7)->heuristic_type()); +} + } // namespace diff --git a/chrome/renderer/form_manager.cc b/chrome/renderer/form_manager.cc index 246c448..2752dcb 100644 --- a/chrome/renderer/form_manager.cc +++ b/chrome/renderer/form_manager.cc @@ -202,10 +202,10 @@ void FormManager::FormElementToFormData(WebFrame* frame, string16 form_control_type = input_element.formControlType(); WebInputElement::InputType input_type = input_element.inputType(); FormField field = FormField(label, - name, - value, - form_control_type, - input_type); + name, + value, + form_control_type, + input_type); form->fields.push_back(field); } } @@ -221,5 +221,48 @@ string16 FormManager::LabelForElement(const WebInputElement& element) { return label.innerText(); } } - return string16(); + + // Infer the label from context if not found in label element. + return FormManager::InferLabelForElement(element); +} + +// static +string16 FormManager::InferLabelForElement(const WebInputElement& element) { + string16 inferred_label; + WebNode previous = element.previousSibling(); + if (!previous.isNull()) { + if (previous.isTextNode()) { + inferred_label = previous.nodeValue(); + TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label); + } + + // If we didn't find text, check for previous paragraph. + // Eg. <p>Some Text</p><input ...> + // Note the lack of whitespace between <p> and <input> elements. + if (inferred_label.empty()) { + if (previous.isElementNode()) { + WebElement element = previous.toElement<WebElement>(); + if (element.hasTagName("p")) { + inferred_label = element.innerText(); + TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label); + } + } + } + + // If we didn't find paragraph, check for previous paragraph to this. + // Eg. <p>Some Text</p> <input ...> + // Note the whitespace between <p> and <input> elements. 
+ if (inferred_label.empty()) { + previous = previous.previousSibling(); + if (!previous.isNull() && previous.isElementNode()) { + WebElement element = previous.toElement<WebElement>(); + if (element.hasTagName("p")) { + inferred_label = element.innerText(); + TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label); + } + } + } + } + + return inferred_label; } diff --git a/chrome/renderer/form_manager.h b/chrome/renderer/form_manager.h index a6805b2..4928838 100644 --- a/chrome/renderer/form_manager.h +++ b/chrome/renderer/form_manager.h @@ -80,6 +80,11 @@ class FormManager { // Returns the corresponding label for |element|. static string16 LabelForElement(const WebKit::WebInputElement& element); + // Infers corresponding label for |element| from surrounding context in the + // DOM. Contents of preceeding <p> tag or preceeding text element found in + // the form. + static string16 InferLabelForElement(const WebKit::WebInputElement& element); + // The map of form elements. WebFrameFormElementMap form_elements_map_; diff --git a/chrome/renderer/form_manager_unittest.cc b/chrome/renderer/form_manager_unittest.cc index 418afd8..c288d19 100644 --- a/chrome/renderer/form_manager_unittest.cc +++ b/chrome/renderer/form_manager_unittest.cc @@ -19,20 +19,19 @@ using WebKit::WebString; using webkit_glue::FormData; using webkit_glue::FormField; -class FormManagerTest : public RenderViewTest { - public: - FormManagerTest() {} -}; +namespace { + +typedef RenderViewTest FormManagerTest; TEST_F(FormManagerTest, ExtractForms) { LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">" - " <INPUT type=\"text\" id=\"firstname\" value=\"John\">" - " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">" - " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">" + " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>" + " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>" + " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>" "</FORM>"); 
WebFrame* web_frame = GetMainFrame(); - ASSERT_TRUE(web_frame); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); FormManager form_manager; form_manager.ExtractForms(web_frame); @@ -67,16 +66,16 @@ TEST_F(FormManagerTest, ExtractForms) { TEST_F(FormManagerTest, ExtractMultipleForms) { LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">" - " <INPUT type=\"text\" id=\"firstname\" value=\"John\">" - " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">" + " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>" + " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>" "</FORM>" "<FORM name=\"TestForm2\" action=\"http://zoo.com\" method=\"post\">" - " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">" - " <INPUT type=\"submit\" name=\"second\" value=\"Submit\">" + " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>" + " <INPUT type=\"submit\" name=\"second\" value=\"Submit\"/>" "</FORM>"); WebFrame* web_frame = GetMainFrame(); - ASSERT_TRUE(web_frame); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); FormManager form_manager; form_manager.ExtractForms(web_frame); @@ -128,12 +127,12 @@ TEST_F(FormManagerTest, GetFormsAutocomplete) { // Form is not auto-completable due to autocomplete=off. 
LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\"" " autocomplete=off>" - " <INPUT type=\"text\" id=\"firstname\" value=\"John\">" - " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">" + " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>" + " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>" "</FORM>"); WebFrame* web_frame = GetMainFrame(); - ASSERT_TRUE(web_frame); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); FormManager form_manager; form_manager.ExtractForms(web_frame); @@ -152,12 +151,12 @@ TEST_F(FormManagerTest, GetFormsAutocomplete) { LoadHTML("<FORM name=\"TestForm\" action=\"http://abc.com\" method=\"post\">" " <INPUT type=\"text\" id=\"firstname\" value=\"John\"" " autocomplete=off>" - " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">" - " <INPUT type=\"submit\" name=\"reply\" value=\"Send\">" + " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>" + " <INPUT type=\"submit\" name=\"reply\" value=\"Send\"/>" "</FORM>"); web_frame = GetMainFrame(); - ASSERT_TRUE(web_frame); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); form_manager.Reset(); form_manager.ExtractForms(web_frame); @@ -188,13 +187,13 @@ TEST_F(FormManagerTest, GetFormsAutocomplete) { TEST_F(FormManagerTest, GetFormsElementsEnabled) { // The firstname element is not enabled due to disabled being set. 
LoadHTML("<FORM name=\"TestForm\" action=\"http://xyz.com\" method=\"post\">" - " <INPUT disabled type=\"text\" id=\"firstname\" value=\"John\">" - " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">" - " <INPUT type=\"submit\" name=\"submit\" value=\"Send\">" + " <INPUT disabled type=\"text\" id=\"firstname\" value=\"John\"/>" + " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>" + " <INPUT type=\"submit\" name=\"submit\" value=\"Send\"/>" "</FORM>"); WebFrame* web_frame = GetMainFrame(); - ASSERT_TRUE(web_frame); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); FormManager form_manager; form_manager.ExtractForms(web_frame); @@ -224,13 +223,13 @@ TEST_F(FormManagerTest, GetFormsElementsEnabled) { TEST_F(FormManagerTest, FindForm) { LoadHTML("<FORM name=\"TestForm\" action=\"http://buh.com\" method=\"post\">" - " <INPUT type=\"text\" id=\"firstname\" value=\"John\">" - " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">" - " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">" + " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>" + " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>" + " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>" "</FORM>"); WebFrame* web_frame = GetMainFrame(); - ASSERT_TRUE(web_frame); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); FormManager form_manager; form_manager.ExtractForms(web_frame); @@ -273,13 +272,13 @@ TEST_F(FormManagerTest, FindForm) { TEST_F(FormManagerTest, FillForm) { LoadHTML("<FORM name=\"TestForm\" action=\"http://buh.com\" method=\"post\">" - " <INPUT type=\"text\" id=\"firstname\">" - " <INPUT type=\"text\" id=\"lastname\">" - " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">" + " <INPUT type=\"text\" id=\"firstname\"/>" + " <INPUT type=\"text\" id=\"lastname\"/>" + " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>" "</FORM>"); WebFrame* web_frame = GetMainFrame(); - ASSERT_TRUE(web_frame); + ASSERT_NE(static_cast<WebFrame*>(NULL), 
web_frame); FormManager form_manager; form_manager.ExtractForms(web_frame); @@ -352,13 +351,13 @@ TEST_F(FormManagerTest, FillForm) { TEST_F(FormManagerTest, Reset) { LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">" - " <INPUT type=\"text\" id=\"firstname\" value=\"John\">" - " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">" - " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">" + " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>" + " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>" + " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>" "</FORM>"); WebFrame* web_frame = GetMainFrame(); - ASSERT_TRUE(web_frame); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); FormManager form_manager; form_manager.ExtractForms(web_frame); @@ -378,14 +377,57 @@ TEST_F(FormManagerTest, Reset) { TEST_F(FormManagerTest, Labels) { LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">" " <LABEL for=\"firstname\"> First name: </LABEL>" - " <INPUT type=\"text\" id=\"firstname\" value=\"John\">" + " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>" " <LABEL for=\"lastname\"> Last name: </LABEL>" - " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">" - " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">" + " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>" + " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>" + "</FORM>"); + + WebFrame* web_frame = GetMainFrame(); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); + + FormManager form_manager; + form_manager.ExtractForms(web_frame); + + std::vector<FormData> forms; + form_manager.GetForms(&forms, FormManager::REQUIRE_NONE); + ASSERT_EQ(1U, forms.size()); + + const FormData& form = forms[0]; + EXPECT_EQ(ASCIIToUTF16("TestForm"), form.name); + EXPECT_EQ(GURL(web_frame->url()), form.origin); + EXPECT_EQ(GURL("http://cnn.com"), form.action); + + const std::vector<FormField>& fields = form.fields; + 
ASSERT_EQ(3U, fields.size()); + EXPECT_EQ(FormField(ASCIIToUTF16("First name:"), + ASCIIToUTF16("firstname"), + ASCIIToUTF16("John"), + ASCIIToUTF16("text"), + WebInputElement::Text), fields[0]); + EXPECT_EQ(FormField(ASCIIToUTF16("Last name:"), + ASCIIToUTF16("lastname"), + ASCIIToUTF16("Smith"), + ASCIIToUTF16("text"), + WebInputElement::Text), fields[1]); + EXPECT_EQ(FormField(string16(), + ASCIIToUTF16("reply-send"), + ASCIIToUTF16("Send"), + ASCIIToUTF16("submit"), + WebInputElement::Submit), fields[2]); +} + +TEST_F(FormManagerTest, LabelsFromInferredText) { + LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">" + " First name:" + " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>" + " Last name:" + " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>" + " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>" "</FORM>"); WebFrame* web_frame = GetMainFrame(); - ASSERT_TRUE(web_frame); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); FormManager form_manager; form_manager.ExtractForms(web_frame); @@ -417,3 +459,48 @@ TEST_F(FormManagerTest, Labels) { ASCIIToUTF16("submit"), WebInputElement::Submit), fields[2]); } + +TEST_F(FormManagerTest, LabelsFromInferredParagraph) { + LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">" + " <P>First name:</P><INPUT type=\"text\" " + " id=\"firstname\" value=\"John\"/>" + " <P>Last name:</P>" + " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>" + " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>" + "</FORM>"); + + WebFrame* web_frame = GetMainFrame(); + ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame); + + FormManager form_manager; + form_manager.ExtractForms(web_frame); + + std::vector<FormData> forms; + form_manager.GetForms(&forms, FormManager::REQUIRE_NONE); + ASSERT_EQ(1U, forms.size()); + + const FormData& form = forms[0]; + EXPECT_EQ(ASCIIToUTF16("TestForm"), form.name); + EXPECT_EQ(GURL(web_frame->url()), form.origin); + 
EXPECT_EQ(GURL("http://cnn.com"), form.action); + + const std::vector<FormField>& fields = form.fields; + ASSERT_EQ(3U, fields.size()); + EXPECT_EQ(FormField(ASCIIToUTF16("First name:"), + ASCIIToUTF16("firstname"), + ASCIIToUTF16("John"), + ASCIIToUTF16("text"), + WebInputElement::Text), fields[0]); + EXPECT_EQ(FormField(ASCIIToUTF16("Last name:"), + ASCIIToUTF16("lastname"), + ASCIIToUTF16("Smith"), + ASCIIToUTF16("text"), + WebInputElement::Text), fields[1]); + EXPECT_EQ(FormField(string16(), + ASCIIToUTF16("reply-send"), + ASCIIToUTF16("Send"), + ASCIIToUTF16("submit"), + WebInputElement::Submit), fields[2]); +} + +} // namespace |