summaryrefslogtreecommitdiffstats
path: root/chrome
diff options
context:
space:
mode:
author dhollowa@chromium.org <dhollowa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-03-26 01:58:09 +0000
committer dhollowa@chromium.org <dhollowa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-03-26 01:58:09 +0000
commit 6a3d7b998c691aa54b0f024a2a17bcf6ea2e0c57 (patch)
tree ce14eccb00c57730e1c0067e55d49c96e7567dd5 /chrome
parent 18506ac1d93c3ac122f7ae435b25431c580e9317 (diff)
download chromium_src-6a3d7b998c691aa54b0f024a2a17bcf6ea2e0c57.zip
chromium_src-6a3d7b998c691aa54b0f024a2a17bcf6ea2e0c57.tar.gz
chromium_src-6a3d7b998c691aa54b0f024a2a17bcf6ea2e0c57.tar.bz2
Label scraping for AutoFill.
Adds label scraping to AutoFill. Infers labels from surrounding context of input fields in the WebKit DOM. Specific cases added are: - Text element immediately preceding INPUT element. E.g. First name:<INPUT type="text" id="firstname" value="John"/> - Paragraph element containing text immediately preceding INPUT element. E.g. <P>First name:</P><INPUT type="text" id="firstname" value="John"/> BUG=33031 TEST=FormManagerTest.LabelsFromInferredParagraph, FormManagerTest.LabelsFromInferredText, FormStructureTest.HeuristicsLabelsOnly Review URL: http://codereview.chromium.org/1380002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@42720 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome')
-rw-r--r-- chrome/browser/autofill/form_structure_unittest.cc 98
-rw-r--r-- chrome/renderer/form_manager.cc 53
-rw-r--r-- chrome/renderer/form_manager.h 5
-rw-r--r-- chrome/renderer/form_manager_unittest.cc 165
4 files changed, 277 insertions, 44 deletions
diff --git a/chrome/browser/autofill/form_structure_unittest.cc b/chrome/browser/autofill/form_structure_unittest.cc
index 1245229..a8283a5 100644
--- a/chrome/browser/autofill/form_structure_unittest.cc
+++ b/chrome/browser/autofill/form_structure_unittest.cc
@@ -384,4 +384,102 @@ TEST(FormStructureTest, HeuristicsSample6) {
EXPECT_EQ(ADDRESS_HOME_ZIP, form_structure->field(5)->heuristic_type());
}
+// Tests a sequence of FormFields where only labels are supplied to heuristics
+// for matching. This works because FormField labels are matched in the case
+// that input element ids (or |name| fields) are missing.
+TEST(FormStructureTest, HeuristicsLabelsOnly) {
+ scoped_ptr<FormStructure> form_structure;
+ webkit_glue::FormFieldValues values;
+
+ values.method = ASCIIToUTF16("post");
+ values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("First Name"),
+ string16(),
+ string16(),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text));
+ values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Last Name"),
+ string16(),
+ string16(),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text));
+ values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("EMail"),
+ string16(),
+ string16(),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text));
+ values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Phone"),
+ string16(),
+ string16(),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text));
+ values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Fax"),
+ string16(),
+ string16(),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text));
+ values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Address"),
+ string16(),
+ string16(),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text));
+ values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Address"),
+ string16(),
+ string16(),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text));
+ values.elements.push_back(webkit_glue::FormField(ASCIIToUTF16("Zip code"),
+ string16(),
+ string16(),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text));
+ values.elements.push_back(webkit_glue::FormField(string16(),
+ ASCIIToUTF16("Submit"),
+ string16(),
+ ASCIIToUTF16("submit"),
+ WebInputElement::Submit));
+ form_structure.reset(new FormStructure(values));
+ EXPECT_TRUE(form_structure->IsAutoFillable());
+
+ // Expect the correct number of fields.
+ ASSERT_EQ(8UL, form_structure->field_count());
+
+ // Check that heuristics are initialized as UNKNOWN_TYPE.
+ std::vector<AutoFillField*>::const_iterator iter;
+ size_t i;
+ for (iter = form_structure->begin(), i = 0;
+ iter != form_structure->end();
+ ++iter, ++i) {
+ // Expect last element to be NULL.
+ if (i == form_structure->field_count()) {
+ ASSERT_EQ(static_cast<AutoFillField*>(NULL), *iter);
+ } else {
+ ASSERT_NE(static_cast<AutoFillField*>(NULL), *iter);
+ EXPECT_EQ(UNKNOWN_TYPE, (*iter)->heuristic_type());
+ }
+ }
+
+ // Compute heuristic types.
+ form_structure->GetHeuristicAutoFillTypes();
+ ASSERT_EQ(8U, form_structure->field_count());
+
+ // Check that heuristics are no longer UNKNOWN_TYPE.
+ // First name.
+ EXPECT_EQ(NAME_FIRST, form_structure->field(0)->heuristic_type());
+ // Last name.
+ EXPECT_EQ(NAME_LAST, form_structure->field(1)->heuristic_type());
+ // Email.
+ EXPECT_EQ(EMAIL_ADDRESS, form_structure->field(2)->heuristic_type());
+ // Phone.
+ EXPECT_EQ(PHONE_HOME_WHOLE_NUMBER,
+ form_structure->field(3)->heuristic_type());
+ // Fax. Note, we don't currently match fax.
+ EXPECT_EQ(UNKNOWN_TYPE, form_structure->field(4)->heuristic_type());
+ // Address.
+ EXPECT_EQ(ADDRESS_HOME_LINE1, form_structure->field(5)->heuristic_type());
+ // Address Line 2.
+ EXPECT_EQ(ADDRESS_HOME_LINE2, form_structure->field(6)->heuristic_type());
+ // Zip.
+ EXPECT_EQ(ADDRESS_HOME_ZIP, form_structure->field(7)->heuristic_type());
+}
+
} // namespace
diff --git a/chrome/renderer/form_manager.cc b/chrome/renderer/form_manager.cc
index 246c448..2752dcb 100644
--- a/chrome/renderer/form_manager.cc
+++ b/chrome/renderer/form_manager.cc
@@ -202,10 +202,10 @@ void FormManager::FormElementToFormData(WebFrame* frame,
string16 form_control_type = input_element.formControlType();
WebInputElement::InputType input_type = input_element.inputType();
FormField field = FormField(label,
- name,
- value,
- form_control_type,
- input_type);
+ name,
+ value,
+ form_control_type,
+ input_type);
form->fields.push_back(field);
}
}
@@ -221,5 +221,48 @@ string16 FormManager::LabelForElement(const WebInputElement& element) {
return label.innerText();
}
}
- return string16();
+
+ // Infer the label from context if not found in label element.
+ return FormManager::InferLabelForElement(element);
+}
+
+// static
+string16 FormManager::InferLabelForElement(const WebInputElement& element) {
+ string16 inferred_label;
+ WebNode previous = element.previousSibling();
+ if (!previous.isNull()) {
+ if (previous.isTextNode()) {
+ inferred_label = previous.nodeValue();
+ TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label);
+ }
+
+ // If we didn't find text, check for previous paragraph.
+ // Eg. <p>Some Text</p><input ...>
+ // Note the lack of whitespace between <p> and <input> elements.
+ if (inferred_label.empty()) {
+ if (previous.isElementNode()) {
+ WebElement element = previous.toElement<WebElement>();
+ if (element.hasTagName("p")) {
+ inferred_label = element.innerText();
+ TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label);
+ }
+ }
+ }
+
+ // If we didn't find paragraph, check for previous paragraph to this.
+ // Eg. <p>Some Text</p> <input ...>
+ // Note the whitespace between <p> and <input> elements.
+ if (inferred_label.empty()) {
+ previous = previous.previousSibling();
+ if (!previous.isNull() && previous.isElementNode()) {
+ WebElement element = previous.toElement<WebElement>();
+ if (element.hasTagName("p")) {
+ inferred_label = element.innerText();
+ TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label);
+ }
+ }
+ }
+ }
+
+ return inferred_label;
}
diff --git a/chrome/renderer/form_manager.h b/chrome/renderer/form_manager.h
index a6805b2..4928838 100644
--- a/chrome/renderer/form_manager.h
+++ b/chrome/renderer/form_manager.h
@@ -80,6 +80,11 @@ class FormManager {
// Returns the corresponding label for |element|.
static string16 LabelForElement(const WebKit::WebInputElement& element);
+ // Infers the corresponding label for |element| from surrounding context in
+ // the DOM: the contents of a preceding <p> tag or a preceding text element
+ // found in the form.
+ static string16 InferLabelForElement(const WebKit::WebInputElement& element);
+
// The map of form elements.
WebFrameFormElementMap form_elements_map_;
diff --git a/chrome/renderer/form_manager_unittest.cc b/chrome/renderer/form_manager_unittest.cc
index 418afd8..c288d19 100644
--- a/chrome/renderer/form_manager_unittest.cc
+++ b/chrome/renderer/form_manager_unittest.cc
@@ -19,20 +19,19 @@ using WebKit::WebString;
using webkit_glue::FormData;
using webkit_glue::FormField;
-class FormManagerTest : public RenderViewTest {
- public:
- FormManagerTest() {}
-};
+namespace {
+
+typedef RenderViewTest FormManagerTest;
TEST_F(FormManagerTest, ExtractForms) {
LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">"
- " <INPUT type=\"text\" id=\"firstname\" value=\"John\">"
- " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">"
- " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">"
+ " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>"
+ " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>"
+ " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>"
"</FORM>");
WebFrame* web_frame = GetMainFrame();
- ASSERT_TRUE(web_frame);
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
FormManager form_manager;
form_manager.ExtractForms(web_frame);
@@ -67,16 +66,16 @@ TEST_F(FormManagerTest, ExtractForms) {
TEST_F(FormManagerTest, ExtractMultipleForms) {
LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">"
- " <INPUT type=\"text\" id=\"firstname\" value=\"John\">"
- " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">"
+ " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>"
+ " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>"
"</FORM>"
"<FORM name=\"TestForm2\" action=\"http://zoo.com\" method=\"post\">"
- " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">"
- " <INPUT type=\"submit\" name=\"second\" value=\"Submit\">"
+ " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>"
+ " <INPUT type=\"submit\" name=\"second\" value=\"Submit\"/>"
"</FORM>");
WebFrame* web_frame = GetMainFrame();
- ASSERT_TRUE(web_frame);
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
FormManager form_manager;
form_manager.ExtractForms(web_frame);
@@ -128,12 +127,12 @@ TEST_F(FormManagerTest, GetFormsAutocomplete) {
// Form is not auto-completable due to autocomplete=off.
LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\""
" autocomplete=off>"
- " <INPUT type=\"text\" id=\"firstname\" value=\"John\">"
- " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">"
+ " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>"
+ " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>"
"</FORM>");
WebFrame* web_frame = GetMainFrame();
- ASSERT_TRUE(web_frame);
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
FormManager form_manager;
form_manager.ExtractForms(web_frame);
@@ -152,12 +151,12 @@ TEST_F(FormManagerTest, GetFormsAutocomplete) {
LoadHTML("<FORM name=\"TestForm\" action=\"http://abc.com\" method=\"post\">"
" <INPUT type=\"text\" id=\"firstname\" value=\"John\""
" autocomplete=off>"
- " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">"
- " <INPUT type=\"submit\" name=\"reply\" value=\"Send\">"
+ " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>"
+ " <INPUT type=\"submit\" name=\"reply\" value=\"Send\"/>"
"</FORM>");
web_frame = GetMainFrame();
- ASSERT_TRUE(web_frame);
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
form_manager.Reset();
form_manager.ExtractForms(web_frame);
@@ -188,13 +187,13 @@ TEST_F(FormManagerTest, GetFormsAutocomplete) {
TEST_F(FormManagerTest, GetFormsElementsEnabled) {
// The firstname element is not enabled due to disabled being set.
LoadHTML("<FORM name=\"TestForm\" action=\"http://xyz.com\" method=\"post\">"
- " <INPUT disabled type=\"text\" id=\"firstname\" value=\"John\">"
- " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">"
- " <INPUT type=\"submit\" name=\"submit\" value=\"Send\">"
+ " <INPUT disabled type=\"text\" id=\"firstname\" value=\"John\"/>"
+ " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>"
+ " <INPUT type=\"submit\" name=\"submit\" value=\"Send\"/>"
"</FORM>");
WebFrame* web_frame = GetMainFrame();
- ASSERT_TRUE(web_frame);
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
FormManager form_manager;
form_manager.ExtractForms(web_frame);
@@ -224,13 +223,13 @@ TEST_F(FormManagerTest, GetFormsElementsEnabled) {
TEST_F(FormManagerTest, FindForm) {
LoadHTML("<FORM name=\"TestForm\" action=\"http://buh.com\" method=\"post\">"
- " <INPUT type=\"text\" id=\"firstname\" value=\"John\">"
- " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">"
- " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">"
+ " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>"
+ " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>"
+ " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>"
"</FORM>");
WebFrame* web_frame = GetMainFrame();
- ASSERT_TRUE(web_frame);
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
FormManager form_manager;
form_manager.ExtractForms(web_frame);
@@ -273,13 +272,13 @@ TEST_F(FormManagerTest, FindForm) {
TEST_F(FormManagerTest, FillForm) {
LoadHTML("<FORM name=\"TestForm\" action=\"http://buh.com\" method=\"post\">"
- " <INPUT type=\"text\" id=\"firstname\">"
- " <INPUT type=\"text\" id=\"lastname\">"
- " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">"
+ " <INPUT type=\"text\" id=\"firstname\"/>"
+ " <INPUT type=\"text\" id=\"lastname\"/>"
+ " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>"
"</FORM>");
WebFrame* web_frame = GetMainFrame();
- ASSERT_TRUE(web_frame);
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
FormManager form_manager;
form_manager.ExtractForms(web_frame);
@@ -352,13 +351,13 @@ TEST_F(FormManagerTest, FillForm) {
TEST_F(FormManagerTest, Reset) {
LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">"
- " <INPUT type=\"text\" id=\"firstname\" value=\"John\">"
- " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">"
- " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">"
+ " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>"
+ " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>"
+ " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>"
"</FORM>");
WebFrame* web_frame = GetMainFrame();
- ASSERT_TRUE(web_frame);
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
FormManager form_manager;
form_manager.ExtractForms(web_frame);
@@ -378,14 +377,57 @@ TEST_F(FormManagerTest, Reset) {
TEST_F(FormManagerTest, Labels) {
LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">"
" <LABEL for=\"firstname\"> First name: </LABEL>"
- " <INPUT type=\"text\" id=\"firstname\" value=\"John\">"
+ " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>"
" <LABEL for=\"lastname\"> Last name: </LABEL>"
- " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\">"
- " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\">"
+ " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>"
+ " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>"
+ "</FORM>");
+
+ WebFrame* web_frame = GetMainFrame();
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
+
+ FormManager form_manager;
+ form_manager.ExtractForms(web_frame);
+
+ std::vector<FormData> forms;
+ form_manager.GetForms(&forms, FormManager::REQUIRE_NONE);
+ ASSERT_EQ(1U, forms.size());
+
+ const FormData& form = forms[0];
+ EXPECT_EQ(ASCIIToUTF16("TestForm"), form.name);
+ EXPECT_EQ(GURL(web_frame->url()), form.origin);
+ EXPECT_EQ(GURL("http://cnn.com"), form.action);
+
+ const std::vector<FormField>& fields = form.fields;
+ ASSERT_EQ(3U, fields.size());
+ EXPECT_EQ(FormField(ASCIIToUTF16("First name:"),
+ ASCIIToUTF16("firstname"),
+ ASCIIToUTF16("John"),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text), fields[0]);
+ EXPECT_EQ(FormField(ASCIIToUTF16("Last name:"),
+ ASCIIToUTF16("lastname"),
+ ASCIIToUTF16("Smith"),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text), fields[1]);
+ EXPECT_EQ(FormField(string16(),
+ ASCIIToUTF16("reply-send"),
+ ASCIIToUTF16("Send"),
+ ASCIIToUTF16("submit"),
+ WebInputElement::Submit), fields[2]);
+}
+
+TEST_F(FormManagerTest, LabelsFromInferredText) {
+ LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">"
+ " First name:"
+ " <INPUT type=\"text\" id=\"firstname\" value=\"John\"/>"
+ " Last name:"
+ " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>"
+ " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>"
"</FORM>");
WebFrame* web_frame = GetMainFrame();
- ASSERT_TRUE(web_frame);
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
FormManager form_manager;
form_manager.ExtractForms(web_frame);
@@ -417,3 +459,48 @@ TEST_F(FormManagerTest, Labels) {
ASCIIToUTF16("submit"),
WebInputElement::Submit), fields[2]);
}
+
+TEST_F(FormManagerTest, LabelsFromInferredParagraph) {
+ LoadHTML("<FORM name=\"TestForm\" action=\"http://cnn.com\" method=\"post\">"
+ " <P>First name:</P><INPUT type=\"text\" "
+ " id=\"firstname\" value=\"John\"/>"
+ " <P>Last name:</P>"
+ " <INPUT type=\"text\" id=\"lastname\" value=\"Smith\"/>"
+ " <INPUT type=\"submit\" name=\"reply-send\" value=\"Send\"/>"
+ "</FORM>");
+
+ WebFrame* web_frame = GetMainFrame();
+ ASSERT_NE(static_cast<WebFrame*>(NULL), web_frame);
+
+ FormManager form_manager;
+ form_manager.ExtractForms(web_frame);
+
+ std::vector<FormData> forms;
+ form_manager.GetForms(&forms, FormManager::REQUIRE_NONE);
+ ASSERT_EQ(1U, forms.size());
+
+ const FormData& form = forms[0];
+ EXPECT_EQ(ASCIIToUTF16("TestForm"), form.name);
+ EXPECT_EQ(GURL(web_frame->url()), form.origin);
+ EXPECT_EQ(GURL("http://cnn.com"), form.action);
+
+ const std::vector<FormField>& fields = form.fields;
+ ASSERT_EQ(3U, fields.size());
+ EXPECT_EQ(FormField(ASCIIToUTF16("First name:"),
+ ASCIIToUTF16("firstname"),
+ ASCIIToUTF16("John"),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text), fields[0]);
+ EXPECT_EQ(FormField(ASCIIToUTF16("Last name:"),
+ ASCIIToUTF16("lastname"),
+ ASCIIToUTF16("Smith"),
+ ASCIIToUTF16("text"),
+ WebInputElement::Text), fields[1]);
+ EXPECT_EQ(FormField(string16(),
+ ASCIIToUTF16("reply-send"),
+ ASCIIToUTF16("Send"),
+ ASCIIToUTF16("submit"),
+ WebInputElement::Submit), fields[2]);
+}
+
+} // namespace