From 1be7eadb032256687abda35ae4b2ba4770757c70 Mon Sep 17 00:00:00 2001 From: "jcampan@chromium.org" Date: Fri, 12 Feb 2010 20:02:02 +0000 Subject: Making the parsing of the response received from the translate server deal with more bad results. The server can send unmatched and duplicated tags. It's paramount for us to get as many text chunks out as we sent in. This is now we are trying to do when parsing the response. BUG=34854 TEST=Run the unit-tests. Reproduce steps in bug. Review URL: http://codereview.chromium.org/603037 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@38925 0039d316-1c4b-4281-b951-d872f2087c98 --- .../browser/renderer_host/translation_service.cc | 120 +++++++++++++-------- chrome/browser/renderer_host/translation_service.h | 30 +++++- .../renderer_host/translation_service_unittest.cc | 63 +++++++++-- 3 files changed, 159 insertions(+), 54 deletions(-) diff --git a/chrome/browser/renderer_host/translation_service.cc b/chrome/browser/renderer_host/translation_service.cc index 6ddef41..3413d21 100644 --- a/chrome/browser/renderer_host/translation_service.cc +++ b/chrome/browser/renderer_host/translation_service.cc @@ -217,7 +217,14 @@ void SendTranslationRequestTask::Cancel() { // TranslationService, public: TranslationService::TranslationService(IPC::Message::Sender* message_sender) - : message_sender_(message_sender) { + : message_sender_(message_sender), + kCRAnchorTagStart(ASCIIToUTF16("")), + kLessThan(ASCIIToUTF16("<")), + kQuoteGreaterThan(ASCIIToUTF16("'>")) { } TranslationService::~TranslationService() { @@ -490,7 +497,6 @@ void TranslationService::TranslationFailed(const URLFetcher* url_fetcher) { SendResponseToRenderer(url_fetcher, 1, TranslationService::TextChunksList()); } -// static string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) { // If there is only 1 chunk, we don't need an anchor tag as there is no order // to preserve. @@ -499,69 +505,100 @@ string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) { string16 str; for (size_t i = 0; i < text_chunks.size(); ++i) { - str.append(ASCIIToUTF16("")); + str.append(kQuoteGreaterThan); str.append(text_chunks[i]); - str.append(ASCIIToUTF16("")); + str.append(kClosingAnchorTag); } return str; } -// static +bool TranslationService::FindOpenTagIndex(const string16& text, + size_t start_index, + size_t* tag_start_index, + size_t* tag_end_index, + int* id) { + DCHECK(tag_start_index && tag_end_index && id); + size_t text_length = text.length(); + if (start_index >= text_length) + return false; + + *tag_start_index = text.find(kCRAnchorTagStart, start_index); + if (*tag_start_index == std::string::npos) + return false; + + size_t quote_index = *tag_start_index + kCRAnchorTagStart.length(); + size_t close_quote_index = text.find(kQuote, quote_index); + if (close_quote_index == std::string::npos) { + NOTREACHED(); + return false; // Not a valid anchor tag. + } + + string16 id_str = text.substr(quote_index, close_quote_index - quote_index); + // Get the id. + if (!StringToInt(id_str, id)) { + NOTREACHED(); + return false; // Not a valid id, give up. + } + + *tag_end_index = text.find(kGreaterThan, close_quote_index); + if (*tag_end_index == std::string::npos || *tag_end_index >= text_length) + return false; + return true; +} + void TranslationService::SplitIntoTextChunks(const string16& translated_text, TextChunks* text_chunks) { - const string16 kOpenTag = ASCIIToUTF16(""); - const size_t open_tag_len = kOpenTag.size(); - - size_t start_index = translated_text.find(kOpenTag); - if (start_index == std::string::npos) { + int id = -1; + size_t tag_start_index = 0; + size_t tag_end_index = 0; + if (!FindOpenTagIndex(translated_text, 0, &tag_start_index, &tag_end_index, + &id)) { // No magic anchor tag, it was a single chunk. text_chunks->push_back(translated_text); return; } // The server might send us some HTML with duplicated and unbalanced tags. - // We separate from the open tag to the next open tag located after at least - // one close tag. - while (start_index != std::string::npos) { - size_t stop_index = - translated_text.find(kCloseTag, start_index + open_tag_len); - string16 chunk; - if (stop_index == std::string::npos) { - // No close tag. Just report as one chunk. - chunk = translated_text; - start_index = std::string::npos; // So we break on next iteration. + // We separate from one tag begining to the next, and merge tags with + // duplicate IDs. + std::set parsed_tags; + string16 chunk; + while (tag_start_index != std::string::npos) { + int next_id = -1; + size_t previous_tag_end_index = tag_end_index; + if (!FindOpenTagIndex(translated_text, tag_end_index, + &tag_start_index, &tag_end_index, &next_id)) { + // Last tag. Just report as one chunk. + chunk = translated_text.substr(previous_tag_end_index + 1); + tag_start_index = std::string::npos; // So we break on next iteration. } else { - // Now find the next open tag after this close tag. - stop_index = translated_text.find(kOpenTag, stop_index); - if (stop_index != std::string::npos) { - chunk = translated_text.substr(start_index, stop_index - start_index); - start_index = stop_index; - } else { - chunk = translated_text.substr(start_index); - start_index = std::string::npos; // So we break on next iteration. - } + // Extract the text for this tag. + DCHECK(tag_start_index > previous_tag_end_index); + chunk = + translated_text.substr(previous_tag_end_index + 1, + tag_start_index - previous_tag_end_index - 1); } chunk = RemoveTag(chunk); // The translation server leaves some ampersand character in the // translation. chunk = UnescapeForHTML(chunk); - text_chunks->push_back(RemoveTag(chunk)); + if (parsed_tags.count(id) > 0) { + // We have already seen this tag, add it to the previous text-chunk. + text_chunks->back().append(chunk); + } else { + text_chunks->push_back(chunk); + parsed_tags.insert(id); + } + id = next_id; } } -// static string16 TranslationService::RemoveTag(const string16& text) { // Remove any anchor tags, knowing they could be extra/unbalanced tags. - const string16 kStartTag(ASCIIToUTF16("")); - const string16 kGreaterThan(ASCIIToUTF16(">")); - const string16 kLessThan(ASCIIToUTF16("<")); - string16 result; - size_t start_index = text.find(kStartTag); + size_t start_index = text.find(kAnchorTagStart); if (start_index == std::string::npos) { result = text; } else { @@ -579,7 +616,7 @@ string16 TranslationService::RemoveTag(const string16& text) { } if (start_index > 0 && first_iter) result = text.substr(0, start_index); - start_index = text.find(kStartTag, start_index + 1); + start_index = text.find(kAnchorTagStart, start_index + 1); if (start_index == std::string::npos) { result += text.substr(stop_index + 1); break; @@ -590,8 +627,7 @@ string16 TranslationService::RemoveTag(const string16& text) { } // Now remove tags. - ReplaceSubstringsAfterOffset(&result, 0, - ASCIIToUTF16(""), ASCIIToUTF16("")); + ReplaceSubstringsAfterOffset(&result, 0, kClosingAnchorTag, EmptyString16()); return result; } diff --git a/chrome/browser/renderer_host/translation_service.h b/chrome/browser/renderer_host/translation_service.h index 273a142..666265b 100644 --- a/chrome/browser/renderer_host/translation_service.h +++ b/chrome/browser/renderer_host/translation_service.h @@ -130,16 +130,29 @@ class TranslationService : public URLFetcher::Delegate { // Merges all text chunks to be translated into a single string that can be // sent to the translate server, surrounding each chunk with an anchor tag // to preserve chunk order in the translated version. - static string16 MergeTextChunks(const TextChunks& text_chunks); + string16 MergeTextChunks(const TextChunks& text_chunks); // Splits the translated text into its original text chunks, removing the // anchor tags wrapper that were added to preserve order. - static void SplitIntoTextChunks(const string16& translated_text, - TextChunks* text_chunks); + void SplitIntoTextChunks(const string16& translated_text, + TextChunks* text_chunks); // Removes the HTML anchor tag surrounding |text| and returns the resulting // string. - static string16 RemoveTag(const string16& text); + string16 RemoveTag(const string16& text); + + // Find the next anchor tag in |text| starting at |start_index|. + // Sets |id| (which must be non NULL) to the id property of the tag (which is + // expected to be an int). Sets |tag_start_index| and |tag_end_index| to the + // index of the beginning/end of the next tag. + // Returns true if a tag was found and it is not at the end of the string, + // false otherwise in which case |id|, |tag_start_index| and |tag_end_index| + // are not set. + bool FindOpenTagIndex(const string16& text, + size_t start_index, + size_t* tag_start_index, + size_t* tag_end_index, + int* id); // Adds |text| to the string request in/out param |request|. If |request| is // empty, then the source, target language as well as the secure parameters @@ -160,6 +173,15 @@ class TranslationService : public URLFetcher::Delegate { TranslationRequestMap pending_translation_requests_; TranslationRequestMap pending_secure_translation_requests_; + // Strings used for parsing. + const string16 kCRAnchorTagStart; + const string16 kAnchorTagStart; + const string16 kClosingAnchorTag; + const string16 kQuote; + const string16 kGreaterThan; + const string16 kLessThan; + const string16 kQuoteGreaterThan; + // The size taken by the parameters and separators needed when adding text to // a request string. static size_t text_param_length_; diff --git a/chrome/browser/renderer_host/translation_service_unittest.cc b/chrome/browser/renderer_host/translation_service_unittest.cc index 5d18fe9..f973502 100644 --- a/chrome/browser/renderer_host/translation_service_unittest.cc +++ b/chrome/browser/renderer_host/translation_service_unittest.cc @@ -154,14 +154,15 @@ static void ExtractQueryStringsFromUploadData(TestURLFetcher* url_fetcher, } TEST_F(TranslationServiceTest, MergeTestChunks) { + TranslationService translation_service(NULL); std::vector input; input.push_back(ASCIIToUTF16("Hello")); - string16 result = TranslationService::MergeTextChunks(input); + string16 result = translation_service.MergeTextChunks(input); EXPECT_EQ(ASCIIToUTF16("Hello"), result); input.push_back(ASCIIToUTF16(" my name")); input.push_back(ASCIIToUTF16(" is")); input.push_back(ASCIIToUTF16(" Jay.")); - result = TranslationService::MergeTextChunks(input); + result = translation_service.MergeTextChunks(input); EXPECT_EQ(ASCIIToUTF16("Hello" " my name" " is" @@ -179,11 +180,12 @@ TEST_F(TranslationServiceTest, RemoveTag) { "", "Hello", "", " Link ", "Link", " text_chunks; - TranslationService::SplitIntoTextChunks(ASCIIToUTF16("Hello"), &text_chunks); + translation_service.SplitIntoTextChunks(ASCIIToUTF16("Hello"), &text_chunks); ASSERT_EQ(1U, text_chunks.size()); EXPECT_EQ(ASCIIToUTF16("Hello"), text_chunks[0]); text_chunks.clear(); // Multiple chunks case, correct syntax. - TranslationService::SplitIntoTextChunks( + translation_service.SplitIntoTextChunks( ASCIIToUTF16("Bonjour" " mon nom" " est" @@ -216,15 +220,58 @@ TEST_F(TranslationServiceTest, SplitIntoTextChunks) { // For info, original input: // Experience Nexus One // , the new Android phone from Google - TranslationService::SplitIntoTextChunks( + translation_service.SplitIntoTextChunks( ASCIIToUTF16("Experience Nexus" " One, the new " "Android Phone"), &text_chunks); ASSERT_EQ(3U, text_chunks.size()); EXPECT_EQ(ASCIIToUTF16("Experience "), text_chunks[0]); - EXPECT_EQ(ASCIIToUTF16("Nexus One, "), text_chunks[1]); - EXPECT_EQ(ASCIIToUTF16("the new Android Phone"), text_chunks[2]); + EXPECT_EQ(ASCIIToUTF16("Nexus"), text_chunks[1]); + EXPECT_EQ(ASCIIToUTF16(" One, the new Android Phone"), text_chunks[2]); + text_chunks.clear(); + + // Other incorrect case: + // Original input: + // Benzinpreis-vergleich + translation_service.SplitIntoTextChunks( + ASCIIToUTF16("Gasoline " + "price-comparison"), &text_chunks); + ASSERT_EQ(2U, text_chunks.size()); + EXPECT_EQ(ASCIIToUTF16("Gasoline "), text_chunks[0]); + EXPECT_EQ(ASCIIToUTF16("price-comparison"), text_chunks[1]); + text_chunks.clear(); + + // Other incorrect case: + // Original input: + // Bußgeld-rechner + translation_service.SplitIntoTextChunks( + ASCIIToUTF16("Fine-computer" + ""), &text_chunks); + ASSERT_EQ(2U, text_chunks.size()); + EXPECT_EQ(ASCIIToUTF16(""), text_chunks[0]); + EXPECT_EQ(ASCIIToUTF16("Fine-computer"), text_chunks[1]); text_chunks.clear(); + + translation_service.SplitIntoTextChunks( + ASCIIToUTF16("The mountain live . " + "By Philipp Wittrock are " + "more ... Video " + "Forum"), &text_chunks); + ASSERT_EQ(5U, text_chunks.size()); + EXPECT_EQ(ASCIIToUTF16("The mountain live . "), text_chunks[0]); + EXPECT_EQ(ASCIIToUTF16("By Philipp Wittrock are "), text_chunks[1]); + EXPECT_EQ(ASCIIToUTF16("more ... "), text_chunks[2]); + EXPECT_EQ(ASCIIToUTF16("Video "), text_chunks[3]); + EXPECT_EQ(ASCIIToUTF16("Forum"), text_chunks[4]); + text_chunks.clear(); + + // Make sure we support ending with a start tag. + translation_service.SplitIntoTextChunks( + ASCIIToUTF16("Hello"), + &text_chunks); + ASSERT_EQ(2U, text_chunks.size()); + EXPECT_EQ(ASCIIToUTF16("Hello"), text_chunks[0]); + EXPECT_EQ(EmptyString16(), text_chunks[1]); } // Tests that a successful translate works as expected. -- cgit v1.1