diff options
3 files changed, 159 insertions, 54 deletions
diff --git a/chrome/browser/renderer_host/translation_service.cc b/chrome/browser/renderer_host/translation_service.cc index 6ddef41..3413d21 100644 --- a/chrome/browser/renderer_host/translation_service.cc +++ b/chrome/browser/renderer_host/translation_service.cc @@ -217,7 +217,14 @@ void SendTranslationRequestTask::Cancel() { // TranslationService, public: TranslationService::TranslationService(IPC::Message::Sender* message_sender) - : message_sender_(message_sender) { + : message_sender_(message_sender), + kCRAnchorTagStart(ASCIIToUTF16("<a _CR_TR_ id='")), + kAnchorTagStart(ASCIIToUTF16("<a ")), + kClosingAnchorTag(ASCIIToUTF16("</a>")), + kQuote(ASCIIToUTF16("'")), + kGreaterThan(ASCIIToUTF16(">")), + kLessThan(ASCIIToUTF16("<")), + kQuoteGreaterThan(ASCIIToUTF16("'>")) { } TranslationService::~TranslationService() { @@ -490,7 +497,6 @@ void TranslationService::TranslationFailed(const URLFetcher* url_fetcher) { SendResponseToRenderer(url_fetcher, 1, TranslationService::TextChunksList()); } -// static string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) { // If there is only 1 chunk, we don't need an anchor tag as there is no order // to preserve. @@ -499,69 +505,100 @@ string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) { string16 str; for (size_t i = 0; i < text_chunks.size(); ++i) { - str.append(ASCIIToUTF16("<a _CR_TR_ id='")); + str.append(kCRAnchorTagStart); str.append(IntToString16(i)); - str.append(ASCIIToUTF16("'>")); + str.append(kQuoteGreaterThan); str.append(text_chunks[i]); - str.append(ASCIIToUTF16("</a>")); + str.append(kClosingAnchorTag); } return str; } -// static +bool TranslationService::FindOpenTagIndex(const string16& text, + size_t start_index, + size_t* tag_start_index, + size_t* tag_end_index, + int* id) { + DCHECK(tag_start_index && tag_end_index && id); + size_t text_length = text.length(); + if (start_index >= text_length) + return false; + + *tag_start_index = text.find(kCRAnchorTagStart, start_index); + if (*tag_start_index == std::string::npos) + return false; + + size_t quote_index = *tag_start_index + kCRAnchorTagStart.length(); + size_t close_quote_index = text.find(kQuote, quote_index); + if (close_quote_index == std::string::npos) { + NOTREACHED(); + return false; // Not a valid anchor tag. + } + + string16 id_str = text.substr(quote_index, close_quote_index - quote_index); + // Get the id. + if (!StringToInt(id_str, id)) { + NOTREACHED(); + return false; // Not a valid id, give up. + } + + *tag_end_index = text.find(kGreaterThan, close_quote_index); + if (*tag_end_index == std::string::npos || *tag_end_index >= text_length) + return false; + return true; +} + void TranslationService::SplitIntoTextChunks(const string16& translated_text, TextChunks* text_chunks) { - const string16 kOpenTag = ASCIIToUTF16("<a _CR_TR_ "); - const string16 kCloseTag = ASCIIToUTF16("</a>"); - const size_t open_tag_len = kOpenTag.size(); - - size_t start_index = translated_text.find(kOpenTag); - if (start_index == std::string::npos) { + int id = -1; + size_t tag_start_index = 0; + size_t tag_end_index = 0; + if (!FindOpenTagIndex(translated_text, 0, &tag_start_index, &tag_end_index, + &id)) { // No magic anchor tag, it was a single chunk. text_chunks->push_back(translated_text); return; } // The server might send us some HTML with duplicated and unbalanced tags. - // We separate from the open tag to the next open tag located after at least - // one close tag. - while (start_index != std::string::npos) { - size_t stop_index = - translated_text.find(kCloseTag, start_index + open_tag_len); - string16 chunk; - if (stop_index == std::string::npos) { - // No close tag. Just report as one chunk. - chunk = translated_text; - start_index = std::string::npos; // So we break on next iteration. + // We separate from one tag begining to the next, and merge tags with + // duplicate IDs. + std::set<int> parsed_tags; + string16 chunk; + while (tag_start_index != std::string::npos) { + int next_id = -1; + size_t previous_tag_end_index = tag_end_index; + if (!FindOpenTagIndex(translated_text, tag_end_index, + &tag_start_index, &tag_end_index, &next_id)) { + // Last tag. Just report as one chunk. + chunk = translated_text.substr(previous_tag_end_index + 1); + tag_start_index = std::string::npos; // So we break on next iteration. } else { - // Now find the next open tag after this close tag. - stop_index = translated_text.find(kOpenTag, stop_index); - if (stop_index != std::string::npos) { - chunk = translated_text.substr(start_index, stop_index - start_index); - start_index = stop_index; - } else { - chunk = translated_text.substr(start_index); - start_index = std::string::npos; // So we break on next iteration. - } + // Extract the text for this tag. + DCHECK(tag_start_index > previous_tag_end_index); + chunk = + translated_text.substr(previous_tag_end_index + 1, + tag_start_index - previous_tag_end_index - 1); } chunk = RemoveTag(chunk); // The translation server leaves some ampersand character in the // translation. chunk = UnescapeForHTML(chunk); - text_chunks->push_back(RemoveTag(chunk)); + if (parsed_tags.count(id) > 0) { + // We have already seen this tag, add it to the previous text-chunk. + text_chunks->back().append(chunk); + } else { + text_chunks->push_back(chunk); + parsed_tags.insert(id); + } + id = next_id; } } -// static string16 TranslationService::RemoveTag(const string16& text) { // Remove any anchor tags, knowing they could be extra/unbalanced tags. - const string16 kStartTag(ASCIIToUTF16("<a ")); - const string16 kEndTag(ASCIIToUTF16("</a>")); - const string16 kGreaterThan(ASCIIToUTF16(">")); - const string16 kLessThan(ASCIIToUTF16("<")); - string16 result; - size_t start_index = text.find(kStartTag); + size_t start_index = text.find(kAnchorTagStart); if (start_index == std::string::npos) { result = text; } else { @@ -579,7 +616,7 @@ string16 TranslationService::RemoveTag(const string16& text) { } if (start_index > 0 && first_iter) result = text.substr(0, start_index); - start_index = text.find(kStartTag, start_index + 1); + start_index = text.find(kAnchorTagStart, start_index + 1); if (start_index == std::string::npos) { result += text.substr(stop_index + 1); break; @@ -590,8 +627,7 @@ string16 TranslationService::RemoveTag(const string16& text) { } // Now remove </a> tags. - ReplaceSubstringsAfterOffset(&result, 0, - ASCIIToUTF16("</a>"), ASCIIToUTF16("")); + ReplaceSubstringsAfterOffset(&result, 0, kClosingAnchorTag, EmptyString16()); return result; } diff --git a/chrome/browser/renderer_host/translation_service.h b/chrome/browser/renderer_host/translation_service.h index 273a142..666265b 100644 --- a/chrome/browser/renderer_host/translation_service.h +++ b/chrome/browser/renderer_host/translation_service.h @@ -130,16 +130,29 @@ class TranslationService : public URLFetcher::Delegate { // Merges all text chunks to be translated into a single string that can be // sent to the translate server, surrounding each chunk with an anchor tag // to preserve chunk order in the translated version. - static string16 MergeTextChunks(const TextChunks& text_chunks); + string16 MergeTextChunks(const TextChunks& text_chunks); // Splits the translated text into its original text chunks, removing the // anchor tags wrapper that were added to preserve order. - static void SplitIntoTextChunks(const string16& translated_text, - TextChunks* text_chunks); + void SplitIntoTextChunks(const string16& translated_text, + TextChunks* text_chunks); // Removes the HTML anchor tag surrounding |text| and returns the resulting // string. - static string16 RemoveTag(const string16& text); + string16 RemoveTag(const string16& text); + + // Find the next anchor tag in |text| starting at |start_index|. + // Sets |id| (which must be non NULL) to the id property of the tag (which is + // expected to be an int). Sets |tag_start_index| and |tag_end_index| to the + // index of the beginning/end of the next tag. + // Returns true if a tag was found and it is not at the end of the string, + // false otherwise in which case |id|, |tag_start_index| and |tag_end_index| + // are not set. + bool FindOpenTagIndex(const string16& text, + size_t start_index, + size_t* tag_start_index, + size_t* tag_end_index, + int* id); // Adds |text| to the string request in/out param |request|. If |request| is // empty, then the source, target language as well as the secure parameters @@ -160,6 +173,15 @@ class TranslationService : public URLFetcher::Delegate { TranslationRequestMap pending_translation_requests_; TranslationRequestMap pending_secure_translation_requests_; + // Strings used for parsing. + const string16 kCRAnchorTagStart; + const string16 kAnchorTagStart; + const string16 kClosingAnchorTag; + const string16 kQuote; + const string16 kGreaterThan; + const string16 kLessThan; + const string16 kQuoteGreaterThan; + // The size taken by the parameters and separators needed when adding text to // a request string. static size_t text_param_length_; diff --git a/chrome/browser/renderer_host/translation_service_unittest.cc b/chrome/browser/renderer_host/translation_service_unittest.cc index 5d18fe9..f973502 100644 --- a/chrome/browser/renderer_host/translation_service_unittest.cc +++ b/chrome/browser/renderer_host/translation_service_unittest.cc @@ -154,14 +154,15 @@ static void ExtractQueryStringsFromUploadData(TestURLFetcher* url_fetcher, } TEST_F(TranslationServiceTest, MergeTestChunks) { + TranslationService translation_service(NULL); std::vector<string16> input; input.push_back(ASCIIToUTF16("Hello")); - string16 result = TranslationService::MergeTextChunks(input); + string16 result = translation_service.MergeTextChunks(input); EXPECT_EQ(ASCIIToUTF16("Hello"), result); input.push_back(ASCIIToUTF16(" my name")); input.push_back(ASCIIToUTF16(" is")); input.push_back(ASCIIToUTF16(" Jay.")); - result = TranslationService::MergeTextChunks(input); + result = translation_service.MergeTextChunks(input); EXPECT_EQ(ASCIIToUTF16("<a _CR_TR_ id='0'>Hello</a>" "<a _CR_TR_ id='1'> my name</a>" "<a _CR_TR_ id='2'> is</a>" @@ -179,11 +180,12 @@ TEST_F(TranslationServiceTest, RemoveTag) { "", "Hello", "", " Link ", "Link", "<a link", "broken", "broken bad bad" }; + TranslationService translation_service(NULL); ASSERT_EQ(arraysize(kInputs), arraysize(kExpected)); for (size_t i = 0; i < arraysize(kInputs); ++i) { SCOPED_TRACE(::testing::Message::Message() << "Iteration " << i); string16 input = ASCIIToUTF16(kInputs[i]); - string16 output = TranslationService::RemoveTag(input); + string16 output = translation_service.RemoveTag(input); EXPECT_EQ(ASCIIToUTF16(kExpected[i]), output); } } @@ -191,16 +193,18 @@ TEST_F(TranslationServiceTest, RemoveTag) { // Tests that we deal correctly with the various results the translation server // can return, including the buggy ones. TEST_F(TranslationServiceTest, SplitIntoTextChunks) { + TranslationService translation_service(NULL); + // Simple case. std::vector<string16> text_chunks; - TranslationService::SplitIntoTextChunks(ASCIIToUTF16("Hello"), &text_chunks); + translation_service.SplitIntoTextChunks(ASCIIToUTF16("Hello"), &text_chunks); ASSERT_EQ(1U, text_chunks.size()); EXPECT_EQ(ASCIIToUTF16("Hello"), text_chunks[0]); text_chunks.clear(); // Multiple chunks case, correct syntax. - TranslationService::SplitIntoTextChunks( + translation_service.SplitIntoTextChunks( ASCIIToUTF16("<a _CR_TR_ id='0'>Bonjour</a>" "<a _CR_TR_ id='1'> mon nom</a>" "<a _CR_TR_ id='2'> est</a>" @@ -216,15 +220,58 @@ TEST_F(TranslationServiceTest, SplitIntoTextChunks) { // For info, original input: // <a _CR_TRANSLATE_ id='0'> Experience </a><a _CR_TRANSLATE_ id='1'>Nexus One // </a><a _CR_TRANSLATE_ id='2'>, the new Android phone from Google</a> - TranslationService::SplitIntoTextChunks( + translation_service.SplitIntoTextChunks( ASCIIToUTF16("<a _CR_TR_ id='0'>Experience</a> <a _CR_TR_ id='1'>Nexus" "<a _CR_TR_ id='2'> One,</a></a> <a _CR_TR_ id='2'>the new " "Android Phone</a>"), &text_chunks); ASSERT_EQ(3U, text_chunks.size()); EXPECT_EQ(ASCIIToUTF16("Experience "), text_chunks[0]); - EXPECT_EQ(ASCIIToUTF16("Nexus One, "), text_chunks[1]); - EXPECT_EQ(ASCIIToUTF16("the new Android Phone"), text_chunks[2]); + EXPECT_EQ(ASCIIToUTF16("Nexus"), text_chunks[1]); + EXPECT_EQ(ASCIIToUTF16(" One, the new Android Phone"), text_chunks[2]); + text_chunks.clear(); + + // Other incorrect case: + // Original input: + // <a _CR_TR_ id='0'>Benzinpreis-</a><a _CR_TR_ id='1'>vergleich</a> + translation_service.SplitIntoTextChunks( + ASCIIToUTF16("<a _CR_TR_ id='0'>Gasoline <a _CR_TR_ id='1'>" + "price-comparison</a></a>"), &text_chunks); + ASSERT_EQ(2U, text_chunks.size()); + EXPECT_EQ(ASCIIToUTF16("Gasoline "), text_chunks[0]); + EXPECT_EQ(ASCIIToUTF16("price-comparison"), text_chunks[1]); + text_chunks.clear(); + + // Other incorrect case: + // Original input: + // <a _CR_TR_ id='0'>Bußgeld-</a><a _CR_TR_ id='1'>rechner</a> + translation_service.SplitIntoTextChunks( + ASCIIToUTF16("<a _CR_TR_ id='1'><a _CR_TR_ id='0'>Fine-computer</a>" + "</a>"), &text_chunks); + ASSERT_EQ(2U, text_chunks.size()); + EXPECT_EQ(ASCIIToUTF16(""), text_chunks[0]); + EXPECT_EQ(ASCIIToUTF16("Fine-computer"), text_chunks[1]); text_chunks.clear(); + + translation_service.SplitIntoTextChunks( + ASCIIToUTF16("<a _CR_TR_ id='0'>The mountain live .</a> " + "<a _CR_TR_ id='1'>By Philipp Wittrock</a> <a _CR_TR_ id='0'>are</a> " + "<a _CR_TR_ id='2'>more ...</a> <a _CR_TR_ id='3'>Video</a> " + "<a _CR_TR_ id='4'>Forum</a>"), &text_chunks); + ASSERT_EQ(5U, text_chunks.size()); + EXPECT_EQ(ASCIIToUTF16("The mountain live . "), text_chunks[0]); + EXPECT_EQ(ASCIIToUTF16("By Philipp Wittrock are "), text_chunks[1]); + EXPECT_EQ(ASCIIToUTF16("more ... "), text_chunks[2]); + EXPECT_EQ(ASCIIToUTF16("Video "), text_chunks[3]); + EXPECT_EQ(ASCIIToUTF16("Forum"), text_chunks[4]); + text_chunks.clear(); + + // Make sure we support ending with a start tag. + translation_service.SplitIntoTextChunks( + ASCIIToUTF16("<a _CR_TR_ id='0'>Hello</a><a _CR_TR_ id='1'>"), + &text_chunks); + ASSERT_EQ(2U, text_chunks.size()); + EXPECT_EQ(ASCIIToUTF16("Hello"), text_chunks[0]); + EXPECT_EQ(EmptyString16(), text_chunks[1]); } // Tests that a successful translate works as expected. |