3 files changed, 159 insertions, 54 deletions
diff --git a/chrome/browser/renderer_host/translation_service.cc b/chrome/browser/renderer_host/translation_service.cc
index 6ddef41..3413d21 100644
--- a/chrome/browser/renderer_host/translation_service.cc
+++ b/chrome/browser/renderer_host/translation_service.cc
@@ -217,7 +217,14 @@ void SendTranslationRequestTask::Cancel() {
 // TranslationService, public:
 
 TranslationService::TranslationService(IPC::Message::Sender* message_sender)
-    : message_sender_(message_sender) {
+    : message_sender_(message_sender),
+      kCRAnchorTagStart(ASCIIToUTF16("<a _CR_TR_ id='")),
+      kAnchorTagStart(ASCIIToUTF16("<a ")),
+      kClosingAnchorTag(ASCIIToUTF16("</a>")),
+      kQuote(ASCIIToUTF16("'")),
+      kGreaterThan(ASCIIToUTF16(">")),
+      kLessThan(ASCIIToUTF16("<")),
+      kQuoteGreaterThan(ASCIIToUTF16("'>")) {
 }
 
 TranslationService::~TranslationService() {
@@ -490,7 +497,6 @@ void TranslationService::TranslationFailed(const URLFetcher* url_fetcher) {
   SendResponseToRenderer(url_fetcher, 1, TranslationService::TextChunksList());
 }
 
-// static
 string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) {
   // If there is only 1 chunk, we don't need an anchor tag as there is no order
   // to preserve.
@@ -499,69 +505,100 @@ string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) {
 
   string16 str;
   for (size_t i = 0; i < text_chunks.size(); ++i) {
-    str.append(ASCIIToUTF16("<a _CR_TR_ id='"));
+    str.append(kCRAnchorTagStart);
     str.append(IntToString16(i));
-    str.append(ASCIIToUTF16("'>"));
+    str.append(kQuoteGreaterThan);
     str.append(text_chunks[i]);
-    str.append(ASCIIToUTF16("</a>"));
+    str.append(kClosingAnchorTag);
   }
   return str;
 }
 
-// static
+bool TranslationService::FindOpenTagIndex(const string16& text,
+                                          size_t start_index,
+                                          size_t* tag_start_index,
+                                          size_t* tag_end_index,
+                                          int* id) {
+  DCHECK(tag_start_index && tag_end_index && id);
+  size_t text_length = text.length();
+  if (start_index >= text_length)
+    return false;
+
+  *tag_start_index = text.find(kCRAnchorTagStart, start_index);
+  if (*tag_start_index == std::string::npos)
+    return false;
+
+  size_t quote_index = *tag_start_index + kCRAnchorTagStart.length();
+  size_t close_quote_index = text.find(kQuote, quote_index);
+  if (close_quote_index == std::string::npos) {
+    NOTREACHED();
+    return false;  // Not a valid anchor tag.
+  }
+
+  string16 id_str = text.substr(quote_index, close_quote_index - quote_index);
+  // Get the id.
+  if (!StringToInt(id_str, id)) {
+    NOTREACHED();
+    return false;  // Not a valid id, give up.
+  }
+
+  *tag_end_index = text.find(kGreaterThan, close_quote_index);
+  if (*tag_end_index == std::string::npos || *tag_end_index >= text_length)
+    return false;
+  return true;
+}
+
 void TranslationService::SplitIntoTextChunks(const string16& translated_text,
                                              TextChunks* text_chunks) {
-  const string16 kOpenTag = ASCIIToUTF16("<a _CR_TR_ ");
-  const string16 kCloseTag = ASCIIToUTF16("</a>");
-  const size_t open_tag_len = kOpenTag.size();
-
-  size_t start_index = translated_text.find(kOpenTag);
-  if (start_index == std::string::npos) {
+  int id = -1;
+  size_t tag_start_index = 0;
+  size_t tag_end_index = 0;
+  if (!FindOpenTagIndex(translated_text, 0, &tag_start_index, &tag_end_index,
+                        &id)) {
     // No magic anchor tag, it was a single chunk.
     text_chunks->push_back(translated_text);
     return;
   }
 
   // The server might send us some HTML with duplicated and unbalanced tags.
-  // We separate from the open tag to the next open tag located after at least
-  // one close tag.
-  while (start_index != std::string::npos) {
-    size_t stop_index =
-        translated_text.find(kCloseTag, start_index + open_tag_len);
-    string16 chunk;
-    if (stop_index == std::string::npos) {
-      // No close tag.  Just report as one chunk.
-      chunk = translated_text;
-      start_index = std::string::npos;  // So we break on next iteration.
+  // We separate from one tag begining to the next, and merge tags with
+  // duplicate IDs.
+  std::set<int> parsed_tags;
+  string16 chunk;
+  while (tag_start_index != std::string::npos) {
+    int next_id = -1;
+    size_t previous_tag_end_index = tag_end_index;
+    if (!FindOpenTagIndex(translated_text, tag_end_index,
+                          &tag_start_index, &tag_end_index, &next_id)) {
+      // Last tag. Just report as one chunk.
+      chunk = translated_text.substr(previous_tag_end_index + 1);
+      tag_start_index = std::string::npos;  // So we break on next iteration.
     } else {
-      // Now find the next open tag after this close tag.
-      stop_index = translated_text.find(kOpenTag, stop_index);
-      if (stop_index != std::string::npos) {
-        chunk = translated_text.substr(start_index, stop_index - start_index);
-        start_index = stop_index;
-      } else {
-        chunk = translated_text.substr(start_index);
-        start_index = std::string::npos;  // So we break on next iteration.
-      }
+      // Extract the text for this tag.
+      DCHECK(tag_start_index > previous_tag_end_index);
+      chunk =
+          translated_text.substr(previous_tag_end_index + 1,
+                                 tag_start_index - previous_tag_end_index - 1);
     }
     chunk = RemoveTag(chunk);
     // The translation server leaves some ampersand character in the
     // translation.
     chunk = UnescapeForHTML(chunk);
-    text_chunks->push_back(RemoveTag(chunk));
+    if (parsed_tags.count(id) > 0) {
+      // We have already seen this tag, add it to the previous text-chunk.
+      text_chunks->back().append(chunk);
+    } else {
+      text_chunks->push_back(chunk);
+      parsed_tags.insert(id);
+    }
+    id = next_id;
   }
 }
 
-// static
 string16 TranslationService::RemoveTag(const string16& text) {
   // Remove any anchor tags, knowing they could be extra/unbalanced tags.
-  const string16 kStartTag(ASCIIToUTF16("<a "));
-  const string16 kEndTag(ASCIIToUTF16("</a>"));
-  const string16 kGreaterThan(ASCIIToUTF16(">"));
-  const string16 kLessThan(ASCIIToUTF16("<"));
-
   string16 result;
-  size_t start_index = text.find(kStartTag);
+  size_t start_index = text.find(kAnchorTagStart);
   if (start_index == std::string::npos) {
     result = text;
   } else {
@@ -579,7 +616,7 @@ string16 TranslationService::RemoveTag(const string16& text) {
       }
       if (start_index > 0 && first_iter)
         result = text.substr(0, start_index);
-      start_index = text.find(kStartTag, start_index + 1);
+      start_index = text.find(kAnchorTagStart, start_index + 1);
       if (start_index == std::string::npos) {
         result += text.substr(stop_index + 1);
         break;
@@ -590,8 +627,7 @@ string16 TranslationService::RemoveTag(const string16& text) {
   }
 
   // Now remove </a> tags.
-  ReplaceSubstringsAfterOffset(&result, 0,
-                               ASCIIToUTF16("</a>"), ASCIIToUTF16(""));
+  ReplaceSubstringsAfterOffset(&result, 0, kClosingAnchorTag, EmptyString16());
   return result;
 }
 
diff --git a/chrome/browser/renderer_host/translation_service.h b/chrome/browser/renderer_host/translation_service.h
index 273a142..666265b 100644
--- a/chrome/browser/renderer_host/translation_service.h
+++ b/chrome/browser/renderer_host/translation_service.h
@@ -130,16 +130,29 @@ class TranslationService : public URLFetcher::Delegate {
   // Merges all text chunks to be translated into a single string that can be
   // sent to the translate server, surrounding each chunk with an anchor tag
   // to preserve chunk order in the translated version.
-  static string16 MergeTextChunks(const TextChunks& text_chunks);
+  string16 MergeTextChunks(const TextChunks& text_chunks);
 
   // Splits the translated text into its original text chunks, removing the
   // anchor tags wrapper that were added to preserve order.
-  static void SplitIntoTextChunks(const string16& translated_text,
-                                  TextChunks* text_chunks);
+  void SplitIntoTextChunks(const string16& translated_text,
+                           TextChunks* text_chunks);
 
   // Removes the HTML anchor tag surrounding |text| and returns the resulting
   // string.
-  static string16 RemoveTag(const string16& text);
+  string16 RemoveTag(const string16& text);
+
+  // Find the next anchor tag in |text| starting at |start_index|.
+  // Sets |id| (which must be non NULL) to the id property of the tag (which is
+  // expected to be an int). Sets |tag_start_index| and |tag_end_index| to the
+  // index of the beginning/end of the next tag.
+  // Returns true if a tag was found and it is not at the end of the string,
+  // false otherwise in which case |id|, |tag_start_index| and |tag_end_index|
+  // are not set.
+  bool FindOpenTagIndex(const string16& text,
+                        size_t start_index,
+                        size_t* tag_start_index,
+                        size_t* tag_end_index,
+                        int* id);
 
   // Adds |text| to the string request in/out param |request|.  If |request| is
   // empty, then the source, target language as well as the secure parameters
@@ -160,6 +173,15 @@ class TranslationService : public URLFetcher::Delegate {
   TranslationRequestMap pending_translation_requests_;
   TranslationRequestMap pending_secure_translation_requests_;
 
+  // Strings used for parsing.
+  const string16 kCRAnchorTagStart;
+  const string16 kAnchorTagStart;
+  const string16 kClosingAnchorTag;
+  const string16 kQuote;
+  const string16 kGreaterThan;
+  const string16 kLessThan;
+  const string16 kQuoteGreaterThan;
+
   // The size taken by the parameters and separators needed when adding text to
   // a request string.
   static size_t text_param_length_;
diff --git a/chrome/browser/renderer_host/translation_service_unittest.cc b/chrome/browser/renderer_host/translation_service_unittest.cc
index 5d18fe9..f973502 100644
--- a/chrome/browser/renderer_host/translation_service_unittest.cc
+++ b/chrome/browser/renderer_host/translation_service_unittest.cc
@@ -154,14 +154,15 @@ static void ExtractQueryStringsFromUploadData(TestURLFetcher* url_fetcher,
 }
 
 TEST_F(TranslationServiceTest, MergeTestChunks) {
+  TranslationService translation_service(NULL);
   std::vector<string16> input;
   input.push_back(ASCIIToUTF16("Hello"));
-  string16 result = TranslationService::MergeTextChunks(input);
+  string16 result = translation_service.MergeTextChunks(input);
   EXPECT_EQ(ASCIIToUTF16("Hello"), result);
   input.push_back(ASCIIToUTF16(" my name"));
   input.push_back(ASCIIToUTF16(" is"));
   input.push_back(ASCIIToUTF16(" Jay."));
-  result = TranslationService::MergeTextChunks(input);
+  result = translation_service.MergeTextChunks(input);
   EXPECT_EQ(ASCIIToUTF16("<a _CR_TR_ id='0'>Hello</a>"
                          "<a _CR_TR_ id='1'> my name</a>"
                          "<a _CR_TR_ id='2'> is</a>"
@@ -179,11 +180,12 @@ TEST_F(TranslationServiceTest, RemoveTag) {
     "", "Hello", "", "  Link ", "Link", "<a link", "broken", "broken bad bad"
   };
 
+  TranslationService translation_service(NULL);
   ASSERT_EQ(arraysize(kInputs), arraysize(kExpected));
   for (size_t i = 0; i < arraysize(kInputs); ++i) {
     SCOPED_TRACE(::testing::Message::Message() << "Iteration " << i);
     string16 input = ASCIIToUTF16(kInputs[i]);
-    string16 output = TranslationService::RemoveTag(input);
+    string16 output = translation_service.RemoveTag(input);
     EXPECT_EQ(ASCIIToUTF16(kExpected[i]), output);
   }
 }
@@ -191,16 +193,18 @@ TEST_F(TranslationServiceTest, RemoveTag) {
 // Tests that we deal correctly with the various results the translation server
 // can return, including the buggy ones.
 TEST_F(TranslationServiceTest, SplitIntoTextChunks) {
+  TranslationService translation_service(NULL);
+
   // Simple case.
   std::vector<string16> text_chunks;
-  TranslationService::SplitIntoTextChunks(ASCIIToUTF16("Hello"), &text_chunks);
+  translation_service.SplitIntoTextChunks(ASCIIToUTF16("Hello"), &text_chunks);
   ASSERT_EQ(1U, text_chunks.size());
   EXPECT_EQ(ASCIIToUTF16("Hello"), text_chunks[0]);
 
   text_chunks.clear();
 
   // Multiple chunks case, correct syntax.
-  TranslationService::SplitIntoTextChunks(
+  translation_service.SplitIntoTextChunks(
       ASCIIToUTF16("<a _CR_TR_ id='0'>Bonjour</a>"
                    "<a _CR_TR_ id='1'> mon nom</a>"
                    "<a _CR_TR_ id='2'> est</a>"
@@ -216,15 +220,58 @@ TEST_F(TranslationServiceTest, SplitIntoTextChunks) {
   // For info, original input:
   // <a _CR_TRANSLATE_ id='0'> Experience </a><a _CR_TRANSLATE_ id='1'>Nexus One
   // </a><a _CR_TRANSLATE_ id='2'>, the new Android phone from Google</a>
-  TranslationService::SplitIntoTextChunks(
+  translation_service.SplitIntoTextChunks(
       ASCIIToUTF16("<a _CR_TR_ id='0'>Experience</a> <a _CR_TR_ id='1'>Nexus"
                    "<a _CR_TR_ id='2'> One,</a></a> <a _CR_TR_ id='2'>the new "
                    "Android Phone</a>"), &text_chunks);
   ASSERT_EQ(3U, text_chunks.size());
   EXPECT_EQ(ASCIIToUTF16("Experience "), text_chunks[0]);
-  EXPECT_EQ(ASCIIToUTF16("Nexus One, "), text_chunks[1]);
-  EXPECT_EQ(ASCIIToUTF16("the new Android Phone"), text_chunks[2]);
+  EXPECT_EQ(ASCIIToUTF16("Nexus"), text_chunks[1]);
+  EXPECT_EQ(ASCIIToUTF16(" One, the new Android Phone"), text_chunks[2]);
+  text_chunks.clear();
+
+  // Other incorrect case:
+  // Original input:
+  // <a _CR_TR_ id='0'>Benzinpreis-</a><a _CR_TR_ id='1'>vergleich</a>
+  translation_service.SplitIntoTextChunks(
+      ASCIIToUTF16("<a _CR_TR_ id='0'>Gasoline <a _CR_TR_ id='1'>"
+                   "price-comparison</a></a>"), &text_chunks);
+  ASSERT_EQ(2U, text_chunks.size());
+  EXPECT_EQ(ASCIIToUTF16("Gasoline "), text_chunks[0]);
+  EXPECT_EQ(ASCIIToUTF16("price-comparison"), text_chunks[1]);
+  text_chunks.clear();
+
+  // Other incorrect case:
+  // Original input:
+  // <a _CR_TR_ id='0'>Bußgeld-</a><a _CR_TR_ id='1'>rechner</a>
+  translation_service.SplitIntoTextChunks(
+      ASCIIToUTF16("<a _CR_TR_ id='1'><a _CR_TR_ id='0'>Fine-computer</a>"
+                   "</a>"), &text_chunks);
+  ASSERT_EQ(2U, text_chunks.size());
+  EXPECT_EQ(ASCIIToUTF16(""), text_chunks[0]);
+  EXPECT_EQ(ASCIIToUTF16("Fine-computer"), text_chunks[1]);
   text_chunks.clear();
+
+  translation_service.SplitIntoTextChunks(
+      ASCIIToUTF16("<a _CR_TR_ id='0'>The mountain live .</a> "
+          "<a _CR_TR_ id='1'>By Philipp Wittrock</a> <a _CR_TR_ id='0'>are</a> "
+          "<a _CR_TR_ id='2'>more ...</a> <a _CR_TR_ id='3'>Video</a> "
+          "<a _CR_TR_ id='4'>Forum</a>"), &text_chunks);
+  ASSERT_EQ(5U, text_chunks.size());
+  EXPECT_EQ(ASCIIToUTF16("The mountain live . "), text_chunks[0]);
+  EXPECT_EQ(ASCIIToUTF16("By Philipp Wittrock are "), text_chunks[1]);
+  EXPECT_EQ(ASCIIToUTF16("more ... "), text_chunks[2]);
+  EXPECT_EQ(ASCIIToUTF16("Video "), text_chunks[3]);
+  EXPECT_EQ(ASCIIToUTF16("Forum"), text_chunks[4]);
+  text_chunks.clear();
+
+  // Make sure we support ending with a start tag.
+  translation_service.SplitIntoTextChunks(
+      ASCIIToUTF16("<a _CR_TR_ id='0'>Hello</a><a _CR_TR_ id='1'>"),
+      &text_chunks);
+  ASSERT_EQ(2U, text_chunks.size());
+  EXPECT_EQ(ASCIIToUTF16("Hello"), text_chunks[0]);
+  EXPECT_EQ(EmptyString16(), text_chunks[1]);
 }
 
 // Tests that a successful translate works as expected.