Making the parsing of the response received from the translate server

deal with more bad results. The server can send unmatched and duplicated tags. It's paramount for us to get as many text chunks out as we sent in. This is now we are trying to do when parsing the response. BUG=34854 TEST=Run the unit-tests. Reproduce steps in bug. Review URL: http://codereview.chromium.org/603037 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@38925 0039d316-1c4b-4281-b951-d872f2087c98
author: jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-02-12 20:02:02 +0000
committer: jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-02-12 20:02:02 +0000
commit: 1be7eadb032256687abda35ae4b2ba4770757c70 (patch)
tree: 78f856b4d56b1d908426fc3606a9b9e612c65490
parent: 8d7727ac112e452b4daa1161d516401120475d68 (diff)
download: chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.zip
chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.tar.gz
chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.tar.bz2
3 files changed, 159 insertions, 54 deletions
diff --git a/chrome/browser/renderer_host/translation_service.cc b/chrome/browser/renderer_host/translation_service.cc
index 6ddef41..3413d21 100644
--- a/chrome/browser/renderer_host/translation_service.cc
+++ b/chrome/browser/renderer_host/translation_service.cc
@@ -217,7 +217,14 @@ void SendTranslationRequestTask::Cancel() {
 // TranslationService, public:
 
 TranslationService::TranslationService(IPC::Message::Sender* message_sender)
-    : message_sender_(message_sender) {
+    : message_sender_(message_sender),
+      kCRAnchorTagStart(ASCIIToUTF16("<a _CR_TR_ id='")),
+      kAnchorTagStart(ASCIIToUTF16("<a ")),
+      kClosingAnchorTag(ASCIIToUTF16("</a>")),
+      kQuote(ASCIIToUTF16("'")),
+      kGreaterThan(ASCIIToUTF16(">")),
+      kLessThan(ASCIIToUTF16("<")),
+      kQuoteGreaterThan(ASCIIToUTF16("'>")) {
 }
 
 TranslationService::~TranslationService() {
@@ -490,7 +497,6 @@ void TranslationService::TranslationFailed(const URLFetcher* url_fetcher) {
   SendResponseToRenderer(url_fetcher, 1, TranslationService::TextChunksList());
 }
 
-// static
 string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) {
   // If there is only 1 chunk, we don't need an anchor tag as there is no order
   // to preserve.
@@ -499,69 +505,100 @@ string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) {
 
   string16 str;
   for (size_t i = 0; i < text_chunks.size(); ++i) {
-    str.append(ASCIIToUTF16("<a _CR_TR_ id='"));
+    str.append(kCRAnchorTagStart);
     str.append(IntToString16(i));
-    str.append(ASCIIToUTF16("'>"));
+    str.append(kQuoteGreaterThan);
     str.append(text_chunks[i]);
-    str.append(ASCIIToUTF16("</a>"));
+    str.append(kClosingAnchorTag);
   }
   return str;
 }
 
-// static
+bool TranslationService::FindOpenTagIndex(const string16& text,
+                                          size_t start_index,
+                                          size_t* tag_start_index,
+                                          size_t* tag_end_index,
+                                          int* id) {
+  DCHECK(tag_start_index && tag_end_index && id);
+  size_t text_length = text.length();
+  if (start_index >= text_length)
+    return false;
+
+  *tag_start_index = text.find(kCRAnchorTagStart, start_index);
+  if (*tag_start_index == std::string::npos)
+    return false;
+
+  size_t quote_index = *tag_start_index + kCRAnchorTagStart.length();
+  size_t close_quote_index = text.find(kQuote, quote_index);
+  if (close_quote_index == std::string::npos) {
+    NOTREACHED();
+    return false;  // Not a valid anchor tag.
+  }
+
+  string16 id_str = text.substr(quote_index, close_quote_index - quote_index);
+  // Get the id.
+  if (!StringToInt(id_str, id)) {
+    NOTREACHED();
+    return false;  // Not a valid id, give up.
+  }
+
+  *tag_end_index = text.find(kGreaterThan, close_quote_index);
+  if (*tag_end_index == std::string::npos || *tag_end_index >= text_length)
+    return false;
+  return true;
+}
+
 void TranslationService::SplitIntoTextChunks(const string16& translated_text,
                                              TextChunks* text_chunks) {
-  const string16 kOpenTag = ASCIIToUTF16("<a _CR_TR_ ");
-  const string16 kCloseTag = ASCIIToUTF16("</a>");
-  const size_t open_tag_len = kOpenTag.size();
-
-  size_t start_index = translated_text.find(kOpenTag);
-  if (start_index == std::string::npos) {
+  int id = -1;
+  size_t tag_start_index = 0;
+  size_t tag_end_index = 0;
+  if (!FindOpenTagIndex(translated_text, 0, &tag_start_index, &tag_end_index,
+                        &id)) {
     // No magic anchor tag, it was a single chunk.
     text_chunks->push_back(translated_text);
     return;
   }
 
   // The server might send us some HTML with duplicated and unbalanced tags.
-  // We separate from the open tag to the next open tag located after at least
-  // one close tag.
-  while (start_index != std::string::npos) {
-    size_t stop_index =
-        translated_text.find(kCloseTag, start_index + open_tag_len);
-    string16 chunk;
-    if (stop_index == std::string::npos) {
-      // No close tag.  Just report as one chunk.
-      chunk = translated_text;
-      start_index = std::string::npos;  // So we break on next iteration.
+  // We separate from one tag begining to the next, and merge tags with
+  // duplicate IDs.
+  std::set<int> parsed_tags;
+  string16 chunk;
+  while (tag_start_index != std::string::npos) {
+    int next_id = -1;
+    size_t previous_tag_end_index = tag_end_index;
+    if (!FindOpenTagIndex(translated_text, tag_end_index,
+                          &tag_start_index, &tag_end_index, &next_id)) {
+      // Last tag. Just report as one chunk.
+      chunk = translated_text.substr(previous_tag_end_index + 1);
+      tag_start_index = std::string::npos;  // So we break on next iteration.
     } else {
-      // Now find the next open tag after this close tag.
-      stop_index = translated_text.find(kOpenTag, stop_index);
-      if (stop_index != std::string::npos) {
-        chunk = translated_text.substr(start_index, stop_index - start_index);
-        start_index = stop_index;
-      } else {
-        chunk = translated_text.substr(start_index);
-        start_index = std::string::npos;  // So we break on next iteration.
-      }
+      // Extract the text for this tag.
+      DCHECK(tag_start_index > previous_tag_end_index);
+      chunk =
+          translated_text.substr(previous_tag_end_index + 1,
+                                 tag_start_index - previous_tag_end_index - 1);
     }
     chunk = RemoveTag(chunk);
     // The translation server leaves some ampersand character in the
     // translation.
     chunk = UnescapeForHTML(chunk);
-    text_chunks->push_back(RemoveTag(chunk));
+    if (parsed_tags.count(id) > 0) {
+      // We have already seen this tag, add it to the previous text-chunk.
+      text_chunks->back().append(chunk);
+    } else {
+      text_chunks->push_back(chunk);
+      parsed_tags.insert(id);
+    }
+    id = next_id;
   }
 }
 
-// static
 string16 TranslationService::RemoveTag(const string16& text) {
   // Remove any anchor tags, knowing they could be extra/unbalanced tags.
-  const string16 kStartTag(ASCIIToUTF16("<a "));
-  const string16 kEndTag(ASCIIToUTF16("</a>"));
-  const string16 kGreaterThan(ASCIIToUTF16(">"));
-  const string16 kLessThan(ASCIIToUTF16("<"));
-
   string16 result;
-  size_t start_index = text.find(kStartTag);
+  size_t start_index = text.find(kAnchorTagStart);
   if (start_index == std::string::npos) {
     result = text;
   } else {
@@ -579,7 +616,7 @@ string16 TranslationService::RemoveTag(const string16& text) {
       }
       if (start_index > 0 && first_iter)
         result = text.substr(0, start_index);
-      start_index = text.find(kStartTag, start_index + 1);
+      start_index = text.find(kAnchorTagStart, start_index + 1);
       if (start_index == std::string::npos) {
         result += text.substr(stop_index + 1);
         break;
@@ -590,8 +627,7 @@ string16 TranslationService::RemoveTag(const string16& text) {
   }
 
   // Now remove </a> tags.
-  ReplaceSubstringsAfterOffset(&result, 0,
-                               ASCIIToUTF16("</a>"), ASCIIToUTF16(""));
+  ReplaceSubstringsAfterOffset(&result, 0, kClosingAnchorTag, EmptyString16());
   return result;
 }
 
diff --git a/chrome/browser/renderer_host/translation_service.h b/chrome/browser/renderer_host/translation_service.h
index 273a142..666265b 100644
--- a/chrome/browser/renderer_host/translation_service.h
+++ b/chrome/browser/renderer_host/translation_service.h
@@ -130,16 +130,29 @@ class TranslationService : public URLFetcher::Delegate {
   // Merges all text chunks to be translated into a single string that can be
   // sent to the translate server, surrounding each chunk with an anchor tag
   // to preserve chunk order in the translated version.
-  static string16 MergeTextChunks(const TextChunks& text_chunks);
+  string16 MergeTextChunks(const TextChunks& text_chunks);
 
   // Splits the translated text into its original text chunks, removing the
   // anchor tags wrapper that were added to preserve order.
-  static void SplitIntoTextChunks(const string16& translated_text,
-                                  TextChunks* text_chunks);
+  void SplitIntoTextChunks(const string16& translated_text,
+                           TextChunks* text_chunks);
 
   // Removes the HTML anchor tag surrounding |text| and returns the resulting
   // string.
-  static string16 RemoveTag(const string16& text);
+  string16 RemoveTag(const string16& text);
+
+  // Find the next anchor tag in |text| starting at |start_index|.
+  // Sets |id| (which must be non NULL) to the id property of the tag (which is
+  // expected to be an int). Sets |tag_start_index| and |tag_end_index| to the
+  // index of the beginning/end of the next tag.
+  // Returns true if a tag was found and it is not at the end of the string,
+  // false otherwise in which case |id|, |tag_start_index| and |tag_end_index|
+  // are not set.
+  bool FindOpenTagIndex(const string16& text,
+                        size_t start_index,
+                        size_t* tag_start_index,
+                        size_t* tag_end_index,
+                        int* id);
 
   // Adds |text| to the string request in/out param |request|.  If |request| is
   // empty, then the source, target language as well as the secure parameters
@@ -160,6 +173,15 @@ class TranslationService : public URLFetcher::Delegate {
   TranslationRequestMap pending_translation_requests_;
   TranslationRequestMap pending_secure_translation_requests_;
 
+  // Strings used for parsing.
+  const string16 kCRAnchorTagStart;
+  const string16 kAnchorTagStart;
+  const string16 kClosingAnchorTag;
+  const string16 kQuote;
+  const string16 kGreaterThan;
+  const string16 kLessThan;
+  const string16 kQuoteGreaterThan;
+
   // The size taken by the parameters and separators needed when adding text to
   // a request string.
   static size_t text_param_length_;
diff --git a/chrome/browser/renderer_host/translation_service_unittest.cc b/chrome/browser/renderer_host/translation_service_unittest.cc
index 5d18fe9..f973502 100644
--- a/chrome/browser/renderer_host/translation_service_unittest.cc
+++ b/chrome/browser/renderer_host/translation_service_unittest.cc
@@ -154,14 +154,15 @@ static void ExtractQueryStringsFromUploadData(TestURLFetcher* url_fetcher,
 }
 
 TEST_F(TranslationServiceTest, MergeTestChunks) {
+  TranslationService translation_service(NULL);
   std::vector<string16> input;
   input.push_back(ASCIIToUTF16("Hello"));
-  string16 result = TranslationService::MergeTextChunks(input);
+  string16 result = translation_service.MergeTextChunks(input);
   EXPECT_EQ(ASCIIToUTF16("Hello"), result);
   input.push_back(ASCIIToUTF16(" my name"));
   input.push_back(ASCIIToUTF16(" is"));
   input.push_back(ASCIIToUTF16(" Jay."));
-  result = TranslationService::MergeTextChunks(input);
+  result = translation_service.MergeTextChunks(input);
   EXPECT_EQ(ASCIIToUTF16("<a _CR_TR_ id='0'>Hello</a>"
                          "<a _CR_TR_ id='1'> my name</a>"
                          "<a _CR_TR_ id='2'> is</a>"
@@ -179,11 +180,12 @@ TEST_F(TranslationServiceTest, RemoveTag) {
     "", "Hello", "", "  Link ", "Link", "<a link", "broken", "broken bad bad"
   };
 
+  TranslationService translation_service(NULL);
   ASSERT_EQ(arraysize(kInputs), arraysize(kExpected));
   for (size_t i = 0; i < arraysize(kInputs); ++i) {
     SCOPED_TRACE(::testing::Message::Message() << "Iteration " << i);
     string16 input = ASCIIToUTF16(kInputs[i]);
-    string16 output = TranslationService::RemoveTag(input);
+    string16 output = translation_service.RemoveTag(input);
     EXPECT_EQ(ASCIIToUTF16(kExpected[i]), output);
   }
 }
@@ -191,16 +193,18 @@ TEST_F(TranslationServiceTest, RemoveTag) {
 // Tests that we deal correctly with the various results the translation server
 // can return, including the buggy ones.
 TEST_F(TranslationServiceTest, SplitIntoTextChunks) {
+  TranslationService translation_service(NULL);
+
   // Simple case.
   std::vector<string16> text_chunks;
-  TranslationService::SplitIntoTextChunks(ASCIIToUTF16("Hello"), &text_chunks);
+  translation_service.SplitIntoTextChunks(ASCIIToUTF16("Hello"), &text_chunks);
   ASSERT_EQ(1U, text_chunks.size());
   EXPECT_EQ(ASCIIToUTF16("Hello"), text_chunks[0]);
 
   text_chunks.clear();
 
   // Multiple chunks case, correct syntax.
-  TranslationService::SplitIntoTextChunks(
+  translation_service.SplitIntoTextChunks(
       ASCIIToUTF16("<a _CR_TR_ id='0'>Bonjour</a>"
                    "<a _CR_TR_ id='1'> mon nom</a>"
                    "<a _CR_TR_ id='2'> est</a>"
@@ -216,15 +220,58 @@ TEST_F(TranslationServiceTest, SplitIntoTextChunks) {
   // For info, original input:
   // <a _CR_TRANSLATE_ id='0'> Experience </a><a _CR_TRANSLATE_ id='1'>Nexus One
   // </a><a _CR_TRANSLATE_ id='2'>, the new Android phone from Google</a>
-  TranslationService::SplitIntoTextChunks(
+  translation_service.SplitIntoTextChunks(
       ASCIIToUTF16("<a _CR_TR_ id='0'>Experience</a> <a _CR_TR_ id='1'>Nexus"
                    "<a _CR_TR_ id='2'> One,</a></a> <a _CR_TR_ id='2'>the new "
                    "Android Phone</a>"), &text_chunks);
   ASSERT_EQ(3U, text_chunks.size());
   EXPECT_EQ(ASCIIToUTF16("Experience "), text_chunks[0]);
-  EXPECT_EQ(ASCIIToUTF16("Nexus One, "), text_chunks[1]);
-  EXPECT_EQ(ASCIIToUTF16("the new Android Phone"), text_chunks[2]);
+  EXPECT_EQ(ASCIIToUTF16("Nexus"), text_chunks[1]);
+  EXPECT_EQ(ASCIIToUTF16(" One, the new Android Phone"), text_chunks[2]);
+  text_chunks.clear();
+
+  // Other incorrect case:
+  // Original input:
+  // <a _CR_TR_ id='0'>Benzinpreis-</a><a _CR_TR_ id='1'>vergleich</a>
+  translation_service.SplitIntoTextChunks(
+      ASCIIToUTF16("<a _CR_TR_ id='0'>Gasoline <a _CR_TR_ id='1'>"
+                   "price-comparison</a></a>"), &text_chunks);
+  ASSERT_EQ(2U, text_chunks.size());
+  EXPECT_EQ(ASCIIToUTF16("Gasoline "), text_chunks[0]);
+  EXPECT_EQ(ASCIIToUTF16("price-comparison"), text_chunks[1]);
+  text_chunks.clear();
+
+  // Other incorrect case:
+  // Original input:
+  // <a _CR_TR_ id='0'>Bußgeld-</a><a _CR_TR_ id='1'>rechner</a>
+  translation_service.SplitIntoTextChunks(
+      ASCIIToUTF16("<a _CR_TR_ id='1'><a _CR_TR_ id='0'>Fine-computer</a>"
+                   "</a>"), &text_chunks);
+  ASSERT_EQ(2U, text_chunks.size());
+  EXPECT_EQ(ASCIIToUTF16(""), text_chunks[0]);
+  EXPECT_EQ(ASCIIToUTF16("Fine-computer"), text_chunks[1]);
   text_chunks.clear();
+
+  translation_service.SplitIntoTextChunks(
+      ASCIIToUTF16("<a _CR_TR_ id='0'>The mountain live .</a> "
+          "<a _CR_TR_ id='1'>By Philipp Wittrock</a> <a _CR_TR_ id='0'>are</a> "
+          "<a _CR_TR_ id='2'>more ...</a> <a _CR_TR_ id='3'>Video</a> "
+          "<a _CR_TR_ id='4'>Forum</a>"), &text_chunks);
+  ASSERT_EQ(5U, text_chunks.size());
+  EXPECT_EQ(ASCIIToUTF16("The mountain live . "), text_chunks[0]);
+  EXPECT_EQ(ASCIIToUTF16("By Philipp Wittrock are "), text_chunks[1]);
+  EXPECT_EQ(ASCIIToUTF16("more ... "), text_chunks[2]);
+  EXPECT_EQ(ASCIIToUTF16("Video "), text_chunks[3]);
+  EXPECT_EQ(ASCIIToUTF16("Forum"), text_chunks[4]);
+  text_chunks.clear();
+
+  // Make sure we support ending with a start tag.
+  translation_service.SplitIntoTextChunks(
+      ASCIIToUTF16("<a _CR_TR_ id='0'>Hello</a><a _CR_TR_ id='1'>"),
+      &text_chunks);
+  ASSERT_EQ(2U, text_chunks.size());
+  EXPECT_EQ(ASCIIToUTF16("Hello"), text_chunks[0]);
+  EXPECT_EQ(EmptyString16(), text_chunks[1]);
 }
 
 // Tests that a successful translate works as expected.
author	jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-02-12 20:02:02 +0000
committer	jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-02-12 20:02:02 +0000
commit	1be7eadb032256687abda35ae4b2ba4770757c70 (patch)
tree	78f856b4d56b1d908426fc3606a9b9e612c65490
parent	8d7727ac112e452b4daa1161d516401120475d68 (diff)
download	chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.zip chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.tar.gz chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.tar.bz2