Making the parsing of the response received from the translate server

deal with more bad results. The server can send unmatched and duplicated tags. It's paramount for us to get as many text chunks out as we sent in. This is now what we are trying to do when parsing the response. BUG=34854 TEST=Run the unit-tests. Reproduce steps in bug. Review URL: http://codereview.chromium.org/603037 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@38925 0039d316-1c4b-4281-b951-d872f2087c98
author: jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-02-12 20:02:02 +0000
committer: jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-02-12 20:02:02 +0000
commit: 1be7eadb032256687abda35ae4b2ba4770757c70 (patch)
tree: 78f856b4d56b1d908426fc3606a9b9e612c65490 /chrome/browser/renderer_host/translation_service.cc
parent: 8d7727ac112e452b4daa1161d516401120475d68 (diff)
download: chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.zip
chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.tar.gz
chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.tar.bz2
1 files changed, 78 insertions, 42 deletions
diff --git a/chrome/browser/renderer_host/translation_service.cc b/chrome/browser/renderer_host/translation_service.cc
index 6ddef41..3413d21 100644
--- a/chrome/browser/renderer_host/translation_service.cc
+++ b/chrome/browser/renderer_host/translation_service.cc
@@ -217,7 +217,14 @@ void SendTranslationRequestTask::Cancel() {
 // TranslationService, public:
 
 TranslationService::TranslationService(IPC::Message::Sender* message_sender)
-    : message_sender_(message_sender) {
+    : message_sender_(message_sender),
+      kCRAnchorTagStart(ASCIIToUTF16("<a _CR_TR_ id='")),
+      kAnchorTagStart(ASCIIToUTF16("<a ")),
+      kClosingAnchorTag(ASCIIToUTF16("</a>")),
+      kQuote(ASCIIToUTF16("'")),
+      kGreaterThan(ASCIIToUTF16(">")),
+      kLessThan(ASCIIToUTF16("<")),
+      kQuoteGreaterThan(ASCIIToUTF16("'>")) {
 }
 
 TranslationService::~TranslationService() {
@@ -490,7 +497,6 @@ void TranslationService::TranslationFailed(const URLFetcher* url_fetcher) {
   SendResponseToRenderer(url_fetcher, 1, TranslationService::TextChunksList());
 }
 
-// static
 string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) {
   // If there is only 1 chunk, we don't need an anchor tag as there is no order
   // to preserve.
@@ -499,69 +505,100 @@ string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) {
 
   string16 str;
   for (size_t i = 0; i < text_chunks.size(); ++i) {
-    str.append(ASCIIToUTF16("<a _CR_TR_ id='"));
+    str.append(kCRAnchorTagStart);
     str.append(IntToString16(i));
-    str.append(ASCIIToUTF16("'>"));
+    str.append(kQuoteGreaterThan);
     str.append(text_chunks[i]);
-    str.append(ASCIIToUTF16("</a>"));
+    str.append(kClosingAnchorTag);
   }
   return str;
 }
 
-// static
+bool TranslationService::FindOpenTagIndex(const string16& text,
+                                          size_t start_index,
+                                          size_t* tag_start_index,
+                                          size_t* tag_end_index,
+                                          int* id) {
+  DCHECK(tag_start_index && tag_end_index && id);
+  size_t text_length = text.length();
+  if (start_index >= text_length)
+    return false;
+
+  *tag_start_index = text.find(kCRAnchorTagStart, start_index);
+  if (*tag_start_index == std::string::npos)
+    return false;
+
+  size_t quote_index = *tag_start_index + kCRAnchorTagStart.length();
+  size_t close_quote_index = text.find(kQuote, quote_index);
+  if (close_quote_index == std::string::npos) {
+    NOTREACHED();
+    return false;  // Not a valid anchor tag.
+  }
+
+  string16 id_str = text.substr(quote_index, close_quote_index - quote_index);
+  // Get the id.
+  if (!StringToInt(id_str, id)) {
+    NOTREACHED();
+    return false;  // Not a valid id, give up.
+  }
+
+  *tag_end_index = text.find(kGreaterThan, close_quote_index);
+  if (*tag_end_index == std::string::npos || *tag_end_index >= text_length)
+    return false;
+  return true;
+}
+
 void TranslationService::SplitIntoTextChunks(const string16& translated_text,
                                              TextChunks* text_chunks) {
-  const string16 kOpenTag = ASCIIToUTF16("<a _CR_TR_ ");
-  const string16 kCloseTag = ASCIIToUTF16("</a>");
-  const size_t open_tag_len = kOpenTag.size();
-
-  size_t start_index = translated_text.find(kOpenTag);
-  if (start_index == std::string::npos) {
+  int id = -1;
+  size_t tag_start_index = 0;
+  size_t tag_end_index = 0;
+  if (!FindOpenTagIndex(translated_text, 0, &tag_start_index, &tag_end_index,
+                        &id)) {
     // No magic anchor tag, it was a single chunk.
     text_chunks->push_back(translated_text);
     return;
   }
 
   // The server might send us some HTML with duplicated and unbalanced tags.
-  // We separate from the open tag to the next open tag located after at least
-  // one close tag.
-  while (start_index != std::string::npos) {
-    size_t stop_index =
-        translated_text.find(kCloseTag, start_index + open_tag_len);
-    string16 chunk;
-    if (stop_index == std::string::npos) {
-      // No close tag.  Just report as one chunk.
-      chunk = translated_text;
-      start_index = std::string::npos;  // So we break on next iteration.
+  // We separate from one tag beginning to the next, and merge tags with
+  // duplicate IDs.
+  std::set<int> parsed_tags;
+  string16 chunk;
+  while (tag_start_index != std::string::npos) {
+    int next_id = -1;
+    size_t previous_tag_end_index = tag_end_index;
+    if (!FindOpenTagIndex(translated_text, tag_end_index,
+                          &tag_start_index, &tag_end_index, &next_id)) {
+      // Last tag. Just report as one chunk.
+      chunk = translated_text.substr(previous_tag_end_index + 1);
+      tag_start_index = std::string::npos;  // So we break on next iteration.
     } else {
-      // Now find the next open tag after this close tag.
-      stop_index = translated_text.find(kOpenTag, stop_index);
-      if (stop_index != std::string::npos) {
-        chunk = translated_text.substr(start_index, stop_index - start_index);
-        start_index = stop_index;
-      } else {
-        chunk = translated_text.substr(start_index);
-        start_index = std::string::npos;  // So we break on next iteration.
-      }
+      // Extract the text for this tag.
+      DCHECK(tag_start_index > previous_tag_end_index);
+      chunk =
+          translated_text.substr(previous_tag_end_index + 1,
+                                 tag_start_index - previous_tag_end_index - 1);
     }
     chunk = RemoveTag(chunk);
     // The translation server leaves some ampersand character in the
     // translation.
     chunk = UnescapeForHTML(chunk);
-    text_chunks->push_back(RemoveTag(chunk));
+    if (parsed_tags.count(id) > 0) {
+      // We have already seen this tag, add it to the previous text-chunk.
+      text_chunks->back().append(chunk);
+    } else {
+      text_chunks->push_back(chunk);
+      parsed_tags.insert(id);
+    }
+    id = next_id;
   }
 }
 
-// static
 string16 TranslationService::RemoveTag(const string16& text) {
   // Remove any anchor tags, knowing they could be extra/unbalanced tags.
-  const string16 kStartTag(ASCIIToUTF16("<a "));
-  const string16 kEndTag(ASCIIToUTF16("</a>"));
-  const string16 kGreaterThan(ASCIIToUTF16(">"));
-  const string16 kLessThan(ASCIIToUTF16("<"));
-
   string16 result;
-  size_t start_index = text.find(kStartTag);
+  size_t start_index = text.find(kAnchorTagStart);
   if (start_index == std::string::npos) {
     result = text;
   } else {
@@ -579,7 +616,7 @@ string16 TranslationService::RemoveTag(const string16& text) {
       }
       if (start_index > 0 && first_iter)
         result = text.substr(0, start_index);
-      start_index = text.find(kStartTag, start_index + 1);
+      start_index = text.find(kAnchorTagStart, start_index + 1);
       if (start_index == std::string::npos) {
         result += text.substr(stop_index + 1);
         break;
@@ -590,8 +627,7 @@ string16 TranslationService::RemoveTag(const string16& text) {
   }
 
   // Now remove </a> tags.
-  ReplaceSubstringsAfterOffset(&result, 0,
-                               ASCIIToUTF16("</a>"), ASCIIToUTF16(""));
+  ReplaceSubstringsAfterOffset(&result, 0, kClosingAnchorTag, EmptyString16());
   return result;
 }
author	jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-02-12 20:02:02 +0000
committer	jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-02-12 20:02:02 +0000
commit	1be7eadb032256687abda35ae4b2ba4770757c70 (patch)
tree	78f856b4d56b1d908426fc3606a9b9e612c65490 /chrome/browser/renderer_host/translation_service.cc
parent	8d7727ac112e452b4daa1161d516401120475d68 (diff)
download	chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.zip chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.tar.gz chromium_src-1be7eadb032256687abda35ae4b2ba4770757c70.tar.bz2