summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- chrome/browser/renderer_host/translation_service.cc 120
-rw-r--r-- chrome/browser/renderer_host/translation_service.h 30
-rw-r--r-- chrome/browser/renderer_host/translation_service_unittest.cc 63
3 files changed, 159 insertions, 54 deletions
diff --git a/chrome/browser/renderer_host/translation_service.cc b/chrome/browser/renderer_host/translation_service.cc
index 6ddef41..3413d21 100644
--- a/chrome/browser/renderer_host/translation_service.cc
+++ b/chrome/browser/renderer_host/translation_service.cc
@@ -217,7 +217,14 @@ void SendTranslationRequestTask::Cancel() {
// TranslationService, public:
TranslationService::TranslationService(IPC::Message::Sender* message_sender)
- : message_sender_(message_sender) {
+ : message_sender_(message_sender),
+ kCRAnchorTagStart(ASCIIToUTF16("<a _CR_TR_ id='")),
+ kAnchorTagStart(ASCIIToUTF16("<a ")),
+ kClosingAnchorTag(ASCIIToUTF16("</a>")),
+ kQuote(ASCIIToUTF16("'")),
+ kGreaterThan(ASCIIToUTF16(">")),
+ kLessThan(ASCIIToUTF16("<")),
+ kQuoteGreaterThan(ASCIIToUTF16("'>")) {
}
TranslationService::~TranslationService() {
@@ -490,7 +497,6 @@ void TranslationService::TranslationFailed(const URLFetcher* url_fetcher) {
SendResponseToRenderer(url_fetcher, 1, TranslationService::TextChunksList());
}
-// static
string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) {
// If there is only 1 chunk, we don't need an anchor tag as there is no order
// to preserve.
@@ -499,69 +505,100 @@ string16 TranslationService::MergeTextChunks(const TextChunks& text_chunks) {
string16 str;
for (size_t i = 0; i < text_chunks.size(); ++i) {
- str.append(ASCIIToUTF16("<a _CR_TR_ id='"));
+ str.append(kCRAnchorTagStart);
str.append(IntToString16(i));
- str.append(ASCIIToUTF16("'>"));
+ str.append(kQuoteGreaterThan);
str.append(text_chunks[i]);
- str.append(ASCIIToUTF16("</a>"));
+ str.append(kClosingAnchorTag);
}
return str;
}
-// static
+bool TranslationService::FindOpenTagIndex(const string16& text,
+ size_t start_index,
+ size_t* tag_start_index,
+ size_t* tag_end_index,
+ int* id) {
+ DCHECK(tag_start_index && tag_end_index && id);
+ size_t text_length = text.length();
+ if (start_index >= text_length)
+ return false;
+
+ *tag_start_index = text.find(kCRAnchorTagStart, start_index);
+ if (*tag_start_index == std::string::npos)
+ return false;
+
+ size_t quote_index = *tag_start_index + kCRAnchorTagStart.length();
+ size_t close_quote_index = text.find(kQuote, quote_index);
+ if (close_quote_index == std::string::npos) {
+ NOTREACHED();
+ return false; // Not a valid anchor tag.
+ }
+
+ string16 id_str = text.substr(quote_index, close_quote_index - quote_index);
+ // Get the id.
+ if (!StringToInt(id_str, id)) {
+ NOTREACHED();
+ return false; // Not a valid id, give up.
+ }
+
+ *tag_end_index = text.find(kGreaterThan, close_quote_index);
+ if (*tag_end_index == std::string::npos || *tag_end_index >= text_length)
+ return false;
+ return true;
+}
+
void TranslationService::SplitIntoTextChunks(const string16& translated_text,
TextChunks* text_chunks) {
- const string16 kOpenTag = ASCIIToUTF16("<a _CR_TR_ ");
- const string16 kCloseTag = ASCIIToUTF16("</a>");
- const size_t open_tag_len = kOpenTag.size();
-
- size_t start_index = translated_text.find(kOpenTag);
- if (start_index == std::string::npos) {
+ int id = -1;
+ size_t tag_start_index = 0;
+ size_t tag_end_index = 0;
+ if (!FindOpenTagIndex(translated_text, 0, &tag_start_index, &tag_end_index,
+ &id)) {
// No magic anchor tag, it was a single chunk.
text_chunks->push_back(translated_text);
return;
}
// The server might send us some HTML with duplicated and unbalanced tags.
- // We separate from the open tag to the next open tag located after at least
- // one close tag.
- while (start_index != std::string::npos) {
- size_t stop_index =
- translated_text.find(kCloseTag, start_index + open_tag_len);
- string16 chunk;
- if (stop_index == std::string::npos) {
- // No close tag. Just report as one chunk.
- chunk = translated_text;
- start_index = std::string::npos; // So we break on next iteration.
+ // We separate from one tag beginning to the next, and merge tags with
+ // duplicate IDs.
+ std::set<int> parsed_tags;
+ string16 chunk;
+ while (tag_start_index != std::string::npos) {
+ int next_id = -1;
+ size_t previous_tag_end_index = tag_end_index;
+ if (!FindOpenTagIndex(translated_text, tag_end_index,
+ &tag_start_index, &tag_end_index, &next_id)) {
+ // Last tag. Just report as one chunk.
+ chunk = translated_text.substr(previous_tag_end_index + 1);
+ tag_start_index = std::string::npos; // So we break on next iteration.
} else {
- // Now find the next open tag after this close tag.
- stop_index = translated_text.find(kOpenTag, stop_index);
- if (stop_index != std::string::npos) {
- chunk = translated_text.substr(start_index, stop_index - start_index);
- start_index = stop_index;
- } else {
- chunk = translated_text.substr(start_index);
- start_index = std::string::npos; // So we break on next iteration.
- }
+ // Extract the text for this tag.
+ DCHECK(tag_start_index > previous_tag_end_index);
+ chunk =
+ translated_text.substr(previous_tag_end_index + 1,
+ tag_start_index - previous_tag_end_index - 1);
}
chunk = RemoveTag(chunk);
// The translation server leaves some ampersand characters in the
// translation.
chunk = UnescapeForHTML(chunk);
- text_chunks->push_back(RemoveTag(chunk));
+ if (parsed_tags.count(id) > 0) {
+ // We have already seen this tag, add it to the previous text-chunk.
+ text_chunks->back().append(chunk);
+ } else {
+ text_chunks->push_back(chunk);
+ parsed_tags.insert(id);
+ }
+ id = next_id;
}
}
-// static
string16 TranslationService::RemoveTag(const string16& text) {
// Remove any anchor tags, knowing they could be extra/unbalanced tags.
- const string16 kStartTag(ASCIIToUTF16("<a "));
- const string16 kEndTag(ASCIIToUTF16("</a>"));
- const string16 kGreaterThan(ASCIIToUTF16(">"));
- const string16 kLessThan(ASCIIToUTF16("<"));
-
string16 result;
- size_t start_index = text.find(kStartTag);
+ size_t start_index = text.find(kAnchorTagStart);
if (start_index == std::string::npos) {
result = text;
} else {
@@ -579,7 +616,7 @@ string16 TranslationService::RemoveTag(const string16& text) {
}
if (start_index > 0 && first_iter)
result = text.substr(0, start_index);
- start_index = text.find(kStartTag, start_index + 1);
+ start_index = text.find(kAnchorTagStart, start_index + 1);
if (start_index == std::string::npos) {
result += text.substr(stop_index + 1);
break;
@@ -590,8 +627,7 @@ string16 TranslationService::RemoveTag(const string16& text) {
}
// Now remove </a> tags.
- ReplaceSubstringsAfterOffset(&result, 0,
- ASCIIToUTF16("</a>"), ASCIIToUTF16(""));
+ ReplaceSubstringsAfterOffset(&result, 0, kClosingAnchorTag, EmptyString16());
return result;
}
diff --git a/chrome/browser/renderer_host/translation_service.h b/chrome/browser/renderer_host/translation_service.h
index 273a142..666265b 100644
--- a/chrome/browser/renderer_host/translation_service.h
+++ b/chrome/browser/renderer_host/translation_service.h
@@ -130,16 +130,29 @@ class TranslationService : public URLFetcher::Delegate {
// Merges all text chunks to be translated into a single string that can be
// sent to the translate server, surrounding each chunk with an anchor tag
// to preserve chunk order in the translated version.
- static string16 MergeTextChunks(const TextChunks& text_chunks);
+ string16 MergeTextChunks(const TextChunks& text_chunks);
// Splits the translated text into its original text chunks, removing the
// anchor tag wrappers that were added to preserve order.
- static void SplitIntoTextChunks(const string16& translated_text,
- TextChunks* text_chunks);
+ void SplitIntoTextChunks(const string16& translated_text,
+ TextChunks* text_chunks);
// Removes the HTML anchor tag surrounding |text| and returns the resulting
// string.
- static string16 RemoveTag(const string16& text);
+ string16 RemoveTag(const string16& text);
+
+ // Finds the next anchor tag in |text| starting at |start_index|.
+ // Sets |id| (which must be non-NULL) to the id property of the tag (which is
+ // expected to be an int). Sets |tag_start_index| and |tag_end_index| to the
+ // index of the beginning/end of the next tag.
+ // Returns true if a tag was found and it is not at the end of the string,
+ // false otherwise, in which case the values of |id|, |tag_start_index| and
+ // |tag_end_index| should not be relied upon.
+ bool FindOpenTagIndex(const string16& text,
+ size_t start_index,
+ size_t* tag_start_index,
+ size_t* tag_end_index,
+ int* id);
// Adds |text| to the string request in/out param |request|. If |request| is
// empty, then the source, target language as well as the secure parameters
@@ -160,6 +173,15 @@ class TranslationService : public URLFetcher::Delegate {
TranslationRequestMap pending_translation_requests_;
TranslationRequestMap pending_secure_translation_requests_;
+ // Strings used for parsing.
+ const string16 kCRAnchorTagStart;
+ const string16 kAnchorTagStart;
+ const string16 kClosingAnchorTag;
+ const string16 kQuote;
+ const string16 kGreaterThan;
+ const string16 kLessThan;
+ const string16 kQuoteGreaterThan;
+
// The size taken by the parameters and separators needed when adding text to
// a request string.
static size_t text_param_length_;
diff --git a/chrome/browser/renderer_host/translation_service_unittest.cc b/chrome/browser/renderer_host/translation_service_unittest.cc
index 5d18fe9..f973502 100644
--- a/chrome/browser/renderer_host/translation_service_unittest.cc
+++ b/chrome/browser/renderer_host/translation_service_unittest.cc
@@ -154,14 +154,15 @@ static void ExtractQueryStringsFromUploadData(TestURLFetcher* url_fetcher,
}
TEST_F(TranslationServiceTest, MergeTestChunks) {
+ TranslationService translation_service(NULL);
std::vector<string16> input;
input.push_back(ASCIIToUTF16("Hello"));
- string16 result = TranslationService::MergeTextChunks(input);
+ string16 result = translation_service.MergeTextChunks(input);
EXPECT_EQ(ASCIIToUTF16("Hello"), result);
input.push_back(ASCIIToUTF16(" my name"));
input.push_back(ASCIIToUTF16(" is"));
input.push_back(ASCIIToUTF16(" Jay."));
- result = TranslationService::MergeTextChunks(input);
+ result = translation_service.MergeTextChunks(input);
EXPECT_EQ(ASCIIToUTF16("<a _CR_TR_ id='0'>Hello</a>"
"<a _CR_TR_ id='1'> my name</a>"
"<a _CR_TR_ id='2'> is</a>"
@@ -179,11 +180,12 @@ TEST_F(TranslationServiceTest, RemoveTag) {
"", "Hello", "", " Link ", "Link", "<a link", "broken", "broken bad bad"
};
+ TranslationService translation_service(NULL);
ASSERT_EQ(arraysize(kInputs), arraysize(kExpected));
for (size_t i = 0; i < arraysize(kInputs); ++i) {
SCOPED_TRACE(::testing::Message::Message() << "Iteration " << i);
string16 input = ASCIIToUTF16(kInputs[i]);
- string16 output = TranslationService::RemoveTag(input);
+ string16 output = translation_service.RemoveTag(input);
EXPECT_EQ(ASCIIToUTF16(kExpected[i]), output);
}
}
@@ -191,16 +193,18 @@ TEST_F(TranslationServiceTest, RemoveTag) {
// Tests that we deal correctly with the various results the translation server
// can return, including the buggy ones.
TEST_F(TranslationServiceTest, SplitIntoTextChunks) {
+ TranslationService translation_service(NULL);
+
// Simple case.
std::vector<string16> text_chunks;
- TranslationService::SplitIntoTextChunks(ASCIIToUTF16("Hello"), &text_chunks);
+ translation_service.SplitIntoTextChunks(ASCIIToUTF16("Hello"), &text_chunks);
ASSERT_EQ(1U, text_chunks.size());
EXPECT_EQ(ASCIIToUTF16("Hello"), text_chunks[0]);
text_chunks.clear();
// Multiple chunks case, correct syntax.
- TranslationService::SplitIntoTextChunks(
+ translation_service.SplitIntoTextChunks(
ASCIIToUTF16("<a _CR_TR_ id='0'>Bonjour</a>"
"<a _CR_TR_ id='1'> mon nom</a>"
"<a _CR_TR_ id='2'> est</a>"
@@ -216,15 +220,58 @@ TEST_F(TranslationServiceTest, SplitIntoTextChunks) {
// For info, original input:
// <a _CR_TRANSLATE_ id='0'> Experience </a><a _CR_TRANSLATE_ id='1'>Nexus One
// </a><a _CR_TRANSLATE_ id='2'>, the new Android phone from Google</a>
- TranslationService::SplitIntoTextChunks(
+ translation_service.SplitIntoTextChunks(
ASCIIToUTF16("<a _CR_TR_ id='0'>Experience</a> <a _CR_TR_ id='1'>Nexus"
"<a _CR_TR_ id='2'> One,</a></a> <a _CR_TR_ id='2'>the new "
"Android Phone</a>"), &text_chunks);
ASSERT_EQ(3U, text_chunks.size());
EXPECT_EQ(ASCIIToUTF16("Experience "), text_chunks[0]);
- EXPECT_EQ(ASCIIToUTF16("Nexus One, "), text_chunks[1]);
- EXPECT_EQ(ASCIIToUTF16("the new Android Phone"), text_chunks[2]);
+ EXPECT_EQ(ASCIIToUTF16("Nexus"), text_chunks[1]);
+ EXPECT_EQ(ASCIIToUTF16(" One, the new Android Phone"), text_chunks[2]);
+ text_chunks.clear();
+
+ // Other incorrect case:
+ // Original input:
+ // <a _CR_TR_ id='0'>Benzinpreis-</a><a _CR_TR_ id='1'>vergleich</a>
+ translation_service.SplitIntoTextChunks(
+ ASCIIToUTF16("<a _CR_TR_ id='0'>Gasoline <a _CR_TR_ id='1'>"
+ "price-comparison</a></a>"), &text_chunks);
+ ASSERT_EQ(2U, text_chunks.size());
+ EXPECT_EQ(ASCIIToUTF16("Gasoline "), text_chunks[0]);
+ EXPECT_EQ(ASCIIToUTF16("price-comparison"), text_chunks[1]);
+ text_chunks.clear();
+
+ // Other incorrect case:
+ // Original input:
+ // <a _CR_TR_ id='0'>Bußgeld-</a><a _CR_TR_ id='1'>rechner</a>
+ translation_service.SplitIntoTextChunks(
+ ASCIIToUTF16("<a _CR_TR_ id='1'><a _CR_TR_ id='0'>Fine-computer</a>"
+ "</a>"), &text_chunks);
+ ASSERT_EQ(2U, text_chunks.size());
+ EXPECT_EQ(ASCIIToUTF16(""), text_chunks[0]);
+ EXPECT_EQ(ASCIIToUTF16("Fine-computer"), text_chunks[1]);
text_chunks.clear();
+
+ translation_service.SplitIntoTextChunks(
+ ASCIIToUTF16("<a _CR_TR_ id='0'>The mountain live .</a> "
+ "<a _CR_TR_ id='1'>By Philipp Wittrock</a> <a _CR_TR_ id='0'>are</a> "
+ "<a _CR_TR_ id='2'>more ...</a> <a _CR_TR_ id='3'>Video</a> "
+ "<a _CR_TR_ id='4'>Forum</a>"), &text_chunks);
+ ASSERT_EQ(5U, text_chunks.size());
+ EXPECT_EQ(ASCIIToUTF16("The mountain live . "), text_chunks[0]);
+ EXPECT_EQ(ASCIIToUTF16("By Philipp Wittrock are "), text_chunks[1]);
+ EXPECT_EQ(ASCIIToUTF16("more ... "), text_chunks[2]);
+ EXPECT_EQ(ASCIIToUTF16("Video "), text_chunks[3]);
+ EXPECT_EQ(ASCIIToUTF16("Forum"), text_chunks[4]);
+ text_chunks.clear();
+
+ // Make sure we support ending with a start tag.
+ translation_service.SplitIntoTextChunks(
+ ASCIIToUTF16("<a _CR_TR_ id='0'>Hello</a><a _CR_TR_ id='1'>"),
+ &text_chunks);
+ ASSERT_EQ(2U, text_chunks.size());
+ EXPECT_EQ(ASCIIToUTF16("Hello"), text_chunks[0]);
+ EXPECT_EQ(EmptyString16(), text_chunks[1]);
}
// Tests that a successful translate works as expected.