Merge 32199 - [Mac/Linux] Fix WordIterator::GetWord() for UTF16 surrogate pairs.

For systems where !defined(WCHAR_T_IS_UTF16), WordIterator transforms the input into UChar data. But GetWord() was using the resulting offsets as indices into the original string, so it could return incorrect data and/or attempt to index off the end of the input. This changes GetWord() to do the inverse conversion from Init() for those systems. BUG=27698 TEST=See bug. Review URL: http://codereview.chromium.org/399010 TBR=shess@chromium.org Review URL: http://codereview.chromium.org/414001 git-svn-id: svn://svn.chromium.org/chrome/branches/249/src@32527 0039d316-1c4b-4281-b951-d872f2087c98
author: shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-11-19 19:07:20 +0000
committer: shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-11-19 19:07:20 +0000
commit: 896f8820356938ebf722c036610347d504c1c8d0 (patch)
tree: 72a296654c828d5e80ff06a58c09893c4a4df129 /base
parent: 4961f1e508749fd6e6b8a8376ee047c57468b81b (diff)
download: chromium_src-896f8820356938ebf722c036610347d504c1c8d0.zip
chromium_src-896f8820356938ebf722c036610347d504c1c8d0.tar.gz
chromium_src-896f8820356938ebf722c036610347d504c1c8d0.tar.bz2
2 files changed, 63 insertions, 0 deletions
diff --git a/base/i18n/word_iterator.cc b/base/i18n/word_iterator.cc
index 45a06b9..feb77eb 100644
--- a/base/i18n/word_iterator.cc
+++ b/base/i18n/word_iterator.cc
@@ -83,5 +83,19 @@ bool WordIterator::IsWord() const {
 
 std::wstring WordIterator::GetWord() const {
   DCHECK(prev_ != npos && pos_ != npos);
+#if defined(WCHAR_T_IS_UTF16)
   return string_.substr(prev_, pos_ - prev_);
+#else  // WCHAR_T_IS_UTF16
+  // See comment in Init().  If there are no surrogate pairs,
+  // |out_length| will be exactly |in_length|, if there are surrogate
+  // pairs it will be less than |in_length|.
+  int32_t out_length;
+  UErrorCode error = U_ZERO_ERROR;
+  const int32_t in_length = pos_ - prev_;
+  std::vector<std::wstring::value_type> out_buffer(in_length);
+  u_strToWCS(&out_buffer[0], in_length, &out_length,
+             &chars_[prev_], in_length, &error);
+  DCHECK_LE(out_length, in_length);
+  return std::wstring(&out_buffer[0], out_length);
+#endif
 }
diff --git a/base/i18n/word_iterator_unittest.cc b/base/i18n/word_iterator_unittest.cc
index 0d28370..d653e1d4 100644
--- a/base/i18n/word_iterator_unittest.cc
+++ b/base/i18n/word_iterator_unittest.cc
@@ -4,6 +4,9 @@
 
 #include "base/i18n/word_iterator.h"
 
+#include "base/string_piece.h"
+#include "base/string_util.h"
+#include "base/sys_string_conversions.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
 TEST(WordIteratorTest, BreakWord) {
@@ -66,3 +69,49 @@ TEST(WordIteratorTest, BreakLine) {
   EXPECT_FALSE(iter.Advance());
   EXPECT_FALSE(iter.IsWord());
 }
+
+TEST(WordIteratorTest, BreakWide16) {
+  //  "Παγκόσμιος Ιστός"
+  const std::wstring str(L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
+                         L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2");
+  const std::wstring word1(str.substr(0, 10));
+  const std::wstring word2(str.substr(11, 5));
+  WordIterator iter(str, WordIterator::BREAK_WORD);
+  ASSERT_TRUE(iter.Init());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_TRUE(iter.IsWord());
+  EXPECT_EQ(word1, iter.GetWord());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_FALSE(iter.IsWord());
+  EXPECT_EQ(L" ", iter.GetWord());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_TRUE(iter.IsWord());
+  EXPECT_EQ(word2, iter.GetWord());
+  EXPECT_FALSE(iter.Advance());
+  EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(WordIteratorTest, BreakWide32) {
+  // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
+  const char *very_wide_char = "\xF0\x9D\x92\x9C";
+  const std::wstring str(
+      base::SysUTF8ToWide(StringPrintf("%s a", very_wide_char)));
+#if defined(WCHAR_T_IS_UTF16)
+  const std::wstring very_wide_word(str.substr(0, 2));
+#elif defined(WCHAR_T_IS_UTF32)
+  const std::wstring very_wide_word(str.substr(0, 1));
+#endif
+  WordIterator iter(str, WordIterator::BREAK_WORD);
+  ASSERT_TRUE(iter.Init());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_TRUE(iter.IsWord());
+  EXPECT_EQ(very_wide_word, iter.GetWord());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_FALSE(iter.IsWord());
+  EXPECT_EQ(L" ", iter.GetWord());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_TRUE(iter.IsWord());
+  EXPECT_EQ(L"a", iter.GetWord());
+  EXPECT_FALSE(iter.Advance());
+  EXPECT_FALSE(iter.IsWord());
+}
author	shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-11-19 19:07:20 +0000
committer	shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-11-19 19:07:20 +0000
commit	896f8820356938ebf722c036610347d504c1c8d0 (patch)
tree	72a296654c828d5e80ff06a58c09893c4a4df129 /base
parent	4961f1e508749fd6e6b8a8376ee047c57468b81b (diff)
download	chromium_src-896f8820356938ebf722c036610347d504c1c8d0.zip chromium_src-896f8820356938ebf722c036610347d504c1c8d0.tar.gz chromium_src-896f8820356938ebf722c036610347d504c1c8d0.tar.bz2