Fix |ElideText()| to handle UTF16 surrogate pairs correctly.

BUG=107703 TEST=New unit test in text_elider_unittest.cc. Review URL: http://codereview.chromium.org/8910018 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@114705 0039d316-1c4b-4281-b951-d872f2087c98
author: asvitkine@chromium.org <asvitkine@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-12-15 22:09:20 +0000
committer: asvitkine@chromium.org <asvitkine@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-12-15 22:09:20 +0000
commit: da06e3089a9649187c7c93e115376c9b23c96cb7 (patch)
tree: 696eac9ae9561add157c724197966dbc74aeedaf /ui
parent: 862deef098a8f161cb9926363bd4786f83fdbdc9 (diff)
download: chromium_src-da06e3089a9649187c7c93e115376c9b23c96cb7.zip
chromium_src-da06e3089a9649187c7c93e115376c9b23c96cb7.tar.gz
chromium_src-da06e3089a9649187c7c93e115376c9b23c96cb7.tar.bz2
2 files changed, 115 insertions, 26 deletions
diff --git a/ui/base/text/text_elider.cc b/ui/base/text/text_elider.cc
index 13ee279..08435a2 100644
--- a/ui/base/text/text_elider.cc
+++ b/ui/base/text/text_elider.cc
@@ -32,26 +32,66 @@ const char16 kForwardSlash = '/';
 
 namespace {
 
-// Cuts |text| to be |length| characters long.  If |cut_in_middle| is true, the
-// middle of the string is removed to leave equal-length pieces from the
-// beginning and end of the string; otherwise, the end of the string is removed
-// and only the beginning remains.  If |insert_ellipsis| is true, then an
-// ellipsis character will by inserted at the cut point.
-string16 CutString(const string16& text,
-                   size_t length,
-                   bool cut_in_middle,
-                   bool insert_ellipsis) {
-  // TODO(tony): This is wrong, it might split the string in the middle of a
-  // surrogate pair.
-  const string16 kInsert = insert_ellipsis ? UTF8ToUTF16(kEllipsis) :
-                                             string16();
-  if (!cut_in_middle)
-    return text.substr(0, length) + kInsert;
-  // We put the extra character, if any, before the cut.
-  const size_t half_length = length / 2;
-  return text.substr(0, length - half_length) + kInsert +
-      text.substr(text.length() - half_length, half_length);
-}
+// Helper class to split + elide text, while respecting UTF16 surrogate pairs.
+class StringSlicer {
+ public:
+  StringSlicer(const string16& text,
+               const string16& ellipsis,
+               bool elide_in_middle)
+      : text_(text),
+        ellipsis_(ellipsis),
+        elide_in_middle_(elide_in_middle) {
+  }
+
+  // Cuts |text_| to at most |length| UTF-16 code units without splitting a
+  // surrogate pair.  If |elide_in_middle_| is true, the middle of the string
+  // is removed to leave pieces from the beginning and end of the string;
+  // otherwise, the end is removed and only the beginning remains.  If
+  // |insert_ellipsis| is true, |ellipsis_| is inserted at the cut point.
+  string16 CutString(size_t length, bool insert_ellipsis) const {
+    const string16 kInsert = insert_ellipsis ? ellipsis_ : string16();
+
+    if (!elide_in_middle_)
+      return text_.substr(0, FindValidBoundaryBefore(length)) + kInsert;
+
+    // We put the extra character, if any, before the cut.
+    size_t half_length = length / 2;
+    size_t prefix_length = FindValidBoundaryBefore(length - half_length);
+    // The suffix runs from the first valid boundary at or after the nominal
+    // start to the end of |text_|, so it shrinks if a pair would be split.
+    size_t suffix_start = FindValidBoundaryAfter(text_.length() - half_length);
+    return text_.substr(0, prefix_length) + kInsert +
+           text_.substr(suffix_start);
+  }
+
+ private:
+  // Returns a valid cut boundary at or before |index|.
+  size_t FindValidBoundaryBefore(size_t index) const {
+    DCHECK_LE(index, text_.length());
+    if (index != text_.length())
+      U16_SET_CP_START(text_.data(), 0, index);
+    return index;
+  }
+
+  // Returns a valid cut boundary at or after |index|.
+  size_t FindValidBoundaryAfter(size_t index) const {
+    DCHECK_LE(index, text_.length());
+    if (index != text_.length())
+      U16_SET_CP_LIMIT(text_.data(), 0, index, text_.length());
+    return index;
+  }
+
+  // The text to be sliced (not owned; must outlive this object).
+  const string16& text_;
+
+  // Ellipsis string to use (not owned; must outlive this object).
+  const string16& ellipsis_;
+
+  // If true, the middle of the string will be elided.
+  bool elide_in_middle_;
+
+  DISALLOW_COPY_AND_ASSIGN(StringSlicer);
+};
 
 // Build a path from the first |num_components| elements in |path_elements|.
 // Prepends |path_prefix|, appends |filename|, inserts ellipsis if appropriate.
@@ -350,10 +390,14 @@ string16 ElideText(const string16& text,
   if (text.empty())
     return text;
 
+  const string16 kEllipsisUTF16 = UTF8ToUTF16(kEllipsis);
+
   int current_text_pixel_width = font.GetStringWidth(text);
   bool elide_in_middle = (elide_behavior == ui::ELIDE_IN_MIDDLE);
   bool insert_ellipsis = (elide_behavior != ui::TRUNCATE_AT_END);
 
+  StringSlicer slicer(text, kEllipsisUTF16, elide_in_middle);
+
   // Pango will return 0 width for absurdly long strings. Cut the string in
   // half and try again.
   // This is caused by an int overflow in Pango (specifically, in
@@ -363,14 +407,14 @@ string16 ElideText(const string16& text,
   // (eliding way too much from a ridiculous string is probably still
   // ridiculous), but we should check other widths for bogus values as well.
   if (current_text_pixel_width <= 0 && !text.empty()) {
-    return ElideText(CutString(text, text.length() / 2, elide_in_middle, false),
-                     font, available_pixel_width, elide_behavior);
+    string16 cut = slicer.CutString(text.length() / 2, false);
+    return ElideText(cut, font, available_pixel_width, elide_behavior);
   }
 
   if (current_text_pixel_width <= available_pixel_width)
     return text;
 
-  if (font.GetStringWidth(UTF8ToUTF16(kEllipsis)) > available_pixel_width)
+  if (font.GetStringWidth(kEllipsisUTF16) > available_pixel_width)
     return string16();
 
   // Use binary search to compute the elided text.
@@ -380,12 +424,12 @@ string16 ElideText(const string16& text,
   for (guess = (lo + hi) / 2; lo <= hi; guess = (lo + hi) / 2) {
     // We check the length of the whole desired string at once to ensure we
     // handle kerning/ligatures/etc. correctly.
-    string16 cut = CutString(text, guess, elide_in_middle, insert_ellipsis);
+    string16 cut = slicer.CutString(guess, insert_ellipsis);
     int guess_length = font.GetStringWidth(cut);
     // Check again that we didn't hit a Pango width overflow. If so, cut the
     // current string in half and start over.
     if (guess_length <= 0) {
-      return ElideText(CutString(text, guess / 2, elide_in_middle, false),
+      return ElideText(slicer.CutString(guess / 2, false),
                        font, available_pixel_width, elide_behavior);
     }
     if (guess_length > available_pixel_width)
@@ -394,7 +438,7 @@ string16 ElideText(const string16& text,
       lo = guess + 1;
   }
 
-  return CutString(text, guess, elide_in_middle, insert_ellipsis);
+  return slicer.CutString(guess, insert_ellipsis);
 }
 
 SortedDisplayURL::SortedDisplayURL(const GURL& url,
diff --git a/ui/base/text/text_elider_unittest.cc b/ui/base/text/text_elider_unittest.cc
index b716a00..439db7b 100644
--- a/ui/base/text/text_elider_unittest.cc
+++ b/ui/base/text/text_elider_unittest.cc
@@ -237,6 +237,51 @@ TEST(TextEliderTest, ElideTextTruncate) {
   }
 }
 
+// Checks that every |first_char| in |text| is immediately followed by
+// |second_char| and every |second_char| is immediately preceded by |first_char|.
+static void CheckSurrogatePairs(const string16& text,
+                                char16 first_char,
+                                char16 second_char) {
+  size_t index = text.find_first_of(first_char);
+  while (index != string16::npos) {
+    EXPECT_LT(index, text.length() - 1);  // |first_char| must not be last.
+    EXPECT_EQ(second_char, text[index + 1]);
+    index = text.find_first_of(first_char, index + 1);
+  }
+  index = text.find_first_of(second_char);
+  while (index != string16::npos) {
+    EXPECT_GT(index, 0U);  // |second_char| must not be first.
+    EXPECT_EQ(first_char, text[index - 1]);
+    index = text.find_first_of(second_char, index + 1);
+  }
+}
+
+TEST(TextEliderTest, ElideTextSurrogatePairs) {
+  const gfx::Font font;
+  // The below is 'MUSICAL SYMBOL G CLEF' (U+1D11E), which is represented in
+  // UTF-16 as two code units forming a surrogate pair.
+  const std::string kSurrogate = "\xF0\x9D\x84\x9E";
+  const string16 kTestString =
+      UTF8ToUTF16(kSurrogate + "ab" + kSurrogate + kSurrogate + "cd");
+  const int kTestStringWidth = font.GetStringWidth(kTestString);
+  const char16 kSurrogateFirstChar = kTestString[0];
+  const char16 kSurrogateSecondChar = kTestString[1];
+  string16 result;
+
+  // Elide |kTestString| to all possible widths and check that no instance of
+  // |kSurrogate| was split in two.
+  for (int width = 0; width <= kTestStringWidth; width++) {
+    result = ui::ElideText(kTestString, font, width, ui::TRUNCATE_AT_END);
+    CheckSurrogatePairs(result, kSurrogateFirstChar, kSurrogateSecondChar);
+
+    result = ui::ElideText(kTestString, font, width, ui::ELIDE_AT_END);
+    CheckSurrogatePairs(result, kSurrogateFirstChar, kSurrogateSecondChar);
+
+    result = ui::ElideText(kTestString, font, width, ui::ELIDE_IN_MIDDLE);
+    CheckSurrogatePairs(result, kSurrogateFirstChar, kSurrogateSecondChar);
+  }
+}
+
 TEST(TextEliderTest, ElideTextLongStrings) {
   const string16 kEllipsisStr = UTF8ToUTF16(kEllipsis);
   string16 data_scheme(UTF8ToUTF16("data:text/plain,"));
author	asvitkine@chromium.org <asvitkine@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-12-15 22:09:20 +0000
committer	asvitkine@chromium.org <asvitkine@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-12-15 22:09:20 +0000
commit	da06e3089a9649187c7c93e115376c9b23c96cb7 (patch)
tree	696eac9ae9561add157c724197966dbc74aeedaf /ui
parent	862deef098a8f161cb9926363bd4786f83fdbdc9 (diff)
download	chromium_src-da06e3089a9649187c7c93e115376c9b23c96cb7.zip chromium_src-da06e3089a9649187c7c93e115376c9b23c96cb7.tar.gz chromium_src-da06e3089a9649187c7c93e115376c9b23c96cb7.tar.bz2