5 files changed, 20 insertions, 148 deletions
diff --git a/base/i18n/icu_string_conversions.cc b/base/i18n/icu_string_conversions.cc
index 252eb9c..9014a7b 100644
--- a/base/i18n/icu_string_conversions.cc
+++ b/base/i18n/icu_string_conversions.cc
@@ -17,15 +17,6 @@
 namespace base {
 
 namespace {
-
-inline bool IsValidCodepoint(uint32 code_point) {
-  // Excludes the surrogate code points ([0xD800, 0xDFFF]) and
-  // codepoints larger than 0x10FFFF (the highest codepoint allowed).
-  // Non-characters and unassigned codepoints are allowed.
-  return code_point < 0xD800u ||
-         (code_point >= 0xE000u && code_point <= 0x10FFFFu);
-}
-
 // ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUSBSTITUTE
 // in source/common/ucnv_err.c.
 
diff --git a/base/string_util.cc b/base/string_util.cc
index c9b0aad..19c1735 100644
--- a/base/string_util.cc
+++ b/base/string_util.cc
@@ -24,6 +24,8 @@
 #include "base/logging.h"
 #include "base/singleton.h"
 #include "base/third_party/dmg_fp/dmg_fp.h"
+#include "base/utf_string_conversion_utils.h"
+#include "base/third_party/icu/icu_utf.h"
 
 namespace {
 
@@ -676,142 +678,20 @@ bool IsStringASCII(const base::StringPiece& str) {
   return DoIsStringASCII(str);
 }
 
-// Helper functions that determine whether the given character begins a
-// UTF-8 sequence of bytes with the given length. A character satisfies
-// "IsInUTF8Sequence" if it is anything but the first byte in a multi-byte
-// character.
-static inline bool IsBegin2ByteUTF8(int c) {
-  return (c & 0xE0) == 0xC0;
-}
-static inline bool IsBegin3ByteUTF8(int c) {
-  return (c & 0xF0) == 0xE0;
-}
-static inline bool IsBegin4ByteUTF8(int c) {
-  return (c & 0xF8) == 0xF0;
-}
-static inline bool IsInUTF8Sequence(int c) {
-  return (c & 0xC0) == 0x80;
-}
-
-// This function was copied from Mozilla, with modifications. The original code
-// was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for
-// this function is:
-//   This function subject to the Mozilla Public License Version
-//   1.1 (the "License"); you may not use this code except in compliance with
-//   the License. You may obtain a copy of the License at
-//   http://www.mozilla.org/MPL/
-//
-//   Software distributed under the License is distributed on an "AS IS" basis,
-//   WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
-//   for the specific language governing rights and limitations under the
-//   License.
-//
-//   The Original Code is mozilla.org code.
-//
-//   The Initial Developer of the Original Code is
-//   Netscape Communications Corporation.
-//   Portions created by the Initial Developer are Copyright (C) 2000
-//   the Initial Developer. All Rights Reserved.
-//
-//   Contributor(s):
-//     Scott Collins <scc@mozilla.org> (original author)
-//
-// This is a template so that it can be run on wide and 8-bit strings. We want
-// to run it on wide strings when we have input that we think may have
-// originally been UTF-8, but has been converted to wide characters because
-// that's what we (and Windows) use internally.
-template<typename CHAR>
-static bool IsStringUTF8T(const CHAR* str, size_t length) {
-  bool overlong = false;
-  bool surrogate = false;
-  bool nonchar = false;
-
-  // overlong byte upper bound
-  typename ToUnsigned<CHAR>::Unsigned olupper = 0;
-
-  // surrogate byte lower bound
-  typename ToUnsigned<CHAR>::Unsigned slower = 0;
-
-  // incremented when inside a multi-byte char to indicate how many bytes
-  // are left in the sequence
-  int positions_left = 0;
-
-  for (uintptr_t i = 0; i < length; i++) {
-    // This whole function assume an unsigned value so force its conversion to
-    // an unsigned value.
-    typename ToUnsigned<CHAR>::Unsigned c = str[i];
-    if (c < 0x80)
-      continue;  // ASCII
-
-    if (c <= 0xC1) {
-      // [80-BF] where not expected, [C0-C1] for overlong
-      return false;
-    } else if (IsBegin2ByteUTF8(c)) {
-      positions_left = 1;
-    } else if (IsBegin3ByteUTF8(c)) {
-      positions_left = 2;
-      if (c == 0xE0) {
-        // to exclude E0[80-9F][80-BF]
-        overlong = true;
-        olupper = 0x9F;
-      } else if (c == 0xED) {
-        // ED[A0-BF][80-BF]: surrogate codepoint
-        surrogate = true;
-        slower = 0xA0;
-      } else if (c == 0xEF) {
-        // EF BF [BE-BF] : non-character
-        // TODO(jungshik): EF B7 [90-AF] should be checked as well.
-        nonchar = true;
-      }
-    } else if (c <= 0xF4) {
-      positions_left = 3;
-      nonchar = true;
-      if (c == 0xF0) {
-        // to exclude F0[80-8F][80-BF]{2}
-        overlong = true;
-        olupper = 0x8F;
-      } else if (c == 0xF4) {
-        // to exclude F4[90-BF][80-BF]
-        // actually not surrogates but codepoints beyond 0x10FFFF
-        surrogate = true;
-        slower = 0x90;
-      }
-    } else {
-      return false;
-    }
-
-    // eat the rest of this multi-byte character
-    while (positions_left) {
-      positions_left--;
-      i++;
-      c = str[i];
-      if (!c)
-        return false;  // end of string but not end of character sequence
-
-      // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]
-      if (nonchar && ((!positions_left && c < 0xBE) ||
-                      (positions_left == 1 && c != 0xBF) ||
-                      (positions_left == 2 && 0x0F != (0x0F & c) ))) {
-        nonchar = false;
-      }
-      if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) ||
-          (surrogate && slower <= c) || (nonchar && !positions_left) ) {
-        return false;
-      }
-      overlong = surrogate = false;
-    }
+bool IsStringUTF8(const std::string& str) {
+  // Walk the buffer one decoded code point at a time. |offset| is moved
+  // forward by CBU8_NEXT, so the loop header has no increment clause.
+  const char* bytes = str.data();
+  const int32 byte_count = static_cast<int32>(str.length());
+  for (int32 offset = 0; offset < byte_count;) {
+    int32 decoded;
+    CBU8_NEXT(bytes, offset, byte_count, decoded);
+    if (!base::IsValidCharacter(decoded))
+      return false;
   }
   return true;
 }
 
-bool IsStringUTF8(const std::string& str) {
-  return IsStringUTF8T(str.data(), str.length());
-}
-
-bool IsStringWideUTF8(const std::wstring& str) {
-  return IsStringUTF8T(str.data(), str.length());
-}
-
 template<typename Iter>
 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
                                           Iter a_end,
diff --git a/base/string_util.h b/base/string_util.h
index 9e0da1e..e10b99e 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -247,7 +247,6 @@ bool WideToLatin1(const std::wstring& wide, std::string* latin1);
 // add a new function for that.
 bool IsString8Bit(const std::wstring& str);
 bool IsStringUTF8(const std::string& str);
-bool IsStringWideUTF8(const std::wstring& str);
 bool IsStringASCII(const std::wstring& str);
 bool IsStringASCII(const base::StringPiece& str);
 bool IsStringASCII(const string16& str);
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index c6961fe..d75104c 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -225,14 +225,8 @@ TEST(StringUtilTest, IsStringUTF8) {
   EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe"));  // U+FFFE)
   EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe"));  // U+1FFFE
   EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf"));  // U+10FFFF
-
-  // This should also be false, but currently we pass them through.
-  // Disable them for now.
-#if 0
   EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90"));  // U+FDD0
   EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf"));  // U+FDEF
-#endif
-
   // Strings in legacy encodings. We can certainly make up strings
   // in a legacy encoding that are valid in UTF-8, but in real data,
   // most of them are invalid as UTF-8.
diff --git a/base/utf_string_conversion_utils.h b/base/utf_string_conversion_utils.h
index a8a76c5..0c02d82 100644
--- a/base/utf_string_conversion_utils.h
+++ b/base/utf_string_conversion_utils.h
@@ -19,6 +19,14 @@ inline bool IsValidCodepoint(uint32 code_point) {
          (code_point >= 0xE000u && code_point <= 0x10FFFFu);
 }
 
+inline bool IsValidCharacter(uint32 code_point) {
+  // Scalar values only (no surrogates, nothing above U+10FFFF), minus the
+  // non-characters U+FDD0..U+FDEF and every U+xxFFFE / U+xxFFFF.
+  if (code_point >= 0xD800u && code_point < 0xE000u) return false;
+  return code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu &&
+         (code_point < 0xFDD0u || code_point > 0xFDEFu);
+}
+
 // ReadUnicodeCharacter --------------------------------------------------------
 
 // Reads a UTF-8 stream, placing the next code point into the given output