6 files changed, 153 insertions, 30 deletions
diff --git a/base/i18n/file_util_icu.cc b/base/i18n/file_util_icu.cc
index 914d2dd..f62a05e 100644
--- a/base/i18n/file_util_icu.cc
+++ b/base/i18n/file_util_icu.cc
@@ -62,10 +62,9 @@ IllegalCharacters::IllegalCharacters() {
   DCHECK(U_SUCCESS(status));
   // Add non-characters. If this becomes a performance bottleneck by
   // any chance, do not add these to |set| and change IsFilenameLegal()
-  // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addition to calling
+  // to check |(ucs4 & 0xFFFEu) == 0xFFFEu|, in addition to calling
   // containsNone().
   set->add(0xFDD0, 0xFDEF);
-  set->add(0xFFFD);  // Standard replacement character.
   for (int i = 0; i <= 0x10; ++i) {
     int plane_base = 0x10000 * i;
     set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
diff --git a/base/string_util.cc b/base/string_util.cc
index 72151c2..bf69b0c 100644
--- a/base/string_util.cc
+++ b/base/string_util.cc
@@ -24,8 +24,6 @@
 #include "base/logging.h"
 #include "base/singleton.h"
 #include "base/third_party/dmg_fp/dmg_fp.h"
-#include "base/utf_string_conversion_utils.h"
-#include "base/third_party/icu/icu_utf.h"
 
 namespace {
 
@@ -613,21 +611,142 @@ bool IsStringASCII(const base::StringPiece& str) {
   return DoIsStringASCII(str);
 }
 
-bool IsStringUTF8(const std::string& str) {
-  const char *src = str.data();
-  int32 src_len = static_cast<int32>(str.length());
-  int32 char_index = 0;
-
-  while (char_index < src_len) {
-    int32 code_point;
-    CBU8_NEXT(src, char_index, src_len, code_point);
-    if (!base::IsValidCodepoint(code_point))
+// Helpers that classify a single UTF-8 code unit by its high bits.
+// IsBeginNByteUTF8 matches the lead byte of an N-byte sequence (110xxxxx,
+// 1110xxxx, 11110xxx respectively). IsInUTF8Sequence matches a continuation
+// byte (10xxxxxx), i.e. any byte of a multi-byte character but the first.
+static inline bool IsBegin2ByteUTF8(int c) {
+  return (c & 0xE0) == 0xC0;
+}
+static inline bool IsBegin3ByteUTF8(int c) {
+  return (c & 0xF0) == 0xE0;
+}
+static inline bool IsBegin4ByteUTF8(int c) {
+  return (c & 0xF8) == 0xF0;
+}
+static inline bool IsInUTF8Sequence(int c) {
+  return (c & 0xC0) == 0x80;
+}
+
+// This function was copied from Mozilla, with modifications. The original code
+// was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for
+// this function is:
+//   This function subject to the Mozilla Public License Version
+//   1.1 (the "License"); you may not use this code except in compliance with
+//   the License. You may obtain a copy of the License at
+//   http://www.mozilla.org/MPL/
+//
+//   Software distributed under the License is distributed on an "AS IS" basis,
+//   WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+//   for the specific language governing rights and limitations under the
+//   License.
+//
+//   The Original Code is mozilla.org code.
+//
+//   The Initial Developer of the Original Code is
+//   Netscape Communications Corporation.
+//   Portions created by the Initial Developer are Copyright (C) 2000
+//   the Initial Developer. All Rights Reserved.
+//
+//   Contributor(s):
+//     Scott Collins <scc@mozilla.org> (original author)
+//
+// Returns true iff the |length| code units at |str| are well-formed UTF-8 with
+// no overlong forms, surrogates, code points above U+10FFFF, or U+xxFFFE /
+// U+xxFFFF non-characters. It is a template so that it can also be run on wide
+// strings whose contents we think may originally have been UTF-8.
+template<typename CHAR>
+static bool IsStringUTF8T(const CHAR* str, size_t length) {
+  bool overlong = false;
+  bool surrogate = false;
+  bool nonchar = false;
+
+  // overlong check: the first continuation byte must be above this bound
+  typename ToUnsigned<CHAR>::Unsigned olupper = 0;
+
+  // surrogate / out-of-range check: the first continuation byte must be below
+  typename ToUnsigned<CHAR>::Unsigned slower = 0;
+
+  // set from the lead byte to the number of continuation bytes still
+  // expected, then counted down as each one is consumed
+  int positions_left = 0;
+
+  for (uintptr_t i = 0; i < length; i++) {
+    // This whole function assumes an unsigned value, so force the conversion
+    // to an unsigned value.
+    typename ToUnsigned<CHAR>::Unsigned c = str[i];
+    if (c < 0x80)
+      continue;  // ASCII
+
+    if (c <= 0xC1) {
+      // [80-BF] where not expected, [C0-C1] for overlong
       return false;
-  }
+    } else if (IsBegin2ByteUTF8(c)) {
+      positions_left = 1;
+    } else if (IsBegin3ByteUTF8(c)) {
+      positions_left = 2;
+      if (c == 0xE0) {
+        // to exclude E0[80-9F][80-BF]
+        overlong = true;
+        olupper = 0x9F;
+      } else if (c == 0xED) {
+        // ED[A0-BF][80-BF]: surrogate codepoint
+        surrogate = true;
+        slower = 0xA0;
+      } else if (c == 0xEF) {
+        // EF BF [BE-BF] : non-character
+        // TODO(jungshik): EF B7 [90-AF] should be checked as well.
+        nonchar = true;
+      }
+    } else if (c <= 0xF4) {
+      positions_left = 3;
+      nonchar = true;
+      if (c == 0xF0) {
+        // to exclude F0[80-8F][80-BF]{2}
+        overlong = true;
+        olupper = 0x8F;
+      } else if (c == 0xF4) {
+        // to exclude F4[90-BF][80-BF]
+        // actually not surrogates but codepoints beyond 0x10FFFF
+        surrogate = true;
+        slower = 0x90;
+      }
+    } else {
+      return false;
+    }
 
+    // eat the rest of this multi-byte character
+    while (positions_left) {
+      positions_left--;
+      i++;
+      if (i >= length)
+        return false;  // end of string but not end of character sequence
+      c = str[i];
+
+      // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]
+      if (nonchar && ((!positions_left && c < 0xBE) ||
+                      (positions_left == 1 && c != 0xBF) ||
+                      (positions_left == 2 && 0x0F != (0x0F & c) ))) {
+        nonchar = false;
+      }
+      if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) ||
+          (surrogate && slower <= c) || (nonchar && !positions_left) ) {
+        return false;
+      }
+      overlong = surrogate = false;
+    }
+  }
   return true;
 }
 
+bool IsStringUTF8(const std::string& str) {
+  return IsStringUTF8T(str.data(), str.length());  // CHAR deduced as char
+}
+
+bool IsStringWideUTF8(const std::wstring& str) {
+  return IsStringUTF8T(str.data(), str.length());  // CHAR deduced as wchar_t
+}
+
 template<typename Iter>
 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
                                           Iter a_end,
diff --git a/base/string_util.h b/base/string_util.h
index ac52f37..c895f27 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -227,6 +227,7 @@ bool WideToLatin1(const std::wstring& wide, std::string* latin1);
 // add a new function for that.
 bool IsString8Bit(const std::wstring& str);
 bool IsStringUTF8(const std::string& str);
+bool IsStringWideUTF8(const std::wstring& str);
 bool IsStringASCII(const std::wstring& str);
 bool IsStringASCII(const base::StringPiece& str);
 bool IsStringASCII(const string16& str);
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 6f366a6..9d848a4 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -225,8 +225,13 @@ TEST(StringUtilTest, IsStringUTF8) {
   EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe"));  // U+FFFE)
   EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe"));  // U+1FFFE
   EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf"));  // U+10FFFF
+
+  // These should also be false, but currently we pass them through.
+  // Disable them for now.
+#if 0
   EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90"));  // U+FDD0
   EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf"));  // U+FDEF
+#endif
 
   // Strings in legacy encodings. We can certainly make up strings
   // in a legacy encoding that are valid in UTF-8, but in real data,
diff --git a/base/utf_string_conversion_utils.h b/base/utf_string_conversion_utils.h
index 3fcb689..a8a76c5 100644
--- a/base/utf_string_conversion_utils.h
+++ b/base/utf_string_conversion_utils.h
@@ -12,12 +12,11 @@
 namespace base {
 
 inline bool IsValidCodepoint(uint32 code_point) {
-  // Excludes non-characters (U+FDD0..U+FDEF, and all codepoints ending in
-  // 0xFFFE or 0xFFFF), surrogate code points (U+D800..U+DFFF), and codepoints
-  // larger than U+10FFFF (the highest codepoint allowed).
-  return code_point < 0xD800u || (code_point >= 0xE000u &&
-      code_point < 0xFDD0u) || (code_point > 0xFDEFu &&
-      code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu);
+  // Returns true for [0, 0xD7FF] and [0xE000, 0x10FFFF]: the surrogate code
+  // points ([0xD800, 0xDFFF]) and values larger than 0x10FFFF are excluded.
+  // Non-characters and unassigned codepoints are allowed.
+  return code_point < 0xD800u ||
+         (code_point >= 0xE000u && code_point <= 0x10FFFFu);
 }
 
 // ReadUnicodeCharacter --------------------------------------------------------
diff --git a/base/utf_string_conversions_unittest.cc b/base/utf_string_conversions_unittest.cc
index f68c593..6ba0b5b 100644
--- a/base/utf_string_conversions_unittest.cc
+++ b/base/utf_string_conversions_unittest.cc
@@ -91,8 +91,8 @@ TEST(UTFStringConversionsTest, ConvertUTF8ToWide) {
   } convert_cases[] = {
     // Regular UTF-8 input.
     {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true},
-    // Non-character is rejected.
-    {"\xef\xbf\xbfHello", L"\xfffdHello", false},
+    // Non-character is passed through.
+    {"\xef\xbf\xbfHello", L"\xffffHello", true},
     // Truncated UTF-8 sequence.
     {"\xe4\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false},
     // Truncated off the end.
@@ -105,10 +105,10 @@ TEST(UTFStringConversionsTest, ConvertUTF8ToWide) {
     // The result will either be in UTF-16 or UTF-32.
 #if defined(WCHAR_T_IS_UTF16)
     {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true},
-    {"A\xF4\x8F\xBF\xBEz", L"A\xfffdz", false},
+    {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true},
 #elif defined(WCHAR_T_IS_UTF32)
     {"A\xF0\x90\x8C\x80z", L"A\x10300z", true},
-    {"A\xF4\x8F\xBF\xBEz", L"A\xfffdz", false},
+    {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true},
 #endif
   };
 
@@ -148,9 +148,9 @@ TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) {
     {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
     // Test a non-BMP character.
     {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true},
-    // Non-characters are rejected.
-    {L"\xffffHello", "\xef\xbf\xbdHello", false},
-    {L"\xdbff\xdffeHello", "\xef\xbf\xbdHello", false},
+    // Non-characters are passed through.
+    {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+    {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true},
     // The first character is a truncated UTF-16 character.
     {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false},
     // Truncated at the end.
@@ -180,9 +180,9 @@ TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) {
     {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
     // Test a non-BMP character.
     {L"A\x10300z", "A\xF0\x90\x8C\x80z", true},
-    // Non-characters are rejected.
-    {L"\xffffHello", "\xEF\xBF\xBDHello", false},
-    {L"\x10fffeHello", "\xEF\xBF\xBDHello", false},
+    // Non-characters are passed through.
+    {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+    {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true},
     // Invalid Unicode code points.
     {L"\xfffffffHello", "\xEF\xBF\xBDHello", false},
     // The first character is a truncated UTF-16 character.