7 files changed, 90 insertions, 14 deletions
diff --git a/base/json_reader.cc b/base/json_reader.cc
index 72ce364..8d0cab4 100644
--- a/base/json_reader.cc
+++ b/base/json_reader.cc
@@ -83,8 +83,11 @@ bool JSONReader::JsonToValue(const std::string& json,
                              Value** root,
                              bool check_root,
                              bool allow_trailing_comma) {
-  // Assume input is UTF8.  The conversion from UTF8 to wstring removes null
-  // bytes for us (a good thing).
+  // Input must be UTF-8. Pass |json| itself: c_str() would stop at a NUL.
+  if (!IsStringUTF8(json))
+    return false;
+  // The conversion from UTF8 to wstring removes null bytes for us
+  // (a good thing).
   std::wstring json_wide(UTF8ToWide(json));
   const wchar_t* json_cstr = json_wide.c_str();
 
diff --git a/base/json_reader_unittest.cc b/base/json_reader_unittest.cc
index 6c60a07..c2d6a42 100644
--- a/base/json_reader_unittest.cc
+++ b/base/json_reader_unittest.cc
@@ -481,6 +481,13 @@ TEST(JSONReaderTest, Reading) {
   ASSERT_EQ(L"\x7f51\x9875", str_val);
   delete root;
 
+  // Test invalid utf8 encoded input
+  root = NULL;
+  ASSERT_FALSE(JSONReader::JsonToValue("\"345\xb0\xa1\xb0\xa2\"", &root,
+                                       false, false));
+  ASSERT_FALSE(JSONReader::JsonToValue("\"123\xc0\x81\"", &root,
+                                       false, false));
+
   // Test invalid root objects.
   root = NULL;
   ASSERT_FALSE(JSONReader::Read("null", &root, false));
diff --git a/base/string_util.cc b/base/string_util.cc
index 2f39104..223c485 100644
--- a/base/string_util.cc
+++ b/base/string_util.cc
@@ -517,7 +517,7 @@ static inline bool IsInUTF8Sequence(int c) {
 // originally been UTF-8, but has been converted to wide characters because
 // that's what we (and Windows) use internally.
 template<typename CHAR>
-static bool IsStringUTF8T(const CHAR* str) {
+static bool IsStringUTF8T(const CHAR* str, int length) {
   bool overlong = false;
   bool surrogate = false;
   bool nonchar = false;
@@ -532,7 +532,7 @@ static bool IsStringUTF8T(const CHAR* str) {
   // are left in the sequence
   int positions_left = 0;
 
-  for (int i = 0; str[i] != 0; i++) {
+  for (int i = 0; i < length; i++) {
     // This whole function assume an unsigned value so force its conversion to
     // an unsigned value.
     typename ToUnsigned<CHAR>::Unsigned c = str[i];
@@ -556,6 +556,7 @@ static bool IsStringUTF8T(const CHAR* str) {
         slower = 0xA0;
       } else if (c == 0xEF) {
         // EF BF [BE-BF] : non-character
+        // TODO(jungshik): EF B7 [90-AF] should be checked as well.
         nonchar = true;
       }
     } else if (c <= 0xF4) {
@@ -599,12 +600,12 @@ static bool IsStringUTF8T(const CHAR* str) {
   return true;
 }
 
-bool IsStringUTF8(const char* str) {
-  return IsStringUTF8T(str);
+bool IsStringUTF8(const std::string& str) {
+  return IsStringUTF8T(str.data(), static_cast<int>(str.length()));
 }
 
-bool IsStringWideUTF8(const wchar_t* str) {
-  return IsStringUTF8T(str);
+bool IsStringWideUTF8(const std::wstring& str) {
+  return IsStringUTF8T(str.data(), static_cast<int>(str.length()));
 }
 
 template<typename Iter>
diff --git a/base/string_util.h b/base/string_util.h
index 8c28f3b..a9f08c4 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -227,8 +227,8 @@ bool WideToLatin1(const std::wstring& wide, std::string* latin1);
 // first case) or characters that use only 8-bits and whose 8-bit
 // representation looks like a UTF-8 string (the second case).
 bool IsString8Bit(const std::wstring& str);
-bool IsStringUTF8(const char* str);
-bool IsStringWideUTF8(const wchar_t* str);
+bool IsStringUTF8(const std::string& str);
+bool IsStringWideUTF8(const std::wstring& str);
 bool IsStringASCII(const std::wstring& str);
 bool IsStringASCII(const std::string& str);
 
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 89cec22..e438ebb 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -113,6 +113,71 @@ TEST(StringUtilTest, CollapseWhitespace) {
   }
 }
 
+
+TEST(StringUtilTest, IsStringUTF8) {
+  // Well-formed input: ASCII, 2-/3-/4-byte sequences and a leading BOM.
+  EXPECT_TRUE(IsStringUTF8("abc"));
+  EXPECT_TRUE(IsStringUTF8("\xc2\x81"));
+  EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf"));
+  EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf"));
+  EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"));
+  EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc")); // UTF-8 BOM
+
+  // surrogate code points
+  EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf"));
+  EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f"));
+  EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf"));
+
+  // overlong sequences
+  EXPECT_FALSE(IsStringUTF8("\xc0\x80")); // U+0000
+  EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81")); // "@A"
+  EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80")); // U+0000
+  EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80")); // U+0080
+  EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf")); // U+07ff
+  EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D")); // U+000D
+  EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91")); // U+0091
+  EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80")); // U+0800
+  EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf")); // U+FEFF (BOM)
+  EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf")); // U+003F
+  EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5")); // U+0825
+
+  // Beyond U+10FFFF (the upper limit of Unicode codespace)
+  EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80")); // U+110000
+  EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf")); // 5 bytes
+  EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80")); // 6 bytes
+
+  // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE)
+  EXPECT_FALSE(IsStringUTF8("\xfe\xff"));
+  EXPECT_FALSE(IsStringUTF8("\xff\xfe"));
+  EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4)));
+  EXPECT_FALSE(IsStringUTF8(std::string("\xff\xfe\x00\x00", 4)));
+
+  // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF>
+  EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE
+  EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // overlong U+FFFE, not U+1FFFE
+  EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+FFFFF
+
+  // These should also be false, but currently we pass them through.
+  // Disable them for now.
+#if 0
+  EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0
+  EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF
+#endif
+
+  // Strings in legacy encodings. We can certainly make up strings
+  // in a legacy encoding that are valid in UTF-8, but in real data,
+  // most of them are invalid as UTF-8.
+  EXPECT_FALSE(IsStringUTF8("caf\xe9")); // cafe with U+00E9 in ISO-8859-1
+  EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC01 in EUC-KR
+  EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5
+  // "abc" with U+201[CD] in windows-125[0-8]
+  EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94"));
+  // U+0639 U+064E U+0644 U+064E in ISO-8859-6
+  EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee"));
+  // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7
+  EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC"));
+}
+
 static const wchar_t* const kConvertRoundtripCases[] = {
   L"Google Video",
   // "网页 图片 资讯更多 »"
diff --git a/net/base/net_util.cc b/net/base/net_util.cc
index d3075ad..305cbcc 100644
--- a/net/base/net_util.cc
+++ b/net/base/net_util.cc
@@ -237,7 +237,7 @@ bool DecodeWord(const std::string& encoded_word,
   // UTF-16 assuming it's in the OS default encoding.
   if (!IsStringASCII(encoded_word)) {
     // Try falling back to the NativeMB encoding if the raw input is not UTF-8.
-    if (IsStringUTF8(encoded_word.c_str())) {
+    if (IsStringUTF8(encoded_word)) {
       *output = encoded_word;
     } else {
       *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
@@ -328,7 +328,7 @@ bool DecodeWord(const std::string& encoded_word,
   // support a rudimentary form of RFC 2231 with charset label, but
   // it'd gain us little in terms of compatibility.
   tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
-  if (IsStringUTF8(tmp.c_str())) {
+  if (IsStringUTF8(tmp)) {
     output->swap(tmp);
     return true;
     // We can try either the OS default charset or 'origin charset' here,
diff --git a/net/base/net_util_win.cc b/net/base/net_util_win.cc
index 206f485..7ea217c 100644
--- a/net/base/net_util_win.cc
+++ b/net/base/net_util_win.cc
@@ -44,7 +44,7 @@ bool FileURLToFilePath(const GURL& url, std::wstring* file_path) {
   path = UnescapeURLComponent(path,
       UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
 
-  if (!IsStringUTF8(path.c_str())) {
+  if (!IsStringUTF8(path)) {
     // Not UTF-8, assume encoding is native codepage and we're done. We know we
     // are giving the conversion function a nonempty string, and it may fail if
     // the given string is not in the current encoding and give us an empty
@@ -68,7 +68,7 @@ bool FileURLToFilePath(const GURL& url, std::wstring* file_path) {
     NOTREACHED() << "Should have filtered out non-8-bit strings above.";
     return false;
   }
-  if (IsStringUTF8(narrow.c_str())) {
+  if (IsStringUTF8(narrow)) {
     // Our string actually looks like it could be UTF-8, convert to 8-bit
     // UTF-8 and then to the corresponding wide string.
     *file_path = UTF8ToWide(narrow);