Add UTF-8 check for JSON deserializer.

Add tests for IsStringUTF8. Make IsStringUTF8 accept std::string/std::wstring rather than char*/wchar_t*. Review URL: http://codereview.chromium.org/4268 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@2610 0039d316-1c4b-4281-b951-d872f2087c98
author: jungshik@google.com <jungshik@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-09-25 21:42:00 +0000
committer: jungshik@google.com <jungshik@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-09-25 21:42:00 +0000
commit: c9ec45429c64884c35f83b74131c0e3ae5b2bbe9 (patch)
tree: 39007373603b8e75f53fa5e4d0c1586b1a3a56b5 /base/string_util_unittest.cc
parent: 7e2fa03804bef4bff9c5bb941f2edf09b6d234c0 (diff)
download: chromium_src-c9ec45429c64884c35f83b74131c0e3ae5b2bbe9.zip
chromium_src-c9ec45429c64884c35f83b74131c0e3ae5b2bbe9.tar.gz
chromium_src-c9ec45429c64884c35f83b74131c0e3ae5b2bbe9.tar.bz2
1 files changed, 65 insertions, 0 deletions
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 89cec22..e438ebb 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -113,6 +113,71 @@ TEST(StringUtilTest, CollapseWhitespace) {
   }
 }
 
+
+TEST(StringUtilTest, IsStringUTF8) {
+  // Well-formed input: ASCII, 2-, 3- and 4-byte sequences, and a leading BOM.
+  EXPECT_TRUE(IsStringUTF8("abc"));
+  EXPECT_TRUE(IsStringUTF8("\xc2\x81"));  // U+0081
+  EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf"));  // U+103F
+  EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf"));  // U+4083F
+  EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"));
+  EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc"));  // UTF-8 BOM
+
+  // Surrogate code points (U+D800..U+DFFF) must not be encoded in UTF-8.
+  EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf"));  // U+D800 U+DFFF
+  EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f"));  // U+D80F
+  EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf"));  // U+DFFF
+
+  // Overlong sequences: the comment gives the code point the bytes decode to.
+  EXPECT_FALSE(IsStringUTF8("\xc0\x80"));  // U+0000
+  EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81"));  // "@A" (U+0040 U+0041)
+  EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80"));  // U+0000
+  EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80"));  // U+0080
+  EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf"));  // U+07FF
+  EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D"));  // U+000D
+  EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91"));  // U+0091
+  EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80"));  // U+0800
+  EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf"));  // U+FEFF (BOM)
+  EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf"));  // U+003F
+  EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5"));  // U+0825
+
+  // Beyond U+10FFFF (the upper limit of the Unicode codespace).
+  EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80"));  // U+110000
+  EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf"));  // 5 bytes
+  EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80"));  // 6 bytes
+
+  // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE); explicit lengths keep the NULs.
+  EXPECT_FALSE(IsStringUTF8("\xfe\xff"));  // UTF-16BE
+  EXPECT_FALSE(IsStringUTF8("\xff\xfe"));  // UTF-16LE
+  EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4)));  // UTF-32BE
+  EXPECT_FALSE(IsStringUTF8(std::string("\xff\xfe\x00\x00", 4)));  // UTF-32LE
+
+  // Non-characters: U+xxFFF[EF] where xx is 0x00 through 0x10, and <FDD0,FDEF>.
+  EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe"));  // U+FFFE
+  EXPECT_FALSE(IsStringUTF8("\xf0\x9f\xbf\xbe"));  // U+1FFFE
+  EXPECT_FALSE(IsStringUTF8("\xf4\x8f\xbf\xbf"));  // U+10FFFF
+
+  // These should also be false, but currently we pass them through.
+  // Disable them for now.
+#if 0
+  EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90"));  // U+FDD0
+  EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf"));  // U+FDEF
+#endif
+
+  // Strings in legacy encodings. We can certainly make up strings
+  // in a legacy encoding that are valid in UTF-8, but in real data,
+  // most of them are invalid as UTF-8.
+  EXPECT_FALSE(IsStringUTF8("caf\xe9"));  // cafe with U+00E9 in ISO-8859-1
+  EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2"));  // U+AC00, U+AC01 in EUC-KR
+  EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e"));  // U+4F60 U+597D in Big5
+  // "abc" with U+201[CD] in windows-125[0-8]
+  EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94"));
+  // U+0639 U+064E U+0644 U+064E in ISO-8859-6
+  EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee"));
+  // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7
+  EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC"));
+}
+
 static const wchar_t* const kConvertRoundtripCases[] = {
   L"Google Video",
   // "网页 图片 资讯更多 »"
author	jungshik@google.com <jungshik@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2008-09-25 21:42:00 +0000
committer	jungshik@google.com <jungshik@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2008-09-25 21:42:00 +0000
commit	c9ec45429c64884c35f83b74131c0e3ae5b2bbe9 (patch)
tree	39007373603b8e75f53fa5e4d0c1586b1a3a56b5 /base/string_util_unittest.cc
parent	7e2fa03804bef4bff9c5bb941f2edf09b6d234c0 (diff)
download	chromium_src-c9ec45429c64884c35f83b74131c0e3ae5b2bbe9.zip chromium_src-c9ec45429c64884c35f83b74131c0e3ae5b2bbe9.tar.gz chromium_src-c9ec45429c64884c35f83b74131c0e3ae5b2bbe9.tar.bz2