diff options
author | jungshik@google.com <jungshik@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-09-25 21:42:00 +0000 |
---|---|---|
committer | jungshik@google.com <jungshik@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-09-25 21:42:00 +0000 |
commit | c9ec45429c64884c35f83b74131c0e3ae5b2bbe9 (patch) | |
tree | 39007373603b8e75f53fa5e4d0c1586b1a3a56b5 /base | |
parent | 7e2fa03804bef4bff9c5bb941f2edf09b6d234c0 (diff) | |
download | chromium_src-c9ec45429c64884c35f83b74131c0e3ae5b2bbe9.zip chromium_src-c9ec45429c64884c35f83b74131c0e3ae5b2bbe9.tar.gz chromium_src-c9ec45429c64884c35f83b74131c0e3ae5b2bbe9.tar.bz2 |
Add UTF-8 check for JSON deserializer.
Add tests for IsStringUTF8
Make IsStringUTF8 accept std::string/std::wstring rather than char*/wchar_t*
Review URL: http://codereview.chromium.org/4268
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@2610 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/json_reader.cc | 7 | ||||
-rw-r--r-- | base/json_reader_unittest.cc | 7 | ||||
-rw-r--r-- | base/string_util.cc | 13 | ||||
-rw-r--r-- | base/string_util.h | 4 | ||||
-rw-r--r-- | base/string_util_unittest.cc | 65 |
5 files changed, 86 insertions, 10 deletions
diff --git a/base/json_reader.cc b/base/json_reader.cc index 72ce364..8d0cab4 100644 --- a/base/json_reader.cc +++ b/base/json_reader.cc @@ -83,8 +83,11 @@ bool JSONReader::JsonToValue(const std::string& json, Value** root, bool check_root, bool allow_trailing_comma) { - // Assume input is UTF8. The conversion from UTF8 to wstring removes null - // bytes for us (a good thing). + // The input must be in UTF-8. + if (!IsStringUTF8(json.c_str())) + return false; + // The conversion from UTF8 to wstring removes null bytes for us + // (a good thing). std::wstring json_wide(UTF8ToWide(json)); const wchar_t* json_cstr = json_wide.c_str(); diff --git a/base/json_reader_unittest.cc b/base/json_reader_unittest.cc index 6c60a07..c2d6a42 100644 --- a/base/json_reader_unittest.cc +++ b/base/json_reader_unittest.cc @@ -481,6 +481,13 @@ TEST(JSONReaderTest, Reading) { ASSERT_EQ(L"\x7f51\x9875", str_val); delete root; + // Test invalid utf8 encoded input + root = NULL; + ASSERT_FALSE(JSONReader::JsonToValue("\"345\xb0\xa1\xb0\xa2\"", &root, + false, false)); + ASSERT_FALSE(JSONReader::JsonToValue("\"123\xc0\x81\"", &root, + false, false)); + // Test invalid root objects. root = NULL; ASSERT_FALSE(JSONReader::Read("null", &root, false)); diff --git a/base/string_util.cc b/base/string_util.cc index 2f39104..223c485 100644 --- a/base/string_util.cc +++ b/base/string_util.cc @@ -517,7 +517,7 @@ static inline bool IsInUTF8Sequence(int c) { // originally been UTF-8, but has been converted to wide characters because // that's what we (and Windows) use internally. 
template<typename CHAR> -static bool IsStringUTF8T(const CHAR* str) { +static bool IsStringUTF8T(const CHAR* str, int length) { bool overlong = false; bool surrogate = false; bool nonchar = false; @@ -532,7 +532,7 @@ static bool IsStringUTF8T(const CHAR* str) { // are left in the sequence int positions_left = 0; - for (int i = 0; str[i] != 0; i++) { + for (int i = 0; i < length; i++) { // This whole function assume an unsigned value so force its conversion to // an unsigned value. typename ToUnsigned<CHAR>::Unsigned c = str[i]; @@ -556,6 +556,7 @@ static bool IsStringUTF8T(const CHAR* str) { slower = 0xA0; } else if (c == 0xEF) { // EF BF [BE-BF] : non-character + // TODO(jungshik): EF B7 [90-AF] should be checked as well. nonchar = true; } } else if (c <= 0xF4) { @@ -599,12 +600,12 @@ static bool IsStringUTF8T(const CHAR* str) { return true; } -bool IsStringUTF8(const char* str) { - return IsStringUTF8T(str); +bool IsStringUTF8(const std::string& str) { + return IsStringUTF8T(str.data(), str.length()); } -bool IsStringWideUTF8(const wchar_t* str) { - return IsStringUTF8T(str); +bool IsStringWideUTF8(const std::wstring& str) { + return IsStringUTF8T(str.data(), str.length()); } template<typename Iter> diff --git a/base/string_util.h b/base/string_util.h index 8c28f3b..a9f08c4 100644 --- a/base/string_util.h +++ b/base/string_util.h @@ -227,8 +227,8 @@ bool WideToLatin1(const std::wstring& wide, std::string* latin1); // first case) or characters that use only 8-bits and whose 8-bit // representation looks like a UTF-8 string (the second case). 
bool IsString8Bit(const std::wstring& str); -bool IsStringUTF8(const char* str); -bool IsStringWideUTF8(const wchar_t* str); +bool IsStringUTF8(const std::string& str); +bool IsStringWideUTF8(const std::wstring& str); bool IsStringASCII(const std::wstring& str); bool IsStringASCII(const std::string& str); diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 89cec22..e438ebb 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -113,6 +113,71 @@ TEST(StringUtilTest, CollapseWhitespace) { } } + +TEST(StringUtilTest, IsStringUTF8) { + EXPECT_TRUE(IsStringUTF8("abc")); + EXPECT_TRUE(IsStringUTF8("\xc2\x81")); + EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf")); + EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf")); + EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf")); + EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc")); // UTF-8 BOM + + + // surrogate code points + EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf")); + EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f")); + EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf")); + + // overlong sequences + EXPECT_FALSE(IsStringUTF8("\xc0\x80")); // U+0000 + EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81")); // "AB" + EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80")); // U+0000 + EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80")); // U+0080 + EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf")); // U+07ff + EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D")); // U+000D + EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91")); // U+0091 + EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80")); // U+0800 + EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf")); // U+FEFF (BOM) + EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf")); // U+003F + EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5")); // U+00A5 + + // Beyond U+10FFFF (the upper limit of Unicode codespace) + EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80")); // U+110000 + EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf")); // 5 bytes + 
EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80")); // 6 bytes + + // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE) + EXPECT_FALSE(IsStringUTF8("\xfe\xff")); + EXPECT_FALSE(IsStringUTF8("\xff\xfe")); + EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4))); + EXPECT_FALSE(IsStringUTF8("\xff\xfe\x00\x00")); + + // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF> + EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE + EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // overlong form of U+FFFE (U+1FFFE is F0 9F BF BE) + EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+FFFFF (U+10FFFF is F4 8F BF BF) + + // This should also be false, but currently we pass them through. + // Disable them for now. +#if 0 + EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0 + EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF +#endif + + // Strings in legacy encodings. We can certainly make up strings + // in a legacy encoding that are valid in UTF-8, but in real data, + // most of them are invalid as UTF-8. + EXPECT_FALSE(IsStringUTF8("caf\xe9")); // cafe with U+00E9 in ISO-8859-1 + EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC01 in EUC-KR + EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5 + // "abc" with U+201[CD] in windows-125[0-8] + EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94")); + // U+0639 U+064E U+0644 U+064E in ISO-8859-6 + EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee")); + // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7 + EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC")); +} + static const wchar_t* const kConvertRoundtripCases[] = { L"Google Video", // "网页 图片 资讯更多 »" |