diff options
Diffstat (limited to 'base')
-rw-r--r-- | base/i18n/file_util_icu.cc | 3 | ||||
-rw-r--r-- | base/string_util.cc | 143 | ||||
-rw-r--r-- | base/string_util.h | 1 | ||||
-rw-r--r-- | base/string_util_unittest.cc | 5 | ||||
-rw-r--r-- | base/utf_string_conversion_utils.h | 11 | ||||
-rw-r--r-- | base/utf_string_conversions_unittest.cc | 20 |
6 files changed, 153 insertions, 30 deletions
diff --git a/base/i18n/file_util_icu.cc b/base/i18n/file_util_icu.cc index 914d2dd..f62a05e 100644 --- a/base/i18n/file_util_icu.cc +++ b/base/i18n/file_util_icu.cc @@ -62,10 +62,9 @@ IllegalCharacters::IllegalCharacters() { DCHECK(U_SUCCESS(status)); // Add non-characters. If this becomes a performance bottleneck by // any chance, do not add these to |set| and change IsFilenameLegal() - // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addition to calling + // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling // containsNone(). set->add(0xFDD0, 0xFDEF); - set->add(0xFFFD); // Standard replacement character. for (int i = 0; i <= 0x10; ++i) { int plane_base = 0x10000 * i; set->add(plane_base + 0xFFFE, plane_base + 0xFFFF); diff --git a/base/string_util.cc b/base/string_util.cc index 72151c2..bf69b0c 100644 --- a/base/string_util.cc +++ b/base/string_util.cc @@ -24,8 +24,6 @@ #include "base/logging.h" #include "base/singleton.h" #include "base/third_party/dmg_fp/dmg_fp.h" -#include "base/utf_string_conversion_utils.h" -#include "base/third_party/icu/icu_utf.h" namespace { @@ -613,21 +611,142 @@ bool IsStringASCII(const base::StringPiece& str) { return DoIsStringASCII(str); } -bool IsStringUTF8(const std::string& str) { - const char *src = str.data(); - int32 src_len = static_cast<int32>(str.length()); - int32 char_index = 0; - - while (char_index < src_len) { - int32 code_point; - CBU8_NEXT(src, char_index, src_len, code_point); - if (!base::IsValidCodepoint(code_point)) +// Helper functions that determine whether the given character begins a +// UTF-8 sequence of bytes with the given length. A character satisfies +// "IsInUTF8Sequence" if it is anything but the first byte in a multi-byte +// character. +static inline bool IsBegin2ByteUTF8(int c) { + return (c & 0xE0) == 0xC0; +} +static inline bool IsBegin3ByteUTF8(int c) { + return (c & 0xF0) == 0xE0; +} +static inline bool IsBegin4ByteUTF8(int c) { + return (c & 0xF8) == 0xF0; +} +static inline bool IsInUTF8Sequence(int c) { + return (c & 0xC0) == 0x80; +} + +// This function was copied from Mozilla, with modifications. The original code +// was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for +// this function is: +// This function subject to the Mozilla Public License Version +// 1.1 (the "License"); you may not use this code except in compliance with +// the License. You may obtain a copy of the License at +// http://www.mozilla.org/MPL/ +// +// Software distributed under the License is distributed on an "AS IS" basis, +// WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +// for the specific language governing rights and limitations under the +// License. +// +// The Original Code is mozilla.org code. +// +// The Initial Developer of the Original Code is +// Netscape Communications Corporation. +// Portions created by the Initial Developer are Copyright (C) 2000 +// the Initial Developer. All Rights Reserved. +// +// Contributor(s): +// Scott Collins <scc@mozilla.org> (original author) +// +// This is a template so that it can be run on wide and 8-bit strings. We want +// to run it on wide strings when we have input that we think may have +// originally been UTF-8, but has been converted to wide characters because +// that's what we (and Windows) use internally. +template<typename CHAR> +static bool IsStringUTF8T(const CHAR* str, size_t length) { + bool overlong = false; + bool surrogate = false; + bool nonchar = false; + + // overlong byte upper bound + typename ToUnsigned<CHAR>::Unsigned olupper = 0; + + // surrogate byte lower bound + typename ToUnsigned<CHAR>::Unsigned slower = 0; + + // incremented when inside a multi-byte char to indicate how many bytes + // are left in the sequence + int positions_left = 0; + + for (uintptr_t i = 0; i < length; i++) { + // This whole function assume an unsigned value so force its conversion to + // an unsigned value. + typename ToUnsigned<CHAR>::Unsigned c = str[i]; + if (c < 0x80) + continue; // ASCII + + if (c <= 0xC1) { + // [80-BF] where not expected, [C0-C1] for overlong return false; - } + } else if (IsBegin2ByteUTF8(c)) { + positions_left = 1; + } else if (IsBegin3ByteUTF8(c)) { + positions_left = 2; + if (c == 0xE0) { + // to exclude E0[80-9F][80-BF] + overlong = true; + olupper = 0x9F; + } else if (c == 0xED) { + // ED[A0-BF][80-BF]: surrogate codepoint + surrogate = true; + slower = 0xA0; + } else if (c == 0xEF) { + // EF BF [BE-BF] : non-character + // TODO(jungshik): EF B7 [90-AF] should be checked as well. + nonchar = true; + } + } else if (c <= 0xF4) { + positions_left = 3; + nonchar = true; + if (c == 0xF0) { + // to exclude F0[80-8F][80-BF]{2} + overlong = true; + olupper = 0x8F; + } else if (c == 0xF4) { + // to exclude F4[90-BF][80-BF] + // actually not surrogates but codepoints beyond 0x10FFFF + surrogate = true; + slower = 0x90; + } + } else { + return false; + } + // eat the rest of this multi-byte character + while (positions_left) { + positions_left--; + i++; + c = str[i]; + if (!c) + return false; // end of string but not end of character sequence + + // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF] + if (nonchar && ((!positions_left && c < 0xBE) || + (positions_left == 1 && c != 0xBF) || + (positions_left == 2 && 0x0F != (0x0F & c) ))) { + nonchar = false; + } + if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) || + (surrogate && slower <= c) || (nonchar && !positions_left) ) { + return false; + } + overlong = surrogate = false; + } + } return true; } +bool IsStringUTF8(const std::string& str) { + return IsStringUTF8T(str.data(), str.length()); +} + +bool IsStringWideUTF8(const std::wstring& str) { + return IsStringUTF8T(str.data(), str.length()); +} + template<typename Iter> static inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, diff --git a/base/string_util.h b/base/string_util.h index ac52f37..c895f27 100644 --- a/base/string_util.h +++ b/base/string_util.h @@ -227,6 +227,7 @@ bool WideToLatin1(const std::wstring& wide, std::string* latin1); // add a new function for that. bool IsString8Bit(const std::wstring& str); bool IsStringUTF8(const std::string& str); +bool IsStringWideUTF8(const std::wstring& str); bool IsStringASCII(const std::wstring& str); bool IsStringASCII(const base::StringPiece& str); bool IsStringASCII(const string16& str); diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 6f366a6..9d848a4 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -225,8 +225,13 @@ TEST(StringUtilTest, IsStringUTF8) { EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE) EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // U+1FFFE EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+10FFFF + + // This should also be false, but currently we pass them through. + // Disable them for now. +#if 0 EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0 EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF +#endif // Strings in legacy encodings. We can certainly make up strings // in a legacy encoding that are valid in UTF-8, but in real data, diff --git a/base/utf_string_conversion_utils.h b/base/utf_string_conversion_utils.h index 3fcb689..a8a76c5 100644 --- a/base/utf_string_conversion_utils.h +++ b/base/utf_string_conversion_utils.h @@ -12,12 +12,11 @@ namespace base { inline bool IsValidCodepoint(uint32 code_point) { - // Excludes non-characters (U+FDD0..U+FDEF, and all codepoints ending in - // 0xFFFE or 0xFFFF), surrogate code points (U+D800..U+DFFF), and codepoints - // larger than U+10FFFF (the highest codepoint allowed). - return code_point < 0xD800u || (code_point >= 0xE000u && - code_point < 0xFDD0u) || (code_point > 0xFDEFu && - code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu); + // Excludes the surrogate code points ([0xD800, 0xDFFF]) and + // codepoints larger than 0x10FFFF (the highest codepoint allowed). + // Non-characters and unassigned codepoints are allowed. + return code_point < 0xD800u || + (code_point >= 0xE000u && code_point <= 0x10FFFFu); } // ReadUnicodeCharacter -------------------------------------------------------- diff --git a/base/utf_string_conversions_unittest.cc b/base/utf_string_conversions_unittest.cc index f68c593..6ba0b5b 100644 --- a/base/utf_string_conversions_unittest.cc +++ b/base/utf_string_conversions_unittest.cc @@ -91,8 +91,8 @@ TEST(UTFStringConversionsTest, ConvertUTF8ToWide) { } convert_cases[] = { // Regular UTF-8 input. {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, - // Non-character is rejected. - {"\xef\xbf\xbfHello", L"\xfffdHello", false}, + // Non-character is passed through. + {"\xef\xbf\xbfHello", L"\xffffHello", true}, // Truncated UTF-8 sequence. {"\xe4\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false}, // Truncated off the end. @@ -105,10 +105,10 @@ TEST(UTFStringConversionsTest, ConvertUTF8ToWide) { // The result will either be in UTF-16 or UTF-32. #if defined(WCHAR_T_IS_UTF16) {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, - {"A\xF4\x8F\xBF\xBEz", L"A\xfffdz", false}, + {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, #elif defined(WCHAR_T_IS_UTF32) {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, - {"A\xF4\x8F\xBF\xBEz", L"A\xfffdz", false}, + {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, #endif }; @@ -148,9 +148,9 @@ TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) { {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, // Test a non-BMP character. {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, - // Non-characters are rejected. - {L"\xffffHello", "\xef\xbf\xbdHello", false}, - {L"\xdbff\xdffeHello", "\xef\xbf\xbdHello", false}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, // The first character is a truncated UTF-16 character. {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false}, // Truncated at the end. @@ -180,9 +180,9 @@ TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) { {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, // Test a non-BMP character. {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, - // Non-characters are rejected. - {L"\xffffHello", "\xEF\xBF\xBDHello", false}, - {L"\x10fffeHello", "\xEF\xBF\xBDHello", false}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, // Invalid Unicode code points. {L"\xfffffffHello", "\xEF\xBF\xBDHello", false}, // The first character is a truncated UTF-16 character. |