diff options
Diffstat (limited to 'base')
-rw-r--r-- | base/i18n/icu_string_conversions.cc | 9 | ||||
-rw-r--r-- | base/string_util.cc | 144 | ||||
-rw-r--r-- | base/string_util.h | 1 | ||||
-rw-r--r-- | base/string_util_unittest.cc | 6 | ||||
-rw-r--r-- | base/utf_string_conversion_utils.h | 8 |
5 files changed, 20 insertions, 148 deletions
diff --git a/base/i18n/icu_string_conversions.cc b/base/i18n/icu_string_conversions.cc index 252eb9c..9014a7b 100644 --- a/base/i18n/icu_string_conversions.cc +++ b/base/i18n/icu_string_conversions.cc @@ -17,15 +17,6 @@ namespace base { namespace { - -inline bool IsValidCodepoint(uint32 code_point) { - // Excludes the surrogate code points ([0xD800, 0xDFFF]) and - // codepoints larger than 0x10FFFF (the highest codepoint allowed). - // Non-characters and unassigned codepoints are allowed. - return code_point < 0xD800u || - (code_point >= 0xE000u && code_point <= 0x10FFFFu); -} - // ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUSBSTITUTE // in source/common/ucnv_err.c. diff --git a/base/string_util.cc b/base/string_util.cc index c9b0aad..19c1735 100644 --- a/base/string_util.cc +++ b/base/string_util.cc @@ -24,6 +24,8 @@ #include "base/logging.h" #include "base/singleton.h" #include "base/third_party/dmg_fp/dmg_fp.h" +#include "base/utf_string_conversion_utils.h" +#include "base/third_party/icu/icu_utf.h" namespace { @@ -676,142 +678,20 @@ bool IsStringASCII(const base::StringPiece& str) { return DoIsStringASCII(str); } -// Helper functions that determine whether the given character begins a -// UTF-8 sequence of bytes with the given length. A character satisfies -// "IsInUTF8Sequence" if it is anything but the first byte in a multi-byte -// character. -static inline bool IsBegin2ByteUTF8(int c) { - return (c & 0xE0) == 0xC0; -} -static inline bool IsBegin3ByteUTF8(int c) { - return (c & 0xF0) == 0xE0; -} -static inline bool IsBegin4ByteUTF8(int c) { - return (c & 0xF8) == 0xF0; -} -static inline bool IsInUTF8Sequence(int c) { - return (c & 0xC0) == 0x80; -} - -// This function was copied from Mozilla, with modifications. The original code -// was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for -// this function is: -// This function subject to the Mozilla Public License Version -// 1.1 (the "License"); you may not use this code except in compliance with -// the License. You may obtain a copy of the License at -// http://www.mozilla.org/MPL/ -// -// Software distributed under the License is distributed on an "AS IS" basis, -// WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License -// for the specific language governing rights and limitations under the -// License. -// -// The Original Code is mozilla.org code. -// -// The Initial Developer of the Original Code is -// Netscape Communications Corporation. -// Portions created by the Initial Developer are Copyright (C) 2000 -// the Initial Developer. All Rights Reserved. -// -// Contributor(s): -// Scott Collins <scc@mozilla.org> (original author) -// -// This is a template so that it can be run on wide and 8-bit strings. We want -// to run it on wide strings when we have input that we think may have -// originally been UTF-8, but has been converted to wide characters because -// that's what we (and Windows) use internally. -template<typename CHAR> -static bool IsStringUTF8T(const CHAR* str, size_t length) { - bool overlong = false; - bool surrogate = false; - bool nonchar = false; - - // overlong byte upper bound - typename ToUnsigned<CHAR>::Unsigned olupper = 0; - - // surrogate byte lower bound - typename ToUnsigned<CHAR>::Unsigned slower = 0; - - // incremented when inside a multi-byte char to indicate how many bytes - // are left in the sequence - int positions_left = 0; - - for (uintptr_t i = 0; i < length; i++) { - // This whole function assume an unsigned value so force its conversion to - // an unsigned value. - typename ToUnsigned<CHAR>::Unsigned c = str[i]; - if (c < 0x80) - continue; // ASCII - - if (c <= 0xC1) { - // [80-BF] where not expected, [C0-C1] for overlong - return false; - } else if (IsBegin2ByteUTF8(c)) { - positions_left = 1; - } else if (IsBegin3ByteUTF8(c)) { - positions_left = 2; - if (c == 0xE0) { - // to exclude E0[80-9F][80-BF] - overlong = true; - olupper = 0x9F; - } else if (c == 0xED) { - // ED[A0-BF][80-BF]: surrogate codepoint - surrogate = true; - slower = 0xA0; - } else if (c == 0xEF) { - // EF BF [BE-BF] : non-character - // TODO(jungshik): EF B7 [90-AF] should be checked as well. - nonchar = true; - } - } else if (c <= 0xF4) { - positions_left = 3; - nonchar = true; - if (c == 0xF0) { - // to exclude F0[80-8F][80-BF]{2} - overlong = true; - olupper = 0x8F; - } else if (c == 0xF4) { - // to exclude F4[90-BF][80-BF] - // actually not surrogates but codepoints beyond 0x10FFFF - surrogate = true; - slower = 0x90; - } - } else { - return false; - } - - // eat the rest of this multi-byte character - while (positions_left) { - positions_left--; - i++; - c = str[i]; - if (!c) - return false; // end of string but not end of character sequence - - // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF] - if (nonchar && ((!positions_left && c < 0xBE) || - (positions_left == 1 && c != 0xBF) || - (positions_left == 2 && 0x0F != (0x0F & c) ))) { - nonchar = false; - } - if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) || - (surrogate && slower <= c) || (nonchar && !positions_left) ) { - return false; - } - overlong = surrogate = false; - } +bool IsStringUTF8(const std::string& str) { + const char *src = str.data(); + int32 src_len = static_cast<int32>(str.length()); + int32 char_index = 0; + + while (char_index < src_len) { + int32 code_point; + CBU8_NEXT(src, char_index, src_len, code_point); + if (!base::IsValidCharacter(code_point)) + return false; } return true; } -bool IsStringUTF8(const std::string& str) { - return IsStringUTF8T(str.data(), str.length()); -} - -bool IsStringWideUTF8(const std::wstring& str) { - return IsStringUTF8T(str.data(), str.length()); -} - template<typename Iter> static inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, diff --git a/base/string_util.h b/base/string_util.h index 9e0da1e..e10b99e 100644 --- a/base/string_util.h +++ b/base/string_util.h @@ -247,7 +247,6 @@ bool WideToLatin1(const std::wstring& wide, std::string* latin1); // add a new function for that. bool IsString8Bit(const std::wstring& str); bool IsStringUTF8(const std::string& str); -bool IsStringWideUTF8(const std::wstring& str); bool IsStringASCII(const std::wstring& str); bool IsStringASCII(const base::StringPiece& str); bool IsStringASCII(const string16& str); diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index c6961fe..d75104c 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -225,14 +225,8 @@ TEST(StringUtilTest, IsStringUTF8) { EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE) EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // U+1FFFE EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+10FFFF - - // This should also be false, but currently we pass them through. - // Disable them for now. -#if 0 EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0 EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF -#endif - // Strings in legacy encodings. We can certainly make up strings // in a legacy encoding that are valid in UTF-8, but in real data, // most of them are invalid as UTF-8. diff --git a/base/utf_string_conversion_utils.h b/base/utf_string_conversion_utils.h index a8a76c5..0c02d82 100644 --- a/base/utf_string_conversion_utils.h +++ b/base/utf_string_conversion_utils.h @@ -19,6 +19,14 @@ inline bool IsValidCodepoint(uint32 code_point) { (code_point >= 0xE000u && code_point <= 0x10FFFFu); } +inline bool IsValidCharacter(uint32 code_point) { + // Excludes non-characters (U+FDD0..U+FDEF, and all codepoints ending in + // 0xFFFE or 0xFFFF) from the set of valid code points. + return code_point < 0xD800u || (code_point >= 0xE000u && + code_point < 0xFDD0u) || (code_point > 0xFDEFu && + code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu); +} + // ReadUnicodeCharacter -------------------------------------------------------- // Reads a UTF-8 stream, placing the next code point into the given output |