summaryrefslogtreecommitdiffstats
path: root/base/string_util.cc
diff options
context:
space:
mode:
authorjschuh@chromium.org <jschuh@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-01-16 17:56:08 +0000
committerjschuh@chromium.org <jschuh@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-01-16 17:56:08 +0000
commit858d4887816dc5239921900f205351d740f38176 (patch)
tree99ce055954eb3bc1c825101ed7210bf4a051af89 /base/string_util.cc
parent548a6c0f30c3dcf374cc06be48f02a06da5c1d19 (diff)
downloadchromium_src-858d4887816dc5239921900f205351d740f38176.zip
chromium_src-858d4887816dc5239921900f205351d740f38176.tar.gz
chromium_src-858d4887816dc5239921900f205351d740f38176.tar.bz2
Revert 36459 - Breaks 7 WebKit tests
Changes are: * base::IsValidCodepoint() now returns false on noncharacter code points. * base::IsStringUTF8() now uses ICU library (removed old Mozilla implementation). * Removed base::IsStringWideUTF8() (was unused and confusing) * file_util::ReplaceIllegalCharactersInPath() now treats Unicode replacement character (U+FFFD) as invalid. * Associated unit tests updated. BUG=2759 BUG=30662 TEST=base_unittests gtest_filter=StringUtilTest.IsStringUTF8 TEST=base_unittests gtest_filter=UTFStringConversionsTest.* TEST=base_unittests gtest_filter=FileUtilICUTestReplaceIllegalCharactersInPathTest Review URL: http://codereview.chromium.org/548017 TBR=jschuh@chromium.org Review URL: http://codereview.chromium.org/552026 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@36460 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/string_util.cc')
-rw-r--r--base/string_util.cc143
1 files changed, 131 insertions, 12 deletions
diff --git a/base/string_util.cc b/base/string_util.cc
index 72151c2..bf69b0c 100644
--- a/base/string_util.cc
+++ b/base/string_util.cc
@@ -24,8 +24,6 @@
#include "base/logging.h"
#include "base/singleton.h"
#include "base/third_party/dmg_fp/dmg_fp.h"
-#include "base/utf_string_conversion_utils.h"
-#include "base/third_party/icu/icu_utf.h"
namespace {
@@ -613,21 +611,142 @@ bool IsStringASCII(const base::StringPiece& str) {
return DoIsStringASCII(str);
}
-bool IsStringUTF8(const std::string& str) {
- const char *src = str.data();
- int32 src_len = static_cast<int32>(str.length());
- int32 char_index = 0;
-
- while (char_index < src_len) {
- int32 code_point;
- CBU8_NEXT(src, char_index, src_len, code_point);
- if (!base::IsValidCodepoint(code_point))
+// Helper functions that determine whether the given character begins a
+// UTF-8 sequence of bytes with the given length. A character satisfies
+// "IsInUTF8Sequence" if it is anything but the first byte in a multi-byte
+// character.
+static inline bool IsBegin2ByteUTF8(int c) {
+ return (c & 0xE0) == 0xC0;
+}
+static inline bool IsBegin3ByteUTF8(int c) {
+ return (c & 0xF0) == 0xE0;
+}
+static inline bool IsBegin4ByteUTF8(int c) {
+ return (c & 0xF8) == 0xF0;
+}
+static inline bool IsInUTF8Sequence(int c) {
+ return (c & 0xC0) == 0x80;
+}
+
+// This function was copied from Mozilla, with modifications. The original code
+// was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for
+// this function is:
+// This function subject to the Mozilla Public License Version
+// 1.1 (the "License"); you may not use this code except in compliance with
+// the License. You may obtain a copy of the License at
+// http://www.mozilla.org/MPL/
+//
+// Software distributed under the License is distributed on an "AS IS" basis,
+// WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+// for the specific language governing rights and limitations under the
+// License.
+//
+// The Original Code is mozilla.org code.
+//
+// The Initial Developer of the Original Code is
+// Netscape Communications Corporation.
+// Portions created by the Initial Developer are Copyright (C) 2000
+// the Initial Developer. All Rights Reserved.
+//
+// Contributor(s):
+// Scott Collins <scc@mozilla.org> (original author)
+//
+// This is a template so that it can be run on wide and 8-bit strings. We want
+// to run it on wide strings when we have input that we think may have
+// originally been UTF-8, but has been converted to wide characters because
+// that's what we (and Windows) use internally.
+template<typename CHAR>
+static bool IsStringUTF8T(const CHAR* str, size_t length) {
+ bool overlong = false;
+ bool surrogate = false;
+ bool nonchar = false;
+
+ // overlong byte upper bound
+ typename ToUnsigned<CHAR>::Unsigned olupper = 0;
+
+ // surrogate byte lower bound
+ typename ToUnsigned<CHAR>::Unsigned slower = 0;
+
+ // set when a lead byte is seen to indicate how many continuation bytes
+ // are left in the sequence; decremented as each one is consumed
+ int positions_left = 0;
+
+ for (uintptr_t i = 0; i < length; i++) {
+ // This whole function assumes unsigned values, so force the conversion to
+ // an unsigned type.
+ typename ToUnsigned<CHAR>::Unsigned c = str[i];
+ if (c < 0x80)
+ continue; // ASCII
+
+ if (c <= 0xC1) {
+ // [80-BF] where not expected, [C0-C1] for overlong
return false;
- }
+ } else if (IsBegin2ByteUTF8(c)) {
+ positions_left = 1;
+ } else if (IsBegin3ByteUTF8(c)) {
+ positions_left = 2;
+ if (c == 0xE0) {
+ // to exclude E0[80-9F][80-BF]
+ overlong = true;
+ olupper = 0x9F;
+ } else if (c == 0xED) {
+ // ED[A0-BF][80-BF]: surrogate codepoint
+ surrogate = true;
+ slower = 0xA0;
+ } else if (c == 0xEF) {
+ // EF BF [BE-BF] : non-character
+ // TODO(jungshik): EF B7 [90-AF] should be checked as well.
+ nonchar = true;
+ }
+ } else if (c <= 0xF4) {
+ positions_left = 3;
+ nonchar = true;
+ if (c == 0xF0) {
+ // to exclude F0[80-8F][80-BF]{2}
+ overlong = true;
+ olupper = 0x8F;
+ } else if (c == 0xF4) {
+ // to exclude F4[90-BF][80-BF]{2}
+ // actually not surrogates but codepoints beyond 0x10FFFF
+ surrogate = true;
+ slower = 0x90;
+ }
+ } else {
+ return false;
+ }
+ // eat the rest of this multi-byte character
+ while (positions_left) {
+ positions_left--;
+ i++;
+ c = str[i];
+ if (!c)
+ return false; // end of string but not end of character sequence
+
+ // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]
+ if (nonchar && ((!positions_left && c < 0xBE) ||
+ (positions_left == 1 && c != 0xBF) ||
+ (positions_left == 2 && 0x0F != (0x0F & c) ))) {
+ nonchar = false;
+ }
+ if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) ||
+ (surrogate && slower <= c) || (nonchar && !positions_left) ) {
+ return false;
+ }
+ overlong = surrogate = false;
+ }
+ }
return true;
}
+bool IsStringUTF8(const std::string& str) {
+ return IsStringUTF8T(str.data(), str.length());
+}
+
+bool IsStringWideUTF8(const std::wstring& str) {
+ return IsStringUTF8T(str.data(), str.length());
+}
+
template<typename Iter>
static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
Iter a_end,