summaryrefslogtreecommitdiffstats
path: root/base/utf_string_conversions.cc
diff options
context:
space:
mode:
authorpkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-11-07 01:34:53 +0000
committerpkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-11-07 01:34:53 +0000
commitce85f60cd9d399109dab39fe5a9613879ab9a8f7 (patch)
tree0e9e0072d2e5eadfeec08eef0f06a43c56dc1751 /base/utf_string_conversions.cc
parentd90684d0cf0aa16389c9202153c97d373829b7f3 (diff)
downloadchromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.zip
chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.gz
chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.bz2
Fix various problems with inline autocomplete and URLs that change length during fixup:
* URLs with http auth info, which gets stripped * URLs with IDN hosts * URLs with escaped values that get unescaped In cases like these, we'd inline autocomplete from the wrong locations, highlight the wrong portions of the URL as matches, and sometimes DCHECK() in debug mode. The fix is to track how fixup affects the offsets into the URL we care about. Plumbing this required an enormous number of additions :( There is also a fix here to the URL Fixer Upper, which was obviously modified at some point in the past to use the Parsed components, but without updating the comments or some of the functionality to match. Since this isn't supposed to "fix up" things that aren't simple typos, I removed some code to "fix" bogus ports, which was causing bizarre effects when typing HTTP auth URLs ("http://foo:bar" would be fixed to "http://foo" and then matched for inline autocompletion, which was clearly wrong). This is tested incidentally by one of the new History URL Provider tests (which is how I discovered it). BUG=4010 TEST=Covered by unittests Review URL: http://codereview.chromium.org/372017 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@31352 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/utf_string_conversions.cc')
-rw-r--r--base/utf_string_conversions.cc253
1 files changed, 142 insertions, 111 deletions
diff --git a/base/utf_string_conversions.cc b/base/utf_string_conversions.cc
index 6b25cd8..ffff50a 100644
--- a/base/utf_string_conversions.cc
+++ b/base/utf_string_conversions.cc
@@ -84,43 +84,50 @@ bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
// WriteUnicodeCharacter -------------------------------------------------------
-// Appends a UTF-8 character to the given 8-bit string.
-void WriteUnicodeCharacter(uint32 code_point, std::string* output) {
+// Appends a UTF-8 character to the given 8-bit string. Returns the number of
+// bytes written.
+size_t WriteUnicodeCharacter(uint32 code_point, std::string* output) {
if (code_point <= 0x7f) {
// Fast path the common case of one byte.
output->push_back(code_point);
- return;
+ return 1;
}
- // U8_APPEND_UNSAFE can append up to 4 bytes.
- int32 char_offset = static_cast<int32>(output->length());
+ // CBU8_APPEND_UNSAFE can append up to 4 bytes.
+ size_t char_offset = output->length();
+ size_t original_char_offset = char_offset;
output->resize(char_offset + CBU8_MAX_LENGTH);
CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
- // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so
+ // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so
// it will represent the new length of the string.
output->resize(char_offset);
+ return char_offset - original_char_offset;
}
-// Appends the given code point as a UTF-16 character to the STL string.
-void WriteUnicodeCharacter(uint32 code_point, string16* output) {
+// Appends the given code point as a UTF-16 character to the given 16-bit
+// string. Returns the number of 16-bit values written.
+size_t WriteUnicodeCharacter(uint32 code_point, string16* output) {
if (CBU16_LENGTH(code_point) == 1) {
// Thie code point is in the Basic Multilingual Plane (BMP).
output->push_back(static_cast<char16>(code_point));
- } else {
- // Non-BMP characters use a double-character encoding.
- int32 char_offset = static_cast<int32>(output->length());
- output->resize(char_offset + CBU16_MAX_LENGTH);
- CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
+ return 1;
}
+ // Non-BMP characters use a double-character encoding.
+ size_t char_offset = output->length();
+ output->resize(char_offset + CBU16_MAX_LENGTH);
+ CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
+ return CBU16_MAX_LENGTH;
}
#if defined(WCHAR_T_IS_UTF32)
-// Appends the given UTF-32 character to the given 32-bit string.
-inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
+// Appends the given UTF-32 character to the given 32-bit string. Returns the
+// number of 32-bit values written.
+inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
// This is the easy case, just append the character.
output->push_back(code_point);
+ return 1;
}
#endif // defined(WCHAR_T_IS_UTF32)
@@ -131,31 +138,57 @@ inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
// determine the source, and the given output STL string will be replaced by
// the result.
template<typename SRC_CHAR, typename DEST_STRING>
-bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) {
- output->clear();
+bool ConvertUnicode(const SRC_CHAR* src,
+ size_t src_len,
+ DEST_STRING* output,
+ size_t* offset_for_adjustment) {
+ size_t output_offset =
+ (offset_for_adjustment && *offset_for_adjustment < src_len) ?
+ *offset_for_adjustment : DEST_STRING::npos;
// ICU requires 32-bit numbers.
bool success = true;
int32 src_len32 = static_cast<int32>(src_len);
for (int32 i = 0; i < src_len32; i++) {
uint32 code_point;
+ size_t original_i = i;
+ size_t chars_written = 0;
if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
- WriteUnicodeCharacter(code_point, output);
+ chars_written = WriteUnicodeCharacter(code_point, output);
} else {
// TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
// in place of an invalid codepoint.
success = false;
}
+ if ((output_offset != DEST_STRING::npos) &&
+ (*offset_for_adjustment > original_i)) {
+ // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
+ // character read, not after it (so that incrementing it in the loop
+ // increment will place it at the right location), so we need to account
+ // for that in determining the amount that was read.
+ if (*offset_for_adjustment <= static_cast<size_t>(i))
+ output_offset = DEST_STRING::npos;
+ else
+ output_offset += chars_written - (i - original_i + 1);
+ }
}
+
+ if (offset_for_adjustment)
+ *offset_for_adjustment = output_offset;
return success;
}
-// Guesses the length of the output in UTF-8 in bytes, and reserves that amount
-// of space in the given string. We also assume that the input character types
-// are unsigned, which will be true for UTF-16 and -32 on our systems. We assume
-// the string length is greater than zero.
+// Guesses the length of the output in UTF-8 in bytes, clears that output
+// string, and reserves that amount of space. We assume that the input
+// character types are unsigned, which will be true for UTF-16 and -32 on our
+// systems.
template<typename CHAR>
-void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) {
+void PrepareForUTF8Output(const CHAR* src,
+ size_t src_len,
+ std::string* output) {
+ output->clear();
+ if (src_len == 0)
+ return;
if (src[0] < 0x80) {
// Assume that the entire input will be ASCII.
output->reserve(src_len);
@@ -165,11 +198,15 @@ void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) {
}
}
-// Guesses the size of the output buffer (containing either UTF-16 or -32 data)
-// given some UTF-8 input that will be converted to it. See ReserveUTF8Output.
-// We assume the source length is > 0.
+// Prepares an output buffer (containing either UTF-16 or -32 data) given some
+// UTF-8 input that will be converted to it. See PrepareForUTF8Output().
template<typename STRING>
-void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
+void PrepareForUTF16Or32Output(const char* src,
+ size_t src_len,
+ STRING* output) {
+ output->clear();
+ if (src_len == 0)
+ return;
if (static_cast<unsigned char>(src[0]) < 0x80) {
// Assume the input is all ASCII, which means 1:1 correspondence.
output->reserve(src_len);
@@ -184,111 +221,121 @@ void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
// UTF-8 <-> Wide --------------------------------------------------------------
-std::string WideToUTF8(const std::wstring& wide) {
- std::string ret;
- if (wide.empty())
- return ret;
+bool WideToUTF8AndAdjustOffset(const wchar_t* src,
+ size_t src_len,
+ std::string* output,
+ size_t* offset_for_adjustment) {
+ PrepareForUTF8Output(src, src_len, output);
+ return ConvertUnicode<wchar_t, std::string>(src, src_len, output,
+ offset_for_adjustment);
+}
+std::string WideToUTF8AndAdjustOffset(const std::wstring& wide,
+ size_t* offset_for_adjustment) {
+ std::string ret;
// Ignore the success flag of this call, it will do the best it can for
// invalid input, which is what we want here.
- WideToUTF8(wide.data(), wide.length(), &ret);
+ WideToUTF8AndAdjustOffset(wide.data(), wide.length(), &ret,
+ offset_for_adjustment);
return ret;
}
-bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- ReserveUTF8Output(src, src_len, output);
- return ConvertUnicode<wchar_t, std::string>(src, src_len, output);
+bool UTF8ToWideAndAdjustOffset(const char* src,
+ size_t src_len,
+ std::wstring* output,
+ size_t* offset_for_adjustment) {
+ PrepareForUTF16Or32Output(src, src_len, output);
+ return ConvertUnicode<char, std::wstring>(src, src_len, output,
+ offset_for_adjustment);
}
-std::wstring UTF8ToWide(const base::StringPiece& utf8) {
+std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8,
+ size_t* offset_for_adjustment) {
std::wstring ret;
- if (utf8.empty())
- return ret;
-
- UTF8ToWide(utf8.data(), utf8.length(), &ret);
+ UTF8ToWideAndAdjustOffset(utf8.data(), utf8.length(), &ret,
+ offset_for_adjustment);
return ret;
}
-bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- ReserveUTF16Or32Output(src, src_len, output);
- return ConvertUnicode<char, std::wstring>(src, src_len, output);
-}
-
// UTF-16 <-> Wide -------------------------------------------------------------
#if defined(WCHAR_T_IS_UTF16)
// When wide == UTF-16, then conversions are a NOP.
-string16 WideToUTF16(const std::wstring& wide) {
- return wide;
-}
-
-bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
+bool WideToUTF16AndAdjustOffset(const wchar_t* src,
+ size_t src_len,
+ string16* output,
+ size_t* offset_for_adjustment) {
output->assign(src, src_len);
+ if (offset_for_adjustment && (*offset_for_adjustment >= src_len))
+ *offset_for_adjustment = string16::npos;
return true;
}
-std::wstring UTF16ToWide(const string16& utf16) {
- return utf16;
+string16 WideToUTF16AndAdjustOffset(const std::wstring& wide,
+ size_t* offset_for_adjustment) {
+ if (offset_for_adjustment && (*offset_for_adjustment >= wide.length()))
+ *offset_for_adjustment = string16::npos;
+ return wide;
}
-bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
+bool UTF16ToWideAndAdjustOffset(const char16* src,
+ size_t src_len,
+ std::wstring* output,
+ size_t* offset_for_adjustment) {
output->assign(src, src_len);
+ if (offset_for_adjustment && (*offset_for_adjustment >= src_len))
+ *offset_for_adjustment = std::wstring::npos;
return true;
}
-#elif defined(WCHAR_T_IS_UTF32)
-
-string16 WideToUTF16(const std::wstring& wide) {
- string16 ret;
- if (wide.empty())
- return ret;
-
- WideToUTF16(wide.data(), wide.length(), &ret);
- return ret;
+std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
+ size_t* offset_for_adjustment) {
+ if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length()))
+ *offset_for_adjustment = std::wstring::npos;
+ return utf16;
}
-bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
+#elif defined(WCHAR_T_IS_UTF32)
+bool WideToUTF16AndAdjustOffset(const wchar_t* src,
+ size_t src_len,
+ string16* output,
+ size_t* offset_for_adjustment) {
+ output->clear();
// Assume that normally we won't have any non-BMP characters so the counts
// will be the same.
output->reserve(src_len);
- return ConvertUnicode<wchar_t, string16>(src, src_len, output);
+ return ConvertUnicode<wchar_t, string16>(src, src_len, output,
+ offset_for_adjustment);
}
-std::wstring UTF16ToWide(const string16& utf16) {
- std::wstring ret;
- if (utf16.empty())
- return ret;
-
- UTF16ToWide(utf16.data(), utf16.length(), &ret);
+string16 WideToUTF16AndAdjustOffset(const std::wstring& wide,
+ size_t* offset_for_adjustment) {
+ string16 ret;
+ WideToUTF16AndAdjustOffset(wide.data(), wide.length(), &ret,
+ offset_for_adjustment);
return ret;
}
-bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
+bool UTF16ToWideAndAdjustOffset(const char16* src,
+ size_t src_len,
+ std::wstring* output,
+ size_t* offset_for_adjustment) {
+ output->clear();
// Assume that normally we won't have any non-BMP characters so the counts
// will be the same.
output->reserve(src_len);
- return ConvertUnicode<char16, std::wstring>(src, src_len, output);
+ return ConvertUnicode<char16, std::wstring>(src, src_len, output,
+ offset_for_adjustment);
+}
+
+std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
+ size_t* offset_for_adjustment) {
+ std::wstring ret;
+ UTF16ToWideAndAdjustOffset(utf16.data(), utf16.length(), &ret,
+ offset_for_adjustment);
+ return ret;
}
#endif // defined(WCHAR_T_IS_UTF32)
@@ -298,20 +345,12 @@ bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
#if defined(WCHAR_T_IS_UTF32)
bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- ReserveUTF16Or32Output(src, src_len, output);
- return ConvertUnicode<char, string16>(src, src_len, output);
+ PrepareForUTF16Or32Output(src, src_len, output);
+ return ConvertUnicode<char, string16>(src, src_len, output, NULL);
}
string16 UTF8ToUTF16(const std::string& utf8) {
string16 ret;
- if (utf8.empty())
- return ret;
-
// Ignore the success flag of this call, it will do the best it can for
// invalid input, which is what we want here.
UTF8ToUTF16(utf8.data(), utf8.length(), &ret);
@@ -319,20 +358,12 @@ string16 UTF8ToUTF16(const std::string& utf8) {
}
bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- ReserveUTF8Output(src, src_len, output);
- return ConvertUnicode<char16, std::string>(src, src_len, output);
+ PrepareForUTF8Output(src, src_len, output);
+ return ConvertUnicode<char16, std::string>(src, src_len, output, NULL);
}
std::string UTF16ToUTF8(const string16& utf16) {
std::string ret;
- if (utf16.empty())
- return ret;
-
// Ignore the success flag of this call, it will do the best it can for
// invalid input, which is what we want here.
UTF16ToUTF8(utf16.data(), utf16.length(), &ret);