summary | refs | log | tree | commit | diff | stats
path: root/base/string_util_icu.cc
diff options
context:
space:
mode:
author: brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-08-07 15:29:49 +0000
committer: brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-08-07 15:29:49 +0000
commit: 6b27db809e959efaf7183ea2de64c6ab3947ef3d (patch)
tree: ed2ed10f826f6eb40884231ee0c98d86afef44a7 /base/string_util_icu.cc
parent: 65b1094478e054ef1f924d3681f8d34ec88d9fcf (diff)
downloadchromium_src-6b27db809e959efaf7183ea2de64c6ab3947ef3d.zip
chromium_src-6b27db809e959efaf7183ea2de64c6ab3947ef3d.tar.gz
chromium_src-6b27db809e959efaf7183ea2de64c6ab3947ef3d.tar.bz2
Remove the old NativeMB functions from string util, and use the new ones in sys_strings.h. I also removed duplicated code from the sandbox that can now use this, and fixed one case in the bug reporter that should not have been using the native multibyte encoding.
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@515 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/string_util_icu.cc')
-rw-r--r-- base/string_util_icu.cc 190
1 files changed, 190 insertions, 0 deletions
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
index 797ccbd..1a84be3 100644
--- a/base/string_util_icu.cc
+++ b/base/string_util_icu.cc
@@ -26,6 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
#include "base/string_util.h"
#include <string.h>
@@ -38,6 +39,195 @@
#include "unicode/numfmt.h"
#include "unicode/ustring.h"
+namespace {
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+// Decodes one code point from the UTF-8 buffer |src|, which holds |src_len|
+// bytes in total. Decoding begins at the byte offset |*char_index|.
+//
+// On return, |*char_index| refers to the final byte that was consumed rather
+// than the byte after it, so that a caller's for-loop increment lands on the
+// start of the following character.
+//
+// Returns true when the decoded value passes U_IS_UNICODE_CHAR. When false is
+// returned, the value stored in |*code_point| must not be used.
+bool ReadUnicodeCharacter(const char* src, int32 src_len,
+                          int32* char_index, uint32* code_point) {
+  int32 next = *char_index;
+  uint32 decoded;
+  U8_NEXT(src, next, src_len, decoded);
+
+  // U8_NEXT leaves |next| one past the bytes it consumed; callers expect the
+  // index of the last consumed byte instead.
+  *char_index = next - 1;
+  *code_point = decoded;
+
+  // Reject anything that is not a real Unicode character.
+  return U_IS_UNICODE_CHAR(decoded);
+}
+
+#ifdef WIN32
+// UTF-16 reader used on Windows. Same contract as the 8-bit overload above:
+// decoding starts at |src[*char_index]|, and on success |*char_index| is left
+// on the last 16-bit unit that was consumed.
+//
+// Returns false without touching |*char_index| when the unit at the current
+// position is a surrogate that does not begin a well-formed pair.
+bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
+                          int32* char_index, uint32* code_point) {
+  const wchar_t first = src[*char_index];
+
+  if (!U16_IS_SURROGATE(first)) {
+    // Ordinary case: a single 16-bit unit is the whole code point.
+    *code_point = first;
+    return U_IS_UNICODE_CHAR(*code_point);
+  }
+
+  // A surrogate must be a lead unit that is immediately followed, inside the
+  // buffer, by a trail unit.
+  if (!U16_IS_SURROGATE_LEAD(first) ||
+      *char_index + 1 >= src_len ||
+      !U16_IS_TRAIL(src[*char_index + 1])) {
+    return false;
+  }
+
+  // Well-formed pair: combine both units and step onto the trail unit.
+  *code_point = U16_GET_SUPPLEMENTARY(first, src[*char_index + 1]);
+  (*char_index)++;
+
+  return U_IS_UNICODE_CHAR(*code_point);
+}
+#else
+// Reads a 32-bit character for Mac and Linux systems, where every wchar_t
+// element is copied out as one code point. The usage is the same as the 8-bit
+// version above: the element at |src[*char_index]| is read and |*char_index|
+// is left unchanged, since exactly one element is consumed.
+//
+// |src_len| is accepted only so the signature matches the other overloads; it
+// is not consulted here.
+//
+// Returns true if the value is a valid Unicode character.
+bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
+                          int32* char_index, uint32* code_point) {
+  // Conversion is easy since the source is 32-bit.
+  *code_point = src[*char_index];
+
+  // Validate the value.
+  return U_IS_UNICODE_CHAR(*code_point);
+}
+#endif
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+// Encodes |code_point| as UTF-8 and appends the resulting bytes to |*output|.
+void WriteUnicodeCharacter(uint32 code_point, std::basic_string<char>* output) {
+  if (code_point > 0x7f) {
+    // Multi-byte sequence. Grow the string by the maximum encoded length so
+    // that U8_APPEND_UNSAFE (which can write up to 4 bytes) has room.
+    int32 write_pos = static_cast<int32>(output->length());
+    output->resize(write_pos + U8_MAX_LENGTH);
+
+    U8_APPEND_UNSAFE(&(*output)[0], write_pos, code_point);
+
+    // The macro advanced |write_pos| past the bytes it wrote, so it now equals
+    // the real length of the string; trim the unused slack.
+    output->resize(write_pos);
+  } else {
+    // Common case: a single byte is appended directly.
+    output->push_back(static_cast<char>(code_point));
+  }
+}
+
+#ifdef WIN32
+// Windows variant (wchar_t is UTF-16 there): appends |code_point| to |*output|
+// as either one 16-bit unit or two.
+void WriteUnicodeCharacter(uint32 code_point,
+                           std::basic_string<wchar_t>* output) {
+  if (U16_LENGTH(code_point) != 1) {
+    // Code points that need more than one unit are written as a pair. Make
+    // room for the pair, then let ICU write both units in place.
+    int32 write_pos = static_cast<int32>(output->length());
+    output->resize(write_pos + U16_MAX_LENGTH);
+    U16_APPEND_UNSAFE(&(*output)[0], write_pos, code_point);
+    return;
+  }
+
+  // The code point fits in a single 16-bit unit.
+  output->push_back(static_cast<wchar_t>(code_point));
+}
+#else
+// Linux/Mac variant, where wchar_t is UCS-4: the code point is stored as-is in
+// a single element appended to |*output|.
+inline void WriteUnicodeCharacter(uint32 code_point,
+                                  std::basic_string<wchar_t>* output) {
+  // No encoding step is needed; one element holds the whole value.
+  *output += static_cast<wchar_t>(code_point);
+}
+#endif
+
+// Generalized Unicode converter -----------------------------------------------
+
+// Transcodes |src_len| elements of |src| from the source character type to the
+// destination character type. Whatever |*output| held before is discarded and
+// replaced with the converted text.
+//
+// Input that fails to decode is skipped and conversion carries on with the
+// rest of the buffer. Returns true only if every character decoded cleanly.
+template<typename SRC_CHAR, typename DEST_CHAR>
+bool ConvertUnicode(const SRC_CHAR* src, size_t src_len,
+                    std::basic_string<DEST_CHAR>* output) {
+  output->clear();
+
+  // The ICU helpers work with 32-bit lengths and indices.
+  const int32 length32 = static_cast<int32>(src_len);
+
+  bool all_valid = true;
+  for (int32 index = 0; index < length32; index++) {
+    uint32 code_point;
+    // The reader leaves |index| on the last unit it consumed; the loop
+    // increment then moves to the next character.
+    if (!ReadUnicodeCharacter(src, length32, &index, &code_point)) {
+      all_valid = false;
+      continue;
+    }
+    WriteUnicodeCharacter(code_point, output);
+  }
+  return all_valid;
+}
+
+} // namespace
+
+// UTF-x <-> UTF-x -------------------------------------------------------------
+
+// Returns the UTF-8 form of |wide|. An empty input gives an empty result.
+std::string WideToUTF8(const std::wstring& wide) {
+  std::string utf8;
+  if (!wide.empty()) {
+    // The success flag is deliberately dropped: for invalid input the
+    // converter does the best it can, which is what we want here.
+    WideToUTF8(wide.data(), wide.length(), &utf8);
+  }
+  return utf8;
+}
+
+// Converts |src_len| wide characters starting at |src| to UTF-8, replacing the
+// contents of |*output|. Returns the success flag from ConvertUnicode, or true
+// for an empty input.
+bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
+  if (!src_len) {
+    output->clear();
+    return true;
+  }
+
+  // Capacity heuristic based on the first character only: an ASCII start is
+  // taken to mean the whole input is ASCII (one byte per character), anything
+  // else is budgeted at three bytes per character. This is just a starting
+  // point; the string grows as needed if the guess is too small.
+  output->reserve(src[0] < 0x80 ? src_len : src_len * 3);
+
+  return ConvertUnicode<wchar_t, char>(src, src_len, output);
+}
+
+// Returns the wide-string form of |utf8|. An empty input gives an empty
+// result. The converter's success flag is not reported to the caller.
+std::wstring UTF8ToWide(const std::string& utf8) {
+  std::wstring wide;
+  if (!utf8.empty())
+    UTF8ToWide(utf8.data(), utf8.length(), &wide);
+  return wide;
+}
+
+// Converts |src_len| bytes of UTF-8 starting at |src| to a wide string,
+// replacing the contents of |*output|. Returns the success flag from
+// ConvertUnicode, or true for an empty input.
+bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
+  if (src_len == 0) {
+    output->clear();
+    return true;
+  }
+
+  // Intelligently guess the size of the output string. When it's an ASCII
+  // character, assume the rest will be ASCII and use a buffer size the same as
+  // the input. When it's not ASCII, assume the UTF-8 takes 2 bytes per
+  // character (this is more conservative than 3 which we use above when
+  // converting the other way).
+  //
+  // The first byte is compared as unsigned char: plain char may be signed, in
+  // which case every byte >= 0x80 is negative and would always look "ASCII".
+  if (static_cast<unsigned char>(src[0]) < 0x80)
+    output->reserve(src_len);
+  else
+    output->reserve(src_len / 2);
+  return ConvertUnicode<char, wchar_t>(src, src_len, output);
+}
+
// Codepage <-> Wide -----------------------------------------------------------
// Convert a unicode string into the specified codepage_name. If the codepage