diff options
Diffstat (limited to 'base/i18n/icu_string_conversions.cc')
-rw-r--r-- | base/i18n/icu_string_conversions.cc | 204 |
1 files changed, 128 insertions, 76 deletions
diff --git a/base/i18n/icu_string_conversions.cc b/base/i18n/icu_string_conversions.cc index ba9f9ae..c93b103 100644 --- a/base/i18n/icu_string_conversions.cc +++ b/base/i18n/icu_string_conversions.cc @@ -157,6 +157,90 @@ const char kCodepageUTF16LE[] = "UTF-16LE"; // Codepage <-> Wide/UTF-16 --------------------------------------------------- +// Convert a UTF-16 string into the specified codepage_name. If the codepage +// isn't found, return false. +bool UTF16ToCodepage(const string16& utf16, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::string* encoded) { + encoded->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + + return ConvertFromUTF16(converter, utf16.c_str(), + static_cast<int>(utf16.length()), on_error, encoded); +} + +bool CodepageToUTF16AndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16, + size_t* offset_for_adjustment) { + utf16->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + + // Even in the worst case, the maximum length in 2-byte units of UTF-16 + // output would be at most the same as the number of bytes in input. There + // is no single-byte encoding in which a character is mapped to a + // non-BMP character requiring two 2-byte units. + // + // Moreover, non-BMP characters in legacy multibyte encodings + // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are + // BOCU and SCSU, but we don't care about them. + size_t uchar_max_length = encoded.length() + 1; + + SetUpErrorHandlerForToUChars(on_error, converter, &status); + char16* byte_buffer = WriteInto(utf16, uchar_max_length); + int byte_buffer_length = static_cast<int>(uchar_max_length); + const char* data = encoded.data(); + int length = static_cast<int>(encoded.length()); + int actual_size = 0; + if (offset_for_adjustment) { + if (*offset_for_adjustment >= encoded.length()) { + *offset_for_adjustment = string16::npos; + } else if (*offset_for_adjustment != 0) { + // Try to adjust the offset by converting the string in two pieces and + // using the length of the first piece as the adjusted offset. + actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, + data, static_cast<int>(*offset_for_adjustment), &status); + if (U_SUCCESS(status)) { + // Conversion succeeded, so update the offset and then fall through to + // appending the second half of the string. + data += *offset_for_adjustment; + length -= *offset_for_adjustment; + *offset_for_adjustment = actual_size; + byte_buffer += actual_size; + byte_buffer_length -= actual_size; + } else { + // The offset may have been in the middle of an encoding sequence; mark + // it as having failed to adjust and then try to convert the entire + // string. + *offset_for_adjustment = string16::npos; + actual_size = 0; + ucnv_reset(converter); + status = U_ZERO_ERROR; + } + } + } + actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, data, + length, &status); + ucnv_close(converter); + if (!U_SUCCESS(status)) { + utf16->clear(); // Make sure the output is empty on error. + return false; + } + + utf16->resize(actual_size); + return true; +} + // Convert a wstring into the specified codepage_name. If the codepage // isn't found, return false. bool WideToCodepage(const std::wstring& wide, @@ -188,31 +272,16 @@ bool WideToCodepage(const std::wstring& wide, #endif // defined(WCHAR_T_IS_UTF32) } -// Convert a UTF-16 string into the specified codepage_name. If the codepage -// isn't found, return false. -bool UTF16ToCodepage(const string16& utf16, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::string* encoded) { - encoded->clear(); - - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(codepage_name, &status); - if (!U_SUCCESS(status)) - return false; - - return ConvertFromUTF16(converter, utf16.c_str(), - static_cast<int>(utf16.length()), on_error, encoded); -} - // Converts a string of the given codepage into wstring. // If the codepage isn't found, return false. -bool CodepageToWide(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::wstring* wide) { +bool CodepageToWideAndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide, + size_t* offset_for_adjustment) { #if defined(WCHAR_T_IS_UTF16) - return CodepageToUTF16(encoded, codepage_name, on_error, wide); + return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, wide, + offset_for_adjustment); #elif defined(WCHAR_T_IS_UTF32) wide->clear(); @@ -227,70 +296,53 @@ bool CodepageToWide(const std::string& encoded, // this can be 4 times larger than actually needed. size_t wchar_max_length = encoded.length() + 1; - // The byte buffer and its length to pass to ucnv_toAlgorithimic. - char* byte_buffer = reinterpret_cast<char*>( - WriteInto(wide, wchar_max_length)); - int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; - SetUpErrorHandlerForToUChars(on_error, converter, &status); - int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), - converter, - byte_buffer, - byte_buffer_length, - encoded.data(), - static_cast<int>(encoded.length()), - &status); + char* byte_buffer = + reinterpret_cast<char*>(WriteInto(wide, wchar_max_length)); + int byte_buffer_length = static_cast<int>(wchar_max_length) * sizeof(wchar_t); + const char* data = encoded.data(); + int length = static_cast<int>(encoded.length()); + int actual_size = 0; + if (offset_for_adjustment) { + if (*offset_for_adjustment >= encoded.length()) { + *offset_for_adjustment = std::wstring::npos; + } else if (*offset_for_adjustment != 0) { + // Try to adjust the offset by converting the string in two pieces and + // using the length of the first piece as the adjusted offset. + actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, + byte_buffer, byte_buffer_length, data, + static_cast<int>(*offset_for_adjustment), &status); + if (U_SUCCESS(status)) { + // Conversion succeeded, so update the offset and then fall through to + // appending the second half of the string. + data += *offset_for_adjustment; + length -= *offset_for_adjustment; + *offset_for_adjustment = actual_size / sizeof(wchar_t); + byte_buffer += actual_size; + byte_buffer_length -= actual_size; + } else { + // The offset may have been in the middle of an encoding sequence; mark + // it as having failed to adjust and then try to convert the entire + // string. + *offset_for_adjustment = std::wstring::npos; + actual_size = 0; + ucnv_reset(converter); + status = U_ZERO_ERROR; + } + } + } + actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, + byte_buffer, byte_buffer_length, data, length, &status); ucnv_close(converter); - if (!U_SUCCESS(status)) { wide->clear(); // Make sure the output is empty on error. return false; } // actual_size is # of bytes. - wide->resize(actual_size / 4); + wide->resize(actual_size / sizeof(wchar_t)); return true; #endif // defined(WCHAR_T_IS_UTF32) } -// Converts a string of the given codepage into UTF-16. -// If the codepage isn't found, return false. -bool CodepageToUTF16(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - string16* utf16) { - utf16->clear(); - - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(codepage_name, &status); - if (!U_SUCCESS(status)) - return false; - - // Even in the worst case, the maximum length in 2-byte units of UTF-16 - // output would be at most the same as the number of bytes in input. There - // is no single-byte encoding in which a character is mapped to a - // non-BMP character requiring two 2-byte units. - // - // Moreover, non-BMP characters in legacy multibyte encodings - // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are - // BOCU and SCSU, but we don't care about them. - size_t uchar_max_length = encoded.length() + 1; - - SetUpErrorHandlerForToUChars(on_error, converter, &status); - int actual_size = ucnv_toUChars(converter, - WriteInto(utf16, uchar_max_length), - static_cast<int>(uchar_max_length), - encoded.data(), - static_cast<int>(encoded.length()), - &status); - ucnv_close(converter); - if (!U_SUCCESS(status)) { - utf16->clear(); // Make sure the output is empty on error. - return false; - } - - utf16->resize(actual_size); - return true; -} - } // namespace base |