1 files changed, 128 insertions, 76 deletions
diff --git a/base/i18n/icu_string_conversions.cc b/base/i18n/icu_string_conversions.cc
index ba9f9ae..c93b103 100644
--- a/base/i18n/icu_string_conversions.cc
+++ b/base/i18n/icu_string_conversions.cc
@@ -157,6 +157,90 @@ const char kCodepageUTF16LE[] = "UTF-16LE";
 
 // Codepage <-> Wide/UTF-16  ---------------------------------------------------
 
+// Converts |utf16| to the |codepage_name| encoding; |encoded| is cleared first.
+// Returns false if ucnv_open() fails, else the result of ConvertFromUTF16().
+bool UTF16ToCodepage(const string16& utf16,
+                     const char* codepage_name,
+                     OnStringConversionError::Type on_error,
+                     std::string* encoded) {
+  encoded->clear();
+
+  UErrorCode status = U_ZERO_ERROR;
+  UConverter* converter = ucnv_open(codepage_name, &status);
+  if (!U_SUCCESS(status))
+    return false;
+
+  return ConvertFromUTF16(converter, utf16.c_str(),
+                          static_cast<int>(utf16.length()), on_error, encoded);
+}
+
+bool CodepageToUTF16AndAdjustOffset(const std::string& encoded,
+                                    const char* codepage_name,
+                                    OnStringConversionError::Type on_error,
+                                    string16* utf16,
+                                    size_t* offset_for_adjustment) {
+  utf16->clear();
+
+  UErrorCode status = U_ZERO_ERROR;
+  UConverter* converter = ucnv_open(codepage_name, &status);
+  if (!U_SUCCESS(status))
+    return false;
+
+  // The output buffer is sized at one UTF-16 unit per input byte, plus one.
+  // Even in the worst case the output cannot need more 2-byte units than the
+  // input has bytes: no single-byte encoding maps a character to a non-BMP
+  // character, which would require two 2-byte units.
+  //
+  // Moreover, non-BMP characters in legacy multibyte encodings
+  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
+  // BOCU and SCSU, but we don't care about them.
+  size_t uchar_max_length = encoded.length() + 1;
+
+  SetUpErrorHandlerForToUChars(on_error, converter, &status);
+  char16* byte_buffer = WriteInto(utf16, uchar_max_length);
+  int byte_buffer_length = static_cast<int>(uchar_max_length);
+  const char* data = encoded.data();
+  int length = static_cast<int>(encoded.length());
+  int actual_size = 0;
+  if (offset_for_adjustment) {
+    if (*offset_for_adjustment >= encoded.length()) {
+      *offset_for_adjustment = string16::npos;
+    } else if (*offset_for_adjustment != 0) {
+      // Convert only the first |*offset_for_adjustment| bytes; the number of
+      // UTF-16 units they produce is the offset into the converted string.
+      actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length,
+          data, static_cast<int>(*offset_for_adjustment), &status);
+      if (U_SUCCESS(status)) {
+        // Prefix converted: step past it in input and output and store its
+        // converted length as the offset; the tail is appended below.
+        data += *offset_for_adjustment;
+        length -= *offset_for_adjustment;
+        *offset_for_adjustment = actual_size;
+        byte_buffer += actual_size;
+        byte_buffer_length -= actual_size;
+      } else {
+        // The offset may have split a multi-byte sequence: report npos, reset
+        // the converter and status, and convert the whole string in one pass.
+        // NOTE(review): confirm lenient |on_error| modes still fail here.
+        *offset_for_adjustment = string16::npos;
+        actual_size = 0;
+        ucnv_reset(converter);
+        status = U_ZERO_ERROR;
+      }
+    }
+  }
+  actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, data,
+                               length, &status);
+  ucnv_close(converter);
+  if (!U_SUCCESS(status)) {
+    utf16->clear();  // Make sure the output is empty on error.
+    return false;
+  }
+
+  utf16->resize(actual_size);
+  return true;
+}
+
 // Convert a wstring into the specified codepage_name.  If the codepage
 // isn't found, return false.
 bool WideToCodepage(const std::wstring& wide,
@@ -188,31 +272,16 @@ bool WideToCodepage(const std::wstring& wide,
 #endif  // defined(WCHAR_T_IS_UTF32)
 }
 
-// Convert a UTF-16 string into the specified codepage_name.  If the codepage
-// isn't found, return false.
-bool UTF16ToCodepage(const string16& utf16,
-                    const char* codepage_name,
-                    OnStringConversionError::Type on_error,
-                    std::string* encoded) {
-  encoded->clear();
-
-  UErrorCode status = U_ZERO_ERROR;
-  UConverter* converter = ucnv_open(codepage_name, &status);
-  if (!U_SUCCESS(status))
-    return false;
-
-  return ConvertFromUTF16(converter, utf16.c_str(),
-                          static_cast<int>(utf16.length()), on_error, encoded);
-}
-
 // Converts a string of the given codepage into wstring.
 // If the codepage isn't found, return false.
-bool CodepageToWide(const std::string& encoded,
-                    const char* codepage_name,
-                    OnStringConversionError::Type on_error,
-                    std::wstring* wide) {
+bool CodepageToWideAndAdjustOffset(const std::string& encoded,
+                                   const char* codepage_name,
+                                   OnStringConversionError::Type on_error,
+                                   std::wstring* wide,
+                                   size_t* offset_for_adjustment) {
 #if defined(WCHAR_T_IS_UTF16)
-  return CodepageToUTF16(encoded, codepage_name, on_error, wide);
+  return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, wide,
+                                        offset_for_adjustment);
 #elif defined(WCHAR_T_IS_UTF32)
   wide->clear();
 
@@ -227,70 +296,53 @@ bool CodepageToWide(const std::string& encoded,
   // this can be 4 times larger than actually needed.
   size_t wchar_max_length = encoded.length() + 1;
 
-  // The byte buffer and its length to pass to ucnv_toAlgorithimic.
-  char* byte_buffer = reinterpret_cast<char*>(
-      WriteInto(wide, wchar_max_length));
-  int byte_buffer_length = static_cast<int>(wchar_max_length) * 4;
-
   SetUpErrorHandlerForToUChars(on_error, converter, &status);
-  int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(),
-                                       converter,
-                                       byte_buffer,
-                                       byte_buffer_length,
-                                       encoded.data(),
-                                       static_cast<int>(encoded.length()),
-                                       &status);
+  char* byte_buffer =
+      reinterpret_cast<char*>(WriteInto(wide, wchar_max_length));
+  int byte_buffer_length = static_cast<int>(wchar_max_length) * sizeof(wchar_t);
+  const char* data = encoded.data();
+  int length = static_cast<int>(encoded.length());
+  int actual_size = 0;
+  if (offset_for_adjustment) {
+    if (*offset_for_adjustment >= encoded.length()) {
+      *offset_for_adjustment = std::wstring::npos;
+    } else if (*offset_for_adjustment != 0) {
+      // Try to adjust the offset by converting the string in two pieces and
+      // using the length of the first piece as the adjusted offset.
+      actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter,
+          byte_buffer, byte_buffer_length, data,
+          static_cast<int>(*offset_for_adjustment), &status);
+      if (U_SUCCESS(status)) {
+        // Conversion succeeded, so update the offset and then fall through to
+        // appending the second half of the string.
+        data += *offset_for_adjustment;
+        length -= *offset_for_adjustment;
+        *offset_for_adjustment = actual_size / sizeof(wchar_t);
+        byte_buffer += actual_size;
+        byte_buffer_length -= actual_size;
+      } else {
+        // The offset may have been in the middle of an encoding sequence; mark
+        // it as having failed to adjust and then try to convert the entire
+        // string.
+        *offset_for_adjustment = std::wstring::npos;
+        actual_size = 0;
+        ucnv_reset(converter);
+        status = U_ZERO_ERROR;
+      }
+    }
+  }
+  actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter,
+      byte_buffer, byte_buffer_length, data, length, &status);
   ucnv_close(converter);
-
   if (!U_SUCCESS(status)) {
     wide->clear();  // Make sure the output is empty on error.
     return false;
   }
 
   // actual_size is # of bytes.
-  wide->resize(actual_size / 4);
+  wide->resize(actual_size / sizeof(wchar_t));
   return true;
 #endif  // defined(WCHAR_T_IS_UTF32)
 }
 
-// Converts a string of the given codepage into UTF-16.
-// If the codepage isn't found, return false.
-bool CodepageToUTF16(const std::string& encoded,
-                     const char* codepage_name,
-                     OnStringConversionError::Type on_error,
-                     string16* utf16) {
-  utf16->clear();
-
-  UErrorCode status = U_ZERO_ERROR;
-  UConverter* converter = ucnv_open(codepage_name, &status);
-  if (!U_SUCCESS(status))
-    return false;
-
-  // Even in the worst case, the maximum length in 2-byte units of UTF-16
-  // output would be at most the same as the number of bytes in input. There
-  // is no single-byte encoding in which a character is mapped to a
-  // non-BMP character requiring two 2-byte units.
-  //
-  // Moreover, non-BMP characters in legacy multibyte encodings
-  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
-  // BOCU and SCSU, but we don't care about them.
-  size_t uchar_max_length = encoded.length() + 1;
-
-  SetUpErrorHandlerForToUChars(on_error, converter, &status);
-  int actual_size = ucnv_toUChars(converter,
-                                  WriteInto(utf16, uchar_max_length),
-                                  static_cast<int>(uchar_max_length),
-                                  encoded.data(),
-                                  static_cast<int>(encoded.length()),
-                                  &status);
-  ucnv_close(converter);
-  if (!U_SUCCESS(status)) {
-    utf16->clear();  // Make sure the output is empty on error.
-    return false;
-  }
-
-  utf16->resize(actual_size);
-  return true;
-}
-
 }  // namespace base