summaryrefslogtreecommitdiffstats
path: root/webkit
diff options
context:
space:
mode:
Diffstat (limited to 'webkit')
-rw-r--r--webkit/pending/TextCodecMac.cpp321
1 files changed, 321 insertions, 0 deletions
diff --git a/webkit/pending/TextCodecMac.cpp b/webkit/pending/TextCodecMac.cpp
new file mode 100644
index 0000000..b55516f
--- /dev/null
+++ b/webkit/pending/TextCodecMac.cpp
@@ -0,0 +1,321 @@
+/*
+ * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextCodecMac.h"
+
+#include "CString.h"
+#include "CharacterNames.h"
+#include "CharsetData.h"
+#include "PlatformString.h"
+#include <wtf/Assertions.h>
+
+using std::auto_ptr;
+using std::min;
+
+namespace WebCore {
+
+// We need to keep this because ICU doesn't support some of the encodings that we need:
+// <http://bugs.webkit.org/show_bug.cgi?id=4195>.
+
+const size_t ConversionBufferSize = 16384;
+
+static TECObjectRef cachedConverterTEC;
+static TECTextEncodingID cachedConverterEncoding = invalidEncoding;
+
+void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar)
+{
+ TECTextEncodingID lastEncoding = invalidEncoding;
+ const char* lastName = 0;
+
+ for (size_t i = 0; CharsetTable[i].name; ++i) {
+ if (CharsetTable[i].encoding != lastEncoding) {
+ lastEncoding = CharsetTable[i].encoding;
+ lastName = CharsetTable[i].name;
+ }
+ registrar(CharsetTable[i].name, lastName);
+ }
+}
+
+static auto_ptr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData)
+{
+ return auto_ptr<TextCodec>(new TextCodecMac(*static_cast<const TECTextEncodingID*>(additionalData)));
+}
+
+void TextCodecMac::registerCodecs(TextCodecRegistrar registrar)
+{
+ TECTextEncodingID lastEncoding = invalidEncoding;
+
+ for (size_t i = 0; CharsetTable[i].name; ++i)
+ if (CharsetTable[i].encoding != lastEncoding) {
+ registrar(CharsetTable[i].name, newTextCodecMac, &CharsetTable[i].encoding);
+ lastEncoding = CharsetTable[i].encoding;
+ }
+}
+
+TextCodecMac::TextCodecMac(TECTextEncodingID encoding)
+ : m_encoding(encoding)
+ , m_error(false)
+ , m_numBufferedBytes(0)
+ , m_converterTEC(0)
+{
+}
+
+TextCodecMac::~TextCodecMac()
+{
+ releaseTECConverter();
+}
+
+void TextCodecMac::releaseTECConverter() const
+{
+ if (m_converterTEC) {
+ if (cachedConverterTEC != 0)
+ TECDisposeConverter(cachedConverterTEC);
+ cachedConverterTEC = m_converterTEC;
+ cachedConverterEncoding = m_encoding;
+ m_converterTEC = 0;
+ }
+}
+
+OSStatus TextCodecMac::createTECConverter() const
+{
+ bool cachedEncodingEqual = cachedConverterEncoding == m_encoding;
+ cachedConverterEncoding = invalidEncoding;
+
+ if (cachedEncodingEqual && cachedConverterTEC) {
+ m_converterTEC = cachedConverterTEC;
+ cachedConverterTEC = 0;
+ TECClearConverterContextInfo(m_converterTEC);
+ } else {
+ OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding,
+ CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat));
+ if (status)
+ return status;
+
+ TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask);
+ }
+
+ return noErr;
+}
+
+OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength,
+ void *outputBuffer, int outputBufferLength, int& outputLength)
+{
+ OSStatus status;
+ unsigned long bytesRead = 0;
+ unsigned long bytesWritten = 0;
+
+ if (m_numBufferedBytes != 0) {
+ // Finish converting a partial character that's in our buffer.
+
+ // First, fill the partial character buffer with as many bytes as are available.
+ ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes));
+ const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes;
+ const int bytesToPutInBuffer = MIN(spaceInBuffer, inputBufferLength);
+ ASSERT(bytesToPutInBuffer != 0);
+ memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer);
+
+ // Now, do a conversion on the buffer.
+ status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead,
+ reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
+ ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer);
+
+ if (status == kTECPartialCharErr && bytesRead == 0) {
+ // Handle the case where the partial character was not converted.
+ if (bytesToPutInBuffer >= spaceInBuffer) {
+ LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes));
+ m_numBufferedBytes = 0;
+ status = kTECUnmappableElementErr; // should never happen, but use this error code
+ } else {
+ // Tell the caller we read all the source bytes and keep them in the buffer.
+ m_numBufferedBytes += bytesToPutInBuffer;
+ bytesRead = bytesToPutInBuffer;
+ status = noErr;
+ }
+ } else {
+ // We are done with the partial character buffer.
+ // Also, we have read some of the bytes from the main buffer.
+ if (bytesRead > m_numBufferedBytes) {
+ bytesRead -= m_numBufferedBytes;
+ } else {
+ LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr");
+ bytesRead = 0;
+ }
+ m_numBufferedBytes = 0;
+ if (status == kTECPartialCharErr) {
+ // While there may be a partial character problem in the small buffer,
+ // we have to try again and not get confused and think there is a partial
+ // character problem in the large buffer.
+ status = noErr;
+ }
+ }
+ } else {
+ status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead,
+ static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
+ ASSERT(static_cast<int>(bytesRead) <= inputBufferLength);
+ }
+
+ // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus.
+ if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0) {
+ status = kTECOutputBufferFullStatus;
+ }
+
+ inputLength = bytesRead;
+ outputLength = bytesWritten;
+ return status;
+}
+
+String TextCodecMac::decode(const char* bytes, size_t length, bool flush)
+{
+ // Get a converter for the passed-in encoding.
+ if (!m_converterTEC && createTECConverter() != noErr)
+ return String();
+
+ Vector<UChar> result;
+
+ const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes);
+ int sourceLength = length;
+ bool bufferWasFull = false;
+ UniChar buffer[ConversionBufferSize];
+
+ while (sourceLength || bufferWasFull) {
+ int bytesRead = 0;
+ int bytesWritten = 0;
+ OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
+ ASSERT(bytesRead <= sourceLength);
+ sourcePointer += bytesRead;
+ sourceLength -= bytesRead;
+
+ switch (status) {
+ case noErr:
+ case kTECOutputBufferFullStatus:
+ break;
+ case kTextMalformedInputErr:
+ case kTextUndefinedElementErr:
+ // FIXME: Put FFFD character into the output string in this case?
+ TECClearConverterContextInfo(m_converterTEC);
+ if (sourceLength) {
+ sourcePointer += 1;
+ sourceLength -= 1;
+ }
+ break;
+ case kTECPartialCharErr: {
+ // Put the partial character into the buffer.
+ ASSERT(m_numBufferedBytes == 0);
+ const int bufferSize = sizeof(m_numBufferedBytes);
+ if (sourceLength < bufferSize) {
+ memcpy(m_bufferedBytes, sourcePointer, sourceLength);
+ m_numBufferedBytes = sourceLength;
+ } else {
+ LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength);
+ }
+ sourceLength = 0;
+ break;
+ }
+ default:
+ LOG_ERROR("text decoding failed with error %ld", static_cast<long>(status));
+ m_error = true;
+ return String();
+ }
+
+ ASSERT(!(bytesWritten % sizeof(UChar)));
+ result.append(buffer, bytesWritten / sizeof(UChar));
+
+ bufferWasFull = status == kTECOutputBufferFullStatus;
+ }
+
+ if (flush) {
+ unsigned long bytesWritten = 0;
+ TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
+ ASSERT(!(bytesWritten % sizeof(UChar)));
+ result.append(buffer, bytesWritten / sizeof(UChar));
+ }
+
+ String resultString = String::adopt(result);
+
+ // <rdar://problem/3225472>
+ // Simplified Chinese pages use the code A3A0 to mean "full-width space".
+ // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
+ // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space).
+ if (m_encoding == kCFStringEncodingGB_18030_2000)
+ resultString.replace(0xE5E5, ideographicSpace);
+
+ return resultString;
+}
+
+CString TextCodecMac::encode(const UChar* characters, size_t length, bool allowEntities)
+{
+ // FIXME: We should really use TEC here instead of CFString for consistency with the other direction.
+
+ // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
+ // Encoding will change the yen sign back into a backslash.
+ String copy(characters, length);
+ copy.replace('\\', m_backslashAsCurrencySymbol);
+ CFStringRef cfs = copy.createCFString();
+
+ CFIndex startPos = 0;
+ CFIndex charactersLeft = CFStringGetLength(cfs);
+ Vector<char> result;
+ size_t size = 0;
+ UInt8 lossByte = allowEntities ? 0 : '?';
+ while (charactersLeft > 0) {
+ CFRange range = CFRangeMake(startPos, charactersLeft);
+ CFIndex bufferLength;
+ CFStringGetBytes(cfs, range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength);
+
+ result.grow(size + bufferLength);
+ unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size);
+ CFIndex charactersConverted = CFStringGetBytes(cfs, range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength);
+ size += bufferLength;
+
+ if (charactersConverted != charactersLeft) {
+ unsigned badChar = CFStringGetCharacterAtIndex(cfs, startPos + charactersConverted);
+ ++charactersConverted;
+ if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate
+ UniChar low = CFStringGetCharacterAtIndex(cfs, startPos + charactersConverted);
+ if ((low & 0xFC00) == 0xDC00) { // is low surrogate
+ badChar <<= 10;
+ badChar += low;
+ badChar += 0x10000 - (0xD800 << 10) - 0xDC00;
+ ++charactersConverted;
+ }
+ }
+ char entityBuffer[16];
+ sprintf(entityBuffer, "&#%u;", badChar);
+ size_t entityLength = strlen(entityBuffer);
+ result.grow(size + entityLength);
+ memcpy(result.data() + size, entityBuffer, entityLength);
+ size += entityLength;
+ }
+
+ startPos += charactersConverted;
+ charactersLeft -= charactersConverted;
+ }
+ CFRelease(cfs);
+ return CString(result.data(), size);
+}
+
+} // namespace WebCore