Define a variable to distinguish system_icu from bundled_icu

Most of the encoding name aliases manually added in TextCodecICU are present in Chrome's copy of ICU. So, they don't have to be added when Chrome's ICU is used. Define 'USING_SYSTEM_ICU' when 'use_system_icu=1' and register the above encoding name aliases only when 'USING_SYSTEM_ICU' is set. In addition, the following was done: * Remove GBK aliases not specified in the encoding spec. * Add tests for GBK and EUC-KR aliases in the spec that have not been tested before. * Add two aliases for ISO-8859-8-I regardless of whether bundled or system ICU is used. * Remove \xA3\xA0 => U+3000 override in GBK and GB18030 because Chrome's copy of ICU already has that with https://codereview.chromium.org/1162723008/ * Remove gbkFallback for 4 code points when bundled ICU is used for the same reason as above. * Remove one-way mapping from Unicode for U+22EF and U+301C even with system_icu because Chinese IMEs on Mac do not produce them. (They were added to WebKit to be compatible with an old Mac converter, but they're not specified in the encoding spec.) BUG=493824 TEST=Layout test fast/encoding/* Review URL: https://codereview.chromium.org/1167523003 git-svn-id: svn://svn.chromium.org/blink/trunk@196607 bbb929c8-8fbe-4397-9dbb-9b2b20218538
author: jshin@chromium.org <jshin@chromium.org> 2015-06-05 22:48:02 +0000
committer: jshin@chromium.org <jshin@chromium.org> 2015-06-05 22:48:02 +0000
commit: 52692952ee4c963268d9c1020bc637aff6029e0e (patch)
tree: c51c677f565574eba2e91e42ff30457688471de2
parent: f62c0204c635fb6acee45fad2582dc4cd48d844f (diff)
download: chromium_src-52692952ee4c963268d9c1020bc637aff6029e0e.zip
chromium_src-52692952ee4c963268d9c1020bc637aff6029e0e.tar.gz
chromium_src-52692952ee4c963268d9c1020bc637aff6029e0e.tar.bz2
15 files changed, 161 insertions, 97 deletions
diff --git a/third_party/WebKit/LayoutTests/fast/encoding/GBK/cn-gb-expected.txt b/third_party/WebKit/LayoutTests/fast/encoding/GBK/cn-gb-expected.txt
deleted file mode 100644
index ec39154..0000000
--- a/third_party/WebKit/LayoutTests/fast/encoding/GBK/cn-gb-expected.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-The following two lines should look identically:
-
-ä¸€ç†â‚¬é••
-
-ä¸€ç†â‚¬é••
diff --git a/third_party/WebKit/LayoutTests/fast/encoding/GBK/cn-gb.html b/third_party/WebKit/LayoutTests/fast/encoding/GBK/cn-gb.html
deleted file mode 100644
index d7e2f49..0000000
--- a/third_party/WebKit/LayoutTests/fast/encoding/GBK/cn-gb.html
+++ /dev/null
@@ -1,10 +0,0 @@
-<head>
-    <meta content="text/html; charset=cn-gb" http-equiv="Content-Type"/>
-</head>
-<script>
-if (window.testRunner)
-    testRunner.dumpAsText();
-</script>
-<p>The following two lines should look identically:</p>
-<p>Ò»¹P€éF</p>
-<p>&#19968;&#31558;&euro;&#38229;</p>
diff --git a/third_party/WebKit/LayoutTests/fast/encoding/GBK/csgb231280-expected.txt b/third_party/WebKit/LayoutTests/fast/encoding/GBK/csgb231280-expected.txt
deleted file mode 100644
index ec39154..0000000
--- a/third_party/WebKit/LayoutTests/fast/encoding/GBK/csgb231280-expected.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-The following two lines should look identically:
-
-ä¸€ç†â‚¬é••
-
-ä¸€ç†â‚¬é••
diff --git a/third_party/WebKit/LayoutTests/fast/encoding/GBK/csgb231280.html b/third_party/WebKit/LayoutTests/fast/encoding/GBK/csgb231280.html
deleted file mode 100644
index 1cbc739..0000000
--- a/third_party/WebKit/LayoutTests/fast/encoding/GBK/csgb231280.html
+++ /dev/null
@@ -1,10 +0,0 @@
-<head>
-    <meta content="text/html; charset=csgb231280" http-equiv="Content-Type"/>
-</head>
-<script>
-if (window.testRunner)
-    testRunner.dumpAsText();
-</script>
-<p>The following two lines should look identically:</p>
-<p>Ò»¹P€éF</p>
-<p>&#19968;&#31558;&euro;&#38229;</p>
diff --git a/third_party/WebKit/LayoutTests/fast/encoding/GBK/x-euc-cn-expected.txt b/third_party/WebKit/LayoutTests/fast/encoding/GBK/x-euc-cn-expected.txt
deleted file mode 100644
index ec39154..0000000
--- a/third_party/WebKit/LayoutTests/fast/encoding/GBK/x-euc-cn-expected.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-The following two lines should look identically:
-
-ä¸€ç†â‚¬é••
-
-ä¸€ç†â‚¬é••
diff --git a/third_party/WebKit/LayoutTests/fast/encoding/GBK/x-euc-cn.html b/third_party/WebKit/LayoutTests/fast/encoding/GBK/x-euc-cn.html
deleted file mode 100644
index 2024cd2..0000000
--- a/third_party/WebKit/LayoutTests/fast/encoding/GBK/x-euc-cn.html
+++ /dev/null
@@ -1,10 +0,0 @@
-<head>
-    <meta content="text/html; charset=x-euc-cn" http-equiv="Content-Type"/>
-</head>
-<script>
-if (window.testRunner)
-    testRunner.dumpAsText();
-</script>
-<p>The following two lines should look identically:</p>
-<p>Ò»¹P€éF</p>
-<p>&#19968;&#31558;&euro;&#38229;</p>
diff --git a/third_party/WebKit/LayoutTests/fast/encoding/char-decoding-expected.txt b/third_party/WebKit/LayoutTests/fast/encoding/char-decoding-expected.txt
index 153f8e6..2d57ec0 100644
--- a/third_party/WebKit/LayoutTests/fast/encoding/char-decoding-expected.txt
+++ b/third_party/WebKit/LayoutTests/fast/encoding/char-decoding-expected.txt
@@ -6,13 +6,31 @@ On success, you will see a series of "PASS" messages, followed by "TEST COMPLETE
 
 PASS decode('UTF-8', '%E2%88%9A') is 'U+221A'
 PASS decode('gb2312', '%A3%A0') is 'U+3000'
+PASS decode('gb_2312', '%A3%A0') is 'U+3000'
 PASS decode('gb_2312-80', '%A3%A0') is 'U+3000'
+PASS decode('csgb2312', '%A3%A0') is 'U+3000'
+PASS decode('iso-ir-58', '%A3%A0') is 'U+3000'
+PASS decode('csiso58gb231280', '%A3%A0') is 'U+3000'
 PASS decode('chinese', '%A3%A0') is 'U+3000'
 PASS decode('gbk', '%A3%A0') is 'U+3000'
+PASS decode('x-gbk', '%A3%A0') is 'U+3000'
 PASS decode('gb18030', '%A3%A0') is 'U+3000'
 PASS decode('EUC-CN', '%A3%A0') is 'U+3000'
+PASS decode('gbk', '%A8%BF') is 'U+01F9'
+PASS decode('gbk', '%A1%AD') is 'U+2026'
+PASS decode('gbk', '%A1%AB') is 'U+FF5E'
+PASS decode('gb18030', '%A8%BF') is 'U+01F9'
+PASS decode('gb18030', '%A8%BC') is 'U+1E3F'
+PASS decode('gb18030', '%A1%AD') is 'U+2026'
+PASS decode('gb18030', '%A1%AB') is 'U+FF5E'
+PASS decode('gbk', '%A8%BC') is 'U+E7C7'
 PASS decode('Shift_JIS', '%82%d0') is 'U+3072'
 PASS decode('shift-jis', '%82%d0') is 'U+3072'
+PASS decode('csshiftjis', '%82%d0') is 'U+3072'
+PASS decode('sjis', '%82%d0') is 'U+3072'
+PASS decode('x-sjis', '%82%d0') is 'U+3072'
+PASS decode('ms_kanji', '%82%d0') is 'U+3072'
+PASS decode('windows-31j', '%82%d0') is 'U+3072'
 PASS decode('korean', '%A2%E6') is 'U+20AC'
 PASS decode('korean', '%A1%A4') is 'U+00B7'
 PASS decode('korean', '%A1%A9') is 'U+00AD'
@@ -49,30 +67,30 @@ PASS decode('windows-949', '%1C') is 'U+001C'
 PASS decode('windows-949', '%8F%A1') is 'U+B8EA'
 PASS decode('windows-949', '%B4%D3') is 'U+B2D2'
 PASS decode('windows-949', '%A2%41') is 'U+C910'
-PASS decode('x-windows-949', '%A2%E6') is 'U+20AC'
-PASS decode('x-windows-949', '%A1%A4') is 'U+00B7'
-PASS decode('x-windows-949', '%A1%A9') is 'U+00AD'
-PASS decode('x-windows-949', '%A1%AA') is 'U+2015'
-PASS decode('x-windows-949', '%A1%AD') is 'U+223C'
-PASS decode('x-windows-949', '%A2%A6') is 'U+FF5E'
-PASS decode('x-windows-949', '%A2%C1') is 'U+2299'
-PASS decode('x-windows-949', '%1A') is 'U+001A'
-PASS decode('x-windows-949', '%1C') is 'U+001C'
-PASS decode('x-windows-949', '%8F%A1') is 'U+B8EA'
-PASS decode('x-windows-949', '%B4%D3') is 'U+B2D2'
-PASS decode('x-windows-949', '%A2%41') is 'U+C910'
-PASS decode('x-uhc', '%A2%E6') is 'U+20AC'
-PASS decode('x-uhc', '%A1%A4') is 'U+00B7'
-PASS decode('x-uhc', '%A1%A9') is 'U+00AD'
-PASS decode('x-uhc', '%A1%AA') is 'U+2015'
-PASS decode('x-uhc', '%A1%AD') is 'U+223C'
-PASS decode('x-uhc', '%A2%A6') is 'U+FF5E'
-PASS decode('x-uhc', '%A2%C1') is 'U+2299'
-PASS decode('x-uhc', '%1A') is 'U+001A'
-PASS decode('x-uhc', '%1C') is 'U+001C'
-PASS decode('x-uhc', '%8F%A1') is 'U+B8EA'
-PASS decode('x-uhc', '%B4%D3') is 'U+B2D2'
-PASS decode('x-uhc', '%A2%41') is 'U+C910'
+PASS decode('cseuckr', '%A2%E6') is 'U+20AC'
+PASS decode('cseuckr', '%A1%A4') is 'U+00B7'
+PASS decode('cseuckr', '%A1%A9') is 'U+00AD'
+PASS decode('cseuckr', '%A1%AA') is 'U+2015'
+PASS decode('cseuckr', '%A1%AD') is 'U+223C'
+PASS decode('cseuckr', '%A2%A6') is 'U+FF5E'
+PASS decode('cseuckr', '%A2%C1') is 'U+2299'
+PASS decode('cseuckr', '%1A') is 'U+001A'
+PASS decode('cseuckr', '%1C') is 'U+001C'
+PASS decode('cseuckr', '%8F%A1') is 'U+B8EA'
+PASS decode('cseuckr', '%B4%D3') is 'U+B2D2'
+PASS decode('cseuckr', '%A2%41') is 'U+C910'
+PASS decode('csksc56011987', '%A2%E6') is 'U+20AC'
+PASS decode('csksc56011987', '%A1%A4') is 'U+00B7'
+PASS decode('csksc56011987', '%A1%A9') is 'U+00AD'
+PASS decode('csksc56011987', '%A1%AA') is 'U+2015'
+PASS decode('csksc56011987', '%A1%AD') is 'U+223C'
+PASS decode('csksc56011987', '%A2%A6') is 'U+FF5E'
+PASS decode('csksc56011987', '%A2%C1') is 'U+2299'
+PASS decode('csksc56011987', '%1A') is 'U+001A'
+PASS decode('csksc56011987', '%1C') is 'U+001C'
+PASS decode('csksc56011987', '%8F%A1') is 'U+B8EA'
+PASS decode('csksc56011987', '%B4%D3') is 'U+B2D2'
+PASS decode('csksc56011987', '%A2%41') is 'U+C910'
 PASS decode('iso-ir-149', '%A2%E6') is 'U+20AC'
 PASS decode('iso-ir-149', '%A1%A4') is 'U+00B7'
 PASS decode('iso-ir-149', '%A1%A9') is 'U+00AD'
diff --git a/third_party/WebKit/LayoutTests/fast/encoding/char-decoding.html b/third_party/WebKit/LayoutTests/fast/encoding/char-decoding.html
index 5ac47ad..0a0acad 100644
--- a/third_party/WebKit/LayoutTests/fast/encoding/char-decoding.html
+++ b/third_party/WebKit/LayoutTests/fast/encoding/char-decoding.html
@@ -10,21 +10,44 @@ description("This tests decoding characters in various character sets.");
 
 testDecode('UTF-8', '%E2%88%9A', 'U+221A');
 
-// <http://bugs.webkit.org/show_bug.cgi?id=17014> EUC-CN code A3A0 is mapped to U+E5E5 instead of U+3000
+// \xA3\xA0 in GBK should be mapped to U+3000 instead of U+E5E5.
 testDecode('gb2312', '%A3%A0', 'U+3000');
+testDecode('gb_2312', '%A3%A0', 'U+3000');
 testDecode('gb_2312-80', '%A3%A0', 'U+3000');
+testDecode('csgb2312', '%A3%A0', 'U+3000');
+testDecode('iso-ir-58', '%A3%A0', 'U+3000');
+testDecode('csiso58gb231280', '%A3%A0', 'U+3000');
 testDecode('chinese', '%A3%A0', 'U+3000');
 testDecode('gbk', '%A3%A0', 'U+3000');
+testDecode('x-gbk', '%A3%A0', 'U+3000');
 testDecode('gb18030', '%A3%A0', 'U+3000');
 testDecode('EUC-CN', '%A3%A0', 'U+3000');
 
+// Align GBK with GB18030
+testDecode('gbk', '%A8%BF', 'U+01F9');
+testDecode('gbk', '%A1%AD', 'U+2026');
+testDecode('gbk', '%A1%AB', 'U+FF5E');
+testDecode('gb18030', '%A8%BF', 'U+01F9');
+testDecode('gb18030', '%A8%BC', 'U+1E3F');
+testDecode('gb18030', '%A1%AD', 'U+2026');
+testDecode('gb18030', '%A1%AB', 'U+FF5E');
+
+// Replace U+E7C7 with U+1E3F once
+// https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3 is resolved.
+testDecode('gbk', '%A8%BC', 'U+E7C7');
+
 // Test Shift_JIS aliases.
 testDecode('Shift_JIS', '%82%d0', 'U+3072');
 testDecode('shift-jis', '%82%d0', 'U+3072');
+testDecode('csshiftjis', '%82%d0', 'U+3072');
+testDecode('sjis', '%82%d0', 'U+3072');
+testDecode('x-sjis', '%82%d0', 'U+3072');
+testDecode('ms_kanji', '%82%d0', 'U+3072');
+testDecode('windows-31j', '%82%d0', 'U+3072');
 
 // Test that all Korean encodings of EUC-KR family are treated as windows-949.
 var korean = {
-    encodings: ['korean', 'EUC-KR', 'windows-949', 'x-windows-949', 'x-uhc',
+    encodings: ['korean', 'EUC-KR', 'windows-949', 'cseuckr', 'csksc56011987',
                 'iso-ir-149', 'KS_C_5601-1987', 'KS_C_5601-1989',
                 'KSC5601', 'KSC_5601'],
     encoded: ['%A2%E6', '%A1%A4', '%A1%A9', '%A1%AA', '%A1%AD', '%A2%A6',
diff --git a/third_party/WebKit/LayoutTests/fast/encoding/char-encoding-expected.txt b/third_party/WebKit/LayoutTests/fast/encoding/char-encoding-expected.txt
index fd5db76..4f7c949 100644
--- a/third_party/WebKit/LayoutTests/fast/encoding/char-encoding-expected.txt
+++ b/third_party/WebKit/LayoutTests/fast/encoding/char-encoding-expected.txt
@@ -19,8 +19,14 @@ PASS encode('GB_2312-80', 'U+20AC') is '%80'
 PASS encode('EUC-CN', 'U+20AC') is '%80'
 PASS encode('GBK', 'U+01F9') is '%A8%BF'
 PASS encode('GBK', 'U+1E3F') is '%A8%BC'
-PASS encode('GBK', 'U+22EF') is '%A1%AD'
-PASS encode('GBK', 'U+301C') is '%A1%AB'
+PASS encode('gb18030', 'U+01F9') is '%A8%BF'
+PASS encode('gb18030', 'U+1E3F') is '%A8%BC'
+PASS encode('GBK', 'U+2026') is '%A1%AD'
+PASS encode('GBK', 'U+FF5E') is '%A1%AB'
+PASS encode('gb18030', 'U+2026') is '%A1%AD'
+PASS encode('gb18030', 'U+FF5E') is '%A1%AB'
+PASS encode('GBK', 'U+22EF') is '%26%238943%3B'
+PASS encode('GBK', 'U+301C') is '%26%2312316%3B'
 PASS encode('csiso2022kr', 'U+00A0') is '%C2%A0'
 PASS encode('hz-gb-2312', 'U+00A0') is '%C2%A0'
 PASS encode('iso-2022-cn', 'U+00A0') is '%C2%A0'
diff --git a/third_party/WebKit/LayoutTests/fast/encoding/char-encoding.html b/third_party/WebKit/LayoutTests/fast/encoding/char-encoding.html
index bb34346..3c13100 100644
--- a/third_party/WebKit/LayoutTests/fast/encoding/char-encoding.html
+++ b/third_party/WebKit/LayoutTests/fast/encoding/char-encoding.html
@@ -34,11 +34,19 @@ testEncode('GBK', 'U+20AC', '%80');
 testEncode('gb2312', 'U+20AC', '%80');
 testEncode('GB_2312-80', 'U+20AC', '%80');
 testEncode('EUC-CN', 'U+20AC', '%80');
-//Misc symbols from TEC specific GBK translation
+//Align GBK with gb18030
+// See https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3
 testEncode('GBK', 'U+01F9', '%A8%BF');
 testEncode('GBK', 'U+1E3F', '%A8%BC');
-testEncode('GBK', 'U+22EF', '%A1%AD');
-testEncode('GBK', 'U+301C', '%A1%AB');
+testEncode('gb18030', 'U+01F9', '%A8%BF');
+testEncode('gb18030', 'U+1E3F', '%A8%BC');
+testEncode('GBK', 'U+2026', '%A1%AD');
+testEncode('GBK', 'U+FF5E', '%A1%AB');
+testEncode('gb18030', 'U+2026', '%A1%AD');
+testEncode('gb18030', 'U+FF5E', '%A1%AB');
+// GBK does not cover these two characters.
+testEncode('GBK', 'U+22EF', '%26%238943%3B');
+testEncode('GBK', 'U+301C', '%26%2312316%3B');
 
 // Replacement encodings - should encode as UTF-8
 testEncode("csiso2022kr", "U+00A0", "%C2%A0");
diff --git a/third_party/WebKit/Source/BUILD.gn b/third_party/WebKit/Source/BUILD.gn
index 30cafc1..4db0db7 100644
--- a/third_party/WebKit/Source/BUILD.gn
+++ b/third_party/WebKit/Source/BUILD.gn
@@ -67,6 +67,11 @@ config("config") {
       #'cflags': ['<!@(../../../tools/clang/scripts/blink_gc_plugin_flags.py enable-oilpan=<(enable_oilpan) dump-graph=<(blink_gc_plugin_dump_graph))'],
     }
   }
+
+  ## TODO(GYP) : gn does not yet support use_system_icu.
+  #if (use_system_icu) {
+  #  defines += [ "USING_SYSTEM_ICU" ]
+  #}
 }
 
 # The follow configs apply to all targets except for unit tests, which rely on
diff --git a/third_party/WebKit/Source/config.gyp b/third_party/WebKit/Source/config.gyp
index 8a801ad..8bfb8ab 100644
--- a/third_party/WebKit/Source/config.gyp
+++ b/third_party/WebKit/Source/config.gyp
@@ -117,6 +117,11 @@
             'MEMORY_TOOL_REPLACES_ALLOCATOR',
           ],
         }],
+        ['use_system_icu==1', {
+          'defines': [
+            'USING_SYSTEM_ICU',
+          ],
+        }],
       ],
     },
   },
diff --git a/third_party/WebKit/Source/wtf/text/TextCodecICU.cpp b/third_party/WebKit/Source/wtf/text/TextCodecICU.cpp
index 15beef7..f4460b6 100644
--- a/third_party/WebKit/Source/wtf/text/TextCodecICU.cpp
+++ b/third_party/WebKit/Source/wtf/text/TextCodecICU.cpp
@@ -71,16 +71,20 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
     for (int32_t i = 0; i < numEncodings; ++i) {
         const char* name = ucnv_getAvailableName(i);
         UErrorCode error = U_ZERO_ERROR;
-        // Try MIME before trying IANA to pick up commonly used names like
-        // 'EUC-JP' instead of horrendously long names like
-        // 'Extended_UNIX_Code_Packed_Format_for_Japanese'.
-        const char* standardName = ucnv_getStandardName(name, "MIME", &error);
-        if (!U_SUCCESS(error) || !standardName) {
+#if !defined(USING_SYSTEM_ICU)
+        const char* primaryStandard = "HTML";
+        const char* secondaryStandard = "MIME";
+#else
+        const char* primaryStandard = "MIME";
+        const char* secondaryStandard = "IANA";
+#endif
+        const char* standardName = ucnv_getStandardName(name, primaryStandard, &error);
+        if (U_FAILURE(error) || !standardName) {
             error = U_ZERO_ERROR;
             // Try IANA to pick up 'windows-12xx' and other names
             // which are not preferred MIME names but are widely used.
-            standardName = ucnv_getStandardName(name, "IANA", &error);
-            if (!U_SUCCESS(error) || !standardName)
+            standardName = ucnv_getStandardName(name, secondaryStandard, &error);
+            if (U_FAILURE(error) || !standardName)
                 continue;
         }
 
@@ -90,6 +94,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
         // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
         // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
         //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
+#if defined(USING_SYSTEM_ICU)
         if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80"))
             standardName = "GBK";
         // Similarly, EUC-KR encodings all map to an extended version, but
@@ -101,6 +106,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
             standardName = "windows-1254";
         else if (!strcmp(standardName, "TIS-620"))
             standardName = "windows-874";
+#endif
 
         registrar(standardName, standardName);
 
@@ -116,6 +122,12 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
             }
     }
 
+    // These two entries have to be added here because ICU's converter table
+    // cannot have both ISO-8859-8-I and ISO-8859-8.
+    registrar("csISO88598I", "ISO-8859-8-I");
+    registrar("logical", "ISO-8859-8-I");
+
+#if defined(USING_SYSTEM_ICU)
     // Additional alias for MacCyrillic not present in ICU.
     registrar("maccyrillic", "x-mac-cyrillic");
 
@@ -131,9 +143,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
     registrar("csgb231280", "GBK");
     registrar("x-euc-cn", "GBK");
     registrar("x-gbk", "GBK");
-    registrar("csISO88598I", "ISO-8859-8-I");
     registrar("koi", "KOI8-R");
-    registrar("logical", "ISO-8859-8-I");
     registrar("visual", "ISO-8859-8");
     registrar("winarabic", "windows-1256");
     registrar("winbaltic", "windows-1257");
@@ -176,8 +186,6 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
     // and Firefox (as of Oct 2014), but not in the upstream ICU.
     // Three entries for windows-1252 need not be listed here because
     // TextCodecLatin1 registers them.
-    // FIXME: We may introduce SYSTEM_ICU and enclose this block
-    // with |#if SYSTEM_ICU| because Chromium's ICU has them all.
     registrar("csiso58gb231280", "GBK");
     registrar("csiso88596e", "ISO-8859-6");
     registrar("csiso88596i", "ISO-8859-6");
@@ -212,6 +220,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
     registrar("x-cp1256", "windows-1256");
     registrar("x-cp1257", "windows-1257");
     registrar("x-cp1258", "windows-1258");
+#endif
 }
 
 void TextCodecICU::registerCodecs(TextCodecRegistrar registrar)
@@ -237,7 +246,9 @@ void TextCodecICU::registerCodecs(TextCodecRegistrar registrar)
 TextCodecICU::TextCodecICU(const TextEncoding& encoding)
     : m_encoding(encoding)
     , m_converterICU(0)
+#if defined(USING_SYSTEM_ICU)
     , m_needsGBKFallbacks(false)
+#endif
 {
 }
 
@@ -261,8 +272,10 @@ void TextCodecICU::createICUConverter() const
 {
     ASSERT(!m_converterICU);
 
+#if defined(USING_SYSTEM_ICU)
     const char* name = m_encoding.name();
     m_needsGBKFallbacks = name[0] == 'G' && name[1] == 'B' && name[2] == 'K' && !name[3];
+#endif
 
     UErrorCode err;
 
@@ -367,32 +380,41 @@ String TextCodecICU::decode(const char* bytes, size_t length, FlushBehavior flus
         sawError = true;
     }
 
+#if !defined(USING_SYSTEM_ICU)
+    // Chrome's copy of ICU does not have the issue described below.
+    return result.toString();
+#else
     String resultString = result.toString();
 
     // <http://bugs.webkit.org/show_bug.cgi?id=17014>
     // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5.
-    if (!strcmp(m_encoding.name(), "GBK") || !strcasecmp(m_encoding.name(), "gb18030"))
-        resultString.replace(0xE5E5, ideographicSpaceCharacter);
+    if (!strcmp(m_encoding.name(), "GBK")) {
+        if (!strcasecmp(m_encoding.name(), "gb18030"))
+            resultString.replace(0xE5E5, ideographicSpaceCharacter);
+        // Make GBK compliant to the encoding spec and align with GB18030
+        resultString.replace(0x01F9, 0xE7C8);
+        // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3
+        // is resolved, add U+1E3F => 0xE7C7.
+    }
 
     return resultString;
+#endif
 }
 
-// We need to apply these fallbacks ourselves as they are not currently supported by ICU and
-// they were provided by the old TEC encoding path. Needed to fix <rdar://problem/4708689>.
+#if defined(USING_SYSTEM_ICU)
+// U+01F9 and U+1E3F have to be mapped to xA8xBF and xA8xBC per the encoding
+// spec, but ICU converter does not have them.
 static UChar fallbackForGBK(UChar32 character)
 {
     switch (character) {
     case 0x01F9:
-        return 0xE7C8;
+        return 0xE7C8; // mapped to xA8xBF by ICU.
     case 0x1E3F:
-        return 0xE7C7;
-    case 0x22EF:
-        return 0x2026;
-    case 0x301C:
-        return 0xFF5E;
+        return 0xE7C7; // mapped to xA8xBC by ICU.
     }
     return 0;
 }
+#endif
 
 // Invalid character handler when writing escaped entities for unrepresentable
 // characters. See the declaration of TextCodec::encode for more.
@@ -409,6 +431,7 @@ static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeA
         UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
 }
 
+#if defined(USING_SYSTEM_ICU)
 // Substitutes special GBK characters, escaping all other unassigned entities.
 static void gbkCallbackEscape(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
     UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err)
@@ -452,6 +475,7 @@ static void gbkCallbackSubstitute(const void* context, UConverterFromUnicodeArgs
     }
     UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
 }
+#endif // USING_SYSTEM_ICU
 
 class TextCodecInput {
 public:
@@ -488,13 +512,25 @@ CString TextCodecICU::encodeInternal(const TextCodecInput& input, UnencodableHan
     switch (handling) {
         case QuestionMarksForUnencodables:
             ucnv_setSubstChars(m_converterICU, "?", 1, &err);
+#if !defined(USING_SYSTEM_ICU)
+            ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
+#else
             ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
+#endif
             break;
         case EntitiesForUnencodables:
+#if !defined(USING_SYSTEM_ICU)
+            ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
+#else
             ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
+#endif
             break;
         case URLEncodedEntitiesForUnencodables:
+#if !defined(USING_SYSTEM_ICU)
+            ucnv_setFromUCallBack(m_converterICU, urlEscapedEntityCallback, 0, 0, 0, &err);
+#else
             ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback, 0, 0, 0, &err);
+#endif
             break;
     }
 
diff --git a/third_party/WebKit/Source/wtf/text/TextCodecICU.h b/third_party/WebKit/Source/wtf/text/TextCodecICU.h
index d11f0ad..b3e6e8d 100644
--- a/third_party/WebKit/Source/wtf/text/TextCodecICU.h
+++ b/third_party/WebKit/Source/wtf/text/TextCodecICU.h
@@ -58,15 +58,19 @@ private:
 
     void createICUConverter() const;
     void releaseICUConverter() const;
+#if defined(USING_SYSTEM_ICU)
     bool needsGBKFallbacks() const { return m_needsGBKFallbacks; }
     void setNeedsGBKFallbacks(bool needsFallbacks) { m_needsGBKFallbacks = needsFallbacks; }
+#endif
 
     int decodeToBuffer(UChar* buffer, UChar* bufferLimit, const char*& source,
         const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode&);
 
     TextEncoding m_encoding;
     mutable UConverter* m_converterICU;
+#if defined(USING_SYSTEM_ICU)
     mutable bool m_needsGBKFallbacks;
+#endif
 };
 
 struct ICUConverterWrapper {
diff --git a/third_party/WebKit/Source/wtf/unicode/CharacterNames.h b/third_party/WebKit/Source/wtf/unicode/CharacterNames.h
index aad3391..3eae9b6 100644
--- a/third_party/WebKit/Source/wtf/unicode/CharacterNames.h
+++ b/third_party/WebKit/Source/wtf/unicode/CharacterNames.h
@@ -60,7 +60,9 @@ const UChar hyphenCharacter = 0x2010;
 const UChar hyphenMinusCharacter = 0x002D;
 const UChar ideographicCommaCharacter = 0x3001;
 const UChar ideographicFullStopCharacter = 0x3002;
+#if defined(USING_SYSTEM_ICU)
 const UChar ideographicSpaceCharacter = 0x3000;
+#endif
 const UChar inhibitArabicFormShapingCharacter = 0x206C;
 const UChar inhibitSymmetricSwappingCharacter = 0x206A;
 const UChar latinCapitalLetterIWithDotAbove = 0x0130;
@@ -132,7 +134,9 @@ using WTF::Unicode::hyphenCharacter;
 using WTF::Unicode::hyphenMinusCharacter;
 using WTF::Unicode::ideographicCommaCharacter;
 using WTF::Unicode::ideographicFullStopCharacter;
+#if defined(USING_SYSTEM_ICU)
 using WTF::Unicode::ideographicSpaceCharacter;
+#endif
 using WTF::Unicode::inhibitArabicFormShapingCharacter;
 using WTF::Unicode::inhibitSymmetricSwappingCharacter;
 using WTF::Unicode::latinCapitalLetterIWithDotAbove;
author	jshin@chromium.org <jshin@chromium.org>	2015-06-05 22:48:02 +0000
committer	jshin@chromium.org <jshin@chromium.org>	2015-06-05 22:48:02 +0000
commit	52692952ee4c963268d9c1020bc637aff6029e0e (patch)
tree	c51c677f565574eba2e91e42ff30457688471de2
parent	f62c0204c635fb6acee45fad2582dc4cd48d844f (diff)
download	chromium_src-52692952ee4c963268d9c1020bc637aff6029e0e.zip chromium_src-52692952ee4c963268d9c1020bc637aff6029e0e.tar.gz chromium_src-52692952ee4c963268d9c1020bc637aff6029e0e.tar.bz2