diff options
author | Bruno Haible <bruno@clisp.org> | 2007-01-28 15:39:27 +0000 |
---|---|---|
committer | Bruno Haible <bruno@clisp.org> | 2009-06-23 12:14:38 +0200 |
commit | 95bcc2a842218afee0368c65414de9dc8c013cc5 (patch) | |
tree | f57489e41821cbded084dfa5bc79703d146a3380 /gettext-tools | |
parent | d93af4d6fc8647113d64f6d83643862912616e38 (diff) | |
download | external_gettext-95bcc2a842218afee0368c65414de9dc8c013cc5.zip external_gettext-95bcc2a842218afee0368c65414de9dc8c013cc5.tar.gz external_gettext-95bcc2a842218afee0368c65414de9dc8c013cc5.tar.bz2 |
Handle lone high surrogates correctly.
Diffstat (limited to 'gettext-tools')
-rw-r--r-- | gettext-tools/src/ChangeLog | 7 | ||||
-rw-r--r-- | gettext-tools/src/x-java.c | 63 | ||||
-rw-r--r-- | gettext-tools/tests/ChangeLog | 5 | ||||
-rwxr-xr-x | gettext-tools/tests/xgettext-java-2 | 2 |
4 files changed, 74 insertions, 3 deletions
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index aaca948..d73fbae 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,10 @@ +2007-01-28 Bruno Haible <bruno@clisp.org> + + * x-java.c (string_buffer_flush_utf16_surr): Give a warning when + converting a surrogate code point to U+FFFD. + (string_buffer_append): Convert a lone high surrogate code point to + U+FFFD, and give a warning. + 2007-01-26 Bruno Haible <bruno@clisp.org> * msgfilter.c: Include <sys/time.h> unconditionally. diff --git a/gettext-tools/src/x-java.c b/gettext-tools/src/x-java.c index ae122c6..7be06da 100644 --- a/gettext-tools/src/x-java.c +++ b/gettext-tools/src/x-java.c @@ -1,5 +1,5 @@ /* xgettext Java backend. - Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc. + Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2003. This program is free software; you can redistribute it and/or modify @@ -458,7 +458,34 @@ string_buffer_flush_utf16_surr (struct string_buffer *bp) { if (bp->utf16_surr != 0) { - /* A half surrogate is invalid, therefore use U+FFFD instead. */ + /* A half surrogate is invalid, therefore use U+FFFD instead. + It appears to be valid Java: The Java Language Specification, + 3rd ed., says "The Java programming language represents text + in sequences of 16-bit code units, using the UTF-16 encoding." + but does not impose constraints on the use of \uxxxx escape + sequences for surrogates. And the JDK's javac happily groks + half surrogates. + But a half surrogate is invalid in UTF-8: + - RFC 3629 says + "The definition of UTF-8 prohibits encoding character + numbers between U+D800 and U+DFFF". + - Unicode 4.0 chapter 3 + <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf> + section 3.9, p.77, says + "Because surrogate code points are not Unicode scalar + values, any UTF-8 byte sequence that would otherwise + map to code points D800..DFFF is ill-formed." 
+ and in table 3-6, p. 78, does not mention D800..DFFF. + - The unicode.org FAQ question "How do I convert an unpaired + UTF-16 surrogate to UTF-8?" has the answer + "By representing such an unpaired surrogate on its own + as a 3-byte sequence, the resulting UTF-8 data stream + would become ill-formed." + So use U+FFFD instead. */ + error_with_progname = false; + error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"), + logical_file_name, line_number, bp->utf16_surr); + error_with_progname = true; string_buffer_append_unicode (bp, 0xfffd); bp->utf16_surr = 0; } @@ -524,6 +551,38 @@ string_buffer_append (struct string_buffer *bp, int c) if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00)) bp->utf16_surr = UTF16_VALUE (c); + else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)) + { + /* A half surrogate is invalid, therefore use U+FFFD instead. + It appears to be valid Java: The Java Language Specification, + 3rd ed., says "The Java programming language represents text + in sequences of 16-bit code units, using the UTF-16 encoding." + but does not impose constraints on the use of \uxxxx escape + sequences for surrogates. And the JDK's javac happily groks + half surrogates. + But a half surrogate is invalid in UTF-8: + - RFC 3629 says + "The definition of UTF-8 prohibits encoding character + numbers between U+D800 and U+DFFF". + - Unicode 4.0 chapter 3 + <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf> + section 3.9, p.77, says + "Because surrogate code points are not Unicode scalar + values, any UTF-8 byte sequence that would otherwise + map to code points D800..DFFF is ill-formed." + and in table 3-6, p. 78, does not mention D800..DFFF. + - The unicode.org FAQ question "How do I convert an unpaired + UTF-16 surrogate to UTF-8?" has the answer + "By representing such an unpaired surrogate on its own + as a 3-byte sequence, the resulting UTF-8 data stream + would become ill-formed." + So use U+FFFD instead. 
*/ + error_with_progname = false; + error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"), + logical_file_name, line_number, UTF16_VALUE (c)); + error_with_progname = true; + string_buffer_append_unicode (bp, 0xfffd); + } else string_buffer_append_unicode (bp, UTF16_VALUE (c)); } diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog index 0a5590e..9dff82b 100644 --- a/gettext-tools/tests/ChangeLog +++ b/gettext-tools/tests/ChangeLog @@ -1,3 +1,8 @@ +2007-01-28 Bruno Haible <bruno@clisp.org> + + * xgettext-java-2: Change expected result to contain U+FFFD instead + of an ill-formed UTF-8 sequence. + 2007-01-27 Bruno Haible <bruno@clisp.org> * lang-smalltalk: Update expected result so that it works with GNU diff --git a/gettext-tools/tests/xgettext-java-2 b/gettext-tools/tests/xgettext-java-2 index b0c8dd1..fdd344a 100755 --- a/gettext-tools/tests/xgettext-java-2 +++ b/gettext-tools/tests/xgettext-java-2 @@ -105,7 +105,7 @@ msgid "invalid surrogate � first half" msgstr "" #: xg-j-2.java:12 -msgid "invalid surrogate í²ž second half" +msgid "invalid surrogate � second half" msgstr "" #. Don't let the line numbers be confused by \u newlines. |