From 95bcc2a842218afee0368c65414de9dc8c013cc5 Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Sun, 28 Jan 2007 15:39:27 +0000 Subject: Handle lone high surrogates correctly. --- gettext-tools/src/ChangeLog | 7 +++++ gettext-tools/src/x-java.c | 63 +++++++++++++++++++++++++++++++++++-- gettext-tools/tests/ChangeLog | 5 +++ gettext-tools/tests/xgettext-java-2 | 2 +- 4 files changed, 74 insertions(+), 3 deletions(-) (limited to 'gettext-tools') diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index aaca948..d73fbae 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,10 @@ +2007-01-28 Bruno Haible + + * x-java.c (string_buffer_flush_utf16_surr): Give a warning when + converting a surrogate code point to U+FFFD. + (string_buffer_append): Convert a lone high surrogate code point to + U+FFFD, and give a warning. + 2007-01-26 Bruno Haible * msgfilter.c: Include unconditionally. diff --git a/gettext-tools/src/x-java.c b/gettext-tools/src/x-java.c index ae122c6..7be06da 100644 --- a/gettext-tools/src/x-java.c +++ b/gettext-tools/src/x-java.c @@ -1,5 +1,5 @@ /* xgettext Java backend. - Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc. + Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc. Written by Bruno Haible , 2003. This program is free software; you can redistribute it and/or modify @@ -458,7 +458,34 @@ string_buffer_flush_utf16_surr (struct string_buffer *bp) { if (bp->utf16_surr != 0) { - /* A half surrogate is invalid, therefore use U+FFFD instead. */ + /* A half surrogate is invalid, therefore use U+FFFD instead. + It appears to be valid Java: The Java Language Specification, + 3rd ed., says "The Java programming language represents text + in sequences of 16-bit code units, using the UTF-16 encoding." + but does not impose constraints on the use of \uxxxx escape + sequences for surrogates. And the JDK's javac happily groks + half surrogates. 
+ But a half surrogate is invalid in UTF-8: + - RFC 3629 says + "The definition of UTF-8 prohibits encoding character + numbers between U+D800 and U+DFFF". + - Unicode 4.0 chapter 3 + + section 3.9, p.77, says + "Because surrogate code points are not Unicode scalar + values, any UTF-8 byte sequence that would otherwise + map to code points D800..DFFF is ill-formed." + and in table 3-6, p. 78, does not mention D800..DFFF. + - The unicode.org FAQ question "How do I convert an unpaired + UTF-16 surrogate to UTF-8?" has the answer + "By representing such an unpaired surrogate on its own + as a 3-byte sequence, the resulting UTF-8 data stream + would become ill-formed." + So use U+FFFD instead. */ + error_with_progname = false; + error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"), + logical_file_name, line_number, bp->utf16_surr); + error_with_progname = true; string_buffer_append_unicode (bp, 0xfffd); bp->utf16_surr = 0; } @@ -524,6 +551,38 @@ string_buffer_append (struct string_buffer *bp, int c) if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00)) bp->utf16_surr = UTF16_VALUE (c); + else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)) + { + /* A half surrogate is invalid, therefore use U+FFFD instead. + It appears to be valid Java: The Java Language Specification, + 3rd ed., says "The Java programming language represents text + in sequences of 16-bit code units, using the UTF-16 encoding." + but does not impose constraints on the use of \uxxxx escape + sequences for surrogates. And the JDK's javac happily groks + half surrogates. + But a half surrogate is invalid in UTF-8: + - RFC 3629 says + "The definition of UTF-8 prohibits encoding character + numbers between U+D800 and U+DFFF". + - Unicode 4.0 chapter 3 + + section 3.9, p.77, says + "Because surrogate code points are not Unicode scalar + values, any UTF-8 byte sequence that would otherwise + map to code points D800..DFFF is ill-formed." + and in table 3-6, p. 78, does not mention D800..DFFF. 
+ - The unicode.org FAQ question "How do I convert an unpaired + UTF-16 surrogate to UTF-8?" has the answer + "By representing such an unpaired surrogate on its own + as a 3-byte sequence, the resulting UTF-8 data stream + would become ill-formed." + So use U+FFFD instead. */ + error_with_progname = false; + error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"), + logical_file_name, line_number, UTF16_VALUE (c)); + error_with_progname = true; + string_buffer_append_unicode (bp, 0xfffd); + } else string_buffer_append_unicode (bp, UTF16_VALUE (c)); } diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog index 0a5590e..9dff82b 100644 --- a/gettext-tools/tests/ChangeLog +++ b/gettext-tools/tests/ChangeLog @@ -1,3 +1,8 @@ +2007-01-28 Bruno Haible + + * xgettext-java-2: Change expected result to contain U+FFFD instead + of an ill-formed UTF-8 sequence. + 2007-01-27 Bruno Haible * lang-smalltalk: Update expected result so that it works with GNU diff --git a/gettext-tools/tests/xgettext-java-2 b/gettext-tools/tests/xgettext-java-2 index b0c8dd1..fdd344a 100755 --- a/gettext-tools/tests/xgettext-java-2 +++ b/gettext-tools/tests/xgettext-java-2 @@ -105,7 +105,7 @@ msgid "invalid surrogate � first half" msgstr "" #: xg-j-2.java:12 -msgid "invalid surrogate í²ž second half" +msgid "invalid surrogate � second half" msgstr "" #. Don't let the line numbers be confused by \u newlines. -- cgit v1.1