summary | refs | log | tree | commit | diff | stats
path: root/gettext-tools
diff options
context:
space:
mode:
author Bruno Haible <bruno@clisp.org> 2007-01-28 15:39:27 +0000
committer Bruno Haible <bruno@clisp.org> 2009-06-23 12:14:38 +0200
commit 95bcc2a842218afee0368c65414de9dc8c013cc5 (patch)
tree f57489e41821cbded084dfa5bc79703d146a3380 /gettext-tools
parent d93af4d6fc8647113d64f6d83643862912616e38 (diff)
download external_gettext-95bcc2a842218afee0368c65414de9dc8c013cc5.zip
external_gettext-95bcc2a842218afee0368c65414de9dc8c013cc5.tar.gz
external_gettext-95bcc2a842218afee0368c65414de9dc8c013cc5.tar.bz2
Handle lone low (trailing) surrogates correctly.
Diffstat (limited to 'gettext-tools')
-rw-r--r-- gettext-tools/src/ChangeLog 7
-rw-r--r-- gettext-tools/src/x-java.c 63
-rw-r--r-- gettext-tools/tests/ChangeLog 5
-rwxr-xr-x gettext-tools/tests/xgettext-java-2 2
4 files changed, 74 insertions, 3 deletions
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index aaca948..d73fbae 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,10 @@
+2007-01-28 Bruno Haible <bruno@clisp.org>
+
+ * x-java.c (string_buffer_flush_utf16_surr): Give a warning when
+ converting a surrogate code point to U+FFFD.
+ (string_buffer_append): Convert a lone low surrogate code point
+ (U+DC00..U+DFFF) to U+FFFD, and give a warning.
+
2007-01-26 Bruno Haible <bruno@clisp.org>
* msgfilter.c: Include <sys/time.h> unconditionally.
diff --git a/gettext-tools/src/x-java.c b/gettext-tools/src/x-java.c
index ae122c6..7be06da 100644
--- a/gettext-tools/src/x-java.c
+++ b/gettext-tools/src/x-java.c
@@ -1,5 +1,5 @@
/* xgettext Java backend.
- Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
+ Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2003.
This program is free software; you can redistribute it and/or modify
@@ -458,7 +458,34 @@ string_buffer_flush_utf16_surr (struct string_buffer *bp)
{
if (bp->utf16_surr != 0)
{
- /* A half surrogate is invalid, therefore use U+FFFD instead. */
+ /* A half surrogate is invalid, therefore use U+FFFD instead.
+ It appears to be valid Java: The Java Language Specification,
+ 3rd ed., says "The Java programming language represents text
+ in sequences of 16-bit code units, using the UTF-16 encoding."
+ but does not impose constraints on the use of \uxxxx escape
+ sequences for surrogates. And the JDK's javac happily groks
+ half surrogates.
+ But a half surrogate is invalid in UTF-8:
+ - RFC 3629 says
+ "The definition of UTF-8 prohibits encoding character
+ numbers between U+D800 and U+DFFF".
+ - Unicode 4.0 chapter 3
+ <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
+ section 3.9, p.77, says
+ "Because surrogate code points are not Unicode scalar
+ values, any UTF-8 byte sequence that would otherwise
+ map to code points D800..DFFF is ill-formed."
+ and in table 3-6, p. 78, does not mention D800..DFFF.
+ - The unicode.org FAQ question "How do I convert an unpaired
+ UTF-16 surrogate to UTF-8?" has the answer
+ "By representing such an unpaired surrogate on its own
+ as a 3-byte sequence, the resulting UTF-8 data stream
+ would become ill-formed."
+ So use U+FFFD instead. */
+ error_with_progname = false;
+ error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
+ logical_file_name, line_number, bp->utf16_surr);
+ error_with_progname = true;
string_buffer_append_unicode (bp, 0xfffd);
bp->utf16_surr = 0;
}
@@ -524,6 +551,38 @@ string_buffer_append (struct string_buffer *bp, int c)
if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
bp->utf16_surr = UTF16_VALUE (c);
+ else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
+ {
+ /* A half surrogate is invalid, therefore use U+FFFD instead.
+ It appears to be valid Java: The Java Language Specification,
+ 3rd ed., says "The Java programming language represents text
+ in sequences of 16-bit code units, using the UTF-16 encoding."
+ but does not impose constraints on the use of \uxxxx escape
+ sequences for surrogates. And the JDK's javac happily groks
+ half surrogates.
+ But a half surrogate is invalid in UTF-8:
+ - RFC 3629 says
+ "The definition of UTF-8 prohibits encoding character
+ numbers between U+D800 and U+DFFF".
+ - Unicode 4.0 chapter 3
+ <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
+ section 3.9, p.77, says
+ "Because surrogate code points are not Unicode scalar
+ values, any UTF-8 byte sequence that would otherwise
+ map to code points D800..DFFF is ill-formed."
+ and in table 3-6, p. 78, does not mention D800..DFFF.
+ - The unicode.org FAQ question "How do I convert an unpaired
+ UTF-16 surrogate to UTF-8?" has the answer
+ "By representing such an unpaired surrogate on its own
+ as a 3-byte sequence, the resulting UTF-8 data stream
+ would become ill-formed."
+ So use U+FFFD instead. */
+ error_with_progname = false;
+ error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
+ logical_file_name, line_number, UTF16_VALUE (c));
+ error_with_progname = true;
+ string_buffer_append_unicode (bp, 0xfffd);
+ }
else
string_buffer_append_unicode (bp, UTF16_VALUE (c));
}
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog
index 0a5590e..9dff82b 100644
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,8 @@
+2007-01-28 Bruno Haible <bruno@clisp.org>
+
+ * xgettext-java-2: Change expected result to contain U+FFFD instead
+ of an ill-formed UTF-8 sequence.
+
2007-01-27 Bruno Haible <bruno@clisp.org>
* lang-smalltalk: Update expected result so that it works with GNU
diff --git a/gettext-tools/tests/xgettext-java-2 b/gettext-tools/tests/xgettext-java-2
index b0c8dd1..fdd344a 100755
--- a/gettext-tools/tests/xgettext-java-2
+++ b/gettext-tools/tests/xgettext-java-2
@@ -105,7 +105,7 @@ msgid "invalid surrogate � first half"
msgstr ""
#: xg-j-2.java:12
-msgid "invalid surrogate í²ž second half"
+msgid "invalid surrogate � second half"
msgstr ""
#. Don't let the line numbers be confused by \u newlines.