From 95bcc2a842218afee0368c65414de9dc8c013cc5 Mon Sep 17 00:00:00 2001
From: Bruno Haible <bruno@clisp.org>
Date: Sun, 28 Jan 2007 15:39:27 +0000
Subject: Handle lone high surrogates correctly.

---
 gettext-tools/src/ChangeLog         |  7 +++++
 gettext-tools/src/x-java.c          | 63 +++++++++++++++++++++++++++++++++++--
 gettext-tools/tests/ChangeLog       |  5 +++
 gettext-tools/tests/xgettext-java-2 |  2 +-
 4 files changed, 74 insertions(+), 3 deletions(-)

(limited to 'gettext-tools')

diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index aaca948..d73fbae 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,10 @@
+2007-01-28  Bruno Haible  <bruno@clisp.org>
+
+	* x-java.c (string_buffer_flush_utf16_surr): Give a warning when
+	converting a surrogate code point to U+FFFD.
+	(string_buffer_append): Convert a lone low surrogate code point
+	(U+DC00..U+DFFF) to U+FFFD, and give a warning.
+
 2007-01-26  Bruno Haible  <bruno@clisp.org>
 
 	* msgfilter.c: Include <sys/time.h> unconditionally.
diff --git a/gettext-tools/src/x-java.c b/gettext-tools/src/x-java.c
index ae122c6..7be06da 100644
--- a/gettext-tools/src/x-java.c
+++ b/gettext-tools/src/x-java.c
@@ -1,5 +1,5 @@
 /* xgettext Java backend.
-   Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc.
    Written by Bruno Haible <bruno@clisp.org>, 2003.
 
    This program is free software; you can redistribute it and/or modify
@@ -458,7 +458,34 @@ string_buffer_flush_utf16_surr (struct string_buffer *bp)
 {
   if (bp->utf16_surr != 0)
     {
-      /* A half surrogate is invalid, therefore use U+FFFD instead.  */
+      /* A half surrogate is invalid, therefore use U+FFFD instead.
+	 It appears to be valid Java: The Java Language Specification,
+	 3rd ed., says "The Java programming language represents text
+	 in sequences of 16-bit code units, using the UTF-16 encoding."
+	 but does not impose constraints on the use of \uxxxx escape
+	 sequences for surrogates.  And the JDK's javac happily groks
+	 half surrogates.
+	 But a half surrogate is invalid in UTF-8:
+	   - RFC 3629 says
+	       "The definition of UTF-8 prohibits encoding character
+		numbers between U+D800 and U+DFFF".
+	   - Unicode 4.0 chapter 3
+	     <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
+	     section 3.9, p.77, says
+	       "Because surrogate code points are not Unicode scalar
+		values, any UTF-8 byte sequence that would otherwise
+		map to code points D800..DFFF is ill-formed."
+	     and in table 3-6, p. 78, does not mention D800..DFFF.
+	   - The unicode.org FAQ question "How do I convert an unpaired
+	     UTF-16 surrogate to UTF-8?" has the answer
+	       "By representing such an unpaired surrogate on its own
+		as a 3-byte sequence, the resulting UTF-8 data stream
+		would become ill-formed."
+	 So use U+FFFD instead.  */
+      error_with_progname = false;
+      error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
+	     logical_file_name, line_number, bp->utf16_surr);
+      error_with_progname = true;
       string_buffer_append_unicode (bp, 0xfffd);
       bp->utf16_surr = 0;
     }
@@ -524,6 +551,38 @@ string_buffer_append (struct string_buffer *bp, int c)
 
 	  if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
 	    bp->utf16_surr = UTF16_VALUE (c);
+	  else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
+	    {
+	      /* A half surrogate is invalid, therefore use U+FFFD instead.
+		 It appears to be valid Java: The Java Language Specification,
+		 3rd ed., says "The Java programming language represents text
+		 in sequences of 16-bit code units, using the UTF-16 encoding."
+		 but does not impose constraints on the use of \uxxxx escape
+		 sequences for surrogates.  And the JDK's javac happily groks
+		 half surrogates.
+		 But a half surrogate is invalid in UTF-8:
+		   - RFC 3629 says
+		       "The definition of UTF-8 prohibits encoding character
+			numbers between U+D800 and U+DFFF".
+		   - Unicode 4.0 chapter 3
+		     <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
+		     section 3.9, p.77, says
+		       "Because surrogate code points are not Unicode scalar
+			values, any UTF-8 byte sequence that would otherwise
+			map to code points D800..DFFF is ill-formed."
+		     and in table 3-6, p. 78, does not mention D800..DFFF.
+		   - The unicode.org FAQ question "How do I convert an unpaired
+		     UTF-16 surrogate to UTF-8?" has the answer
+		       "By representing such an unpaired surrogate on its own
+			as a 3-byte sequence, the resulting UTF-8 data stream
+			would become ill-formed."
+		 So use U+FFFD instead.  */
+	      error_with_progname = false;
+	      error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
+		     logical_file_name, line_number, UTF16_VALUE (c));
+	      error_with_progname = true;
+	      string_buffer_append_unicode (bp, 0xfffd);
+	    }
 	  else
 	    string_buffer_append_unicode (bp, UTF16_VALUE (c));
 	}
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog
index 0a5590e..9dff82b 100644
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,8 @@
+2007-01-28  Bruno Haible  <bruno@clisp.org>
+
+	* xgettext-java-2: Change expected result to contain U+FFFD instead
+	of an ill-formed UTF-8 sequence.
+
 2007-01-27  Bruno Haible  <bruno@clisp.org>
 
 	* lang-smalltalk: Update expected result so that it works with GNU
diff --git a/gettext-tools/tests/xgettext-java-2 b/gettext-tools/tests/xgettext-java-2
index b0c8dd1..fdd344a 100755
--- a/gettext-tools/tests/xgettext-java-2
+++ b/gettext-tools/tests/xgettext-java-2
@@ -105,7 +105,7 @@ msgid "invalid surrogate ï¿½ first half"
 msgstr ""
 
 #: xg-j-2.java:12
-msgid "invalid surrogate í²ž second half"
+msgid "invalid surrogate ï¿½ second half"
 msgstr ""
 
 #. Don't let the line numbers be confused by \u newlines.
-- 
cgit v1.1