21 files changed, 583 insertions, 80 deletions
diff --git a/NEWS b/NEWS
index 948f34f..f62f010 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,10 @@ Version 0.11.6 - October 2002
   strings in C++. This is needed for proper internationalization of C++
   programs.
 
+* xgettext now supports msgid strings in encodings other than ASCII.
+  xgettext has a new option --from-code that specifies the encoding of the
+  source files. The resulting POT files are UTF-8 encoded.
+
 * Compatibility with automake-1.7.
 
 Version 0.11.5 - August 2002
diff --git a/doc/ChangeLog b/doc/ChangeLog
index a506f7f..8cfe1f7 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2002-11-05  Bruno Haible  <bruno@clisp.org>
+
+	* xgettext.texi: Document --from-code option.
+
 2002-10-30  Bruno Haible  <bruno@clisp.org>
 
 	* gettext.texi (C): Refer to node Top of autosprintf.info. Needed to
diff --git a/doc/xgettext.texi b/doc/xgettext.texi
index 7385d50..4c0ba4c 100644
--- a/doc/xgettext.texi
+++ b/doc/xgettext.texi
@@ -84,6 +84,20 @@ This is a shorthand for @code{--language=C++}.
 By default the language is guessed depending on the input file name
 extension.
 
+@subsection Input file interpretation
+
+@table @samp
+@item --from-code=@var{name}
+@opindex --from-code@r{, @code{xgettext} option}
+Specifies the encoding of the input files.  This option is needed only
+if some untranslated message strings or their corresponding comments
+contain non-ASCII characters.  Note that Python, Tcl, and Glade input
+files are always assumed to be in UTF-8, regardless of this option.
+
+@end table
+
+By default the input files are assumed to be in ASCII.
+
 @subsection Operation mode
 
 @table @samp
diff --git a/src/ChangeLog b/src/ChangeLog
index 21a8cd6..9c872b5 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,43 @@
+2002-11-05  Bruno Haible  <bruno@clisp.org>
+
+	Allow non-ASCII msgids in POT files.
+	* po-charset.h (po_charset_utf8): New declaration.
+	* po-charset.c (utf8, po_charset_utf8): New variables.
+	(po_charset_canonicalize): Use po_charset_utf8.
+	* msgl-iconv.h: Include iconv.h.
+	(convert_string): New declaration.
+	* msgl-iconv.c (convert_string): Export function.
+	(convert_msgid): New function.
+	(iconv_message_list): Call it.
+	* xgettext.h: Include iconv.h.
+	(xgettext_global_source_encoding, xgettext_global_source_iconv,
+	xgettext_current_source_encoding, xgettext_current_source_iconv): New
+	declarations.
+	* xgettext.c (xgettext_global_source_encoding,
+	xgettext_global_source_iconv, xgettext_current_source_encoding,
+	xgettext_current_source_iconv): New variables.
+	(long_options): New option --from-code.
+	(main): Initialize xgettext_global_source_encoding. Handle option
+	--from-code. Initialize and destroy xgettext_global_source_iconv.
+	(usage): Document option --from-code.
+	(extract_from_file): Set xgettext_current_source_encoding and
+	xgettext_current_source_iconv.
+	(CONVERT_STRING): New macro.
+	(remember_a_message, remember_a_message_plural): Call CONVERT_STRING.
+	(finalize_header): Set the charset in the header here.
+	* x-glade.c (do_extract_glade): Set xgettext_current_source_encoding.
+	Don't set the result's header charset; this is now done in xgettext.c.
+	* x-python.c (extract_python): Likewise.
+	* x-tcl.c (extract_tcl): Likewise.
+	* write-po.c (message_print, message_print_obsolete): Don't warn about
+	non-ASCII msgids if the file's encoding is UTF-8.
+	* msginit.c (content_type): Add header argument. Use charset UTF-8
+	if that was already the POT file's encoding.
+	(fields): Update.
+	* msgmerge.c (merge): If the POT file was in UTF-8, convert the
+	definitions to UTF-8.
+	* msgcmp.c (compare): Likewise.
+
 2002-11-01  Bruno Haible  <bruno@clisp.org>
 
 	* msgcmp.c: Include read-po.h.
diff --git a/src/msgcmp.c b/src/msgcmp.c
index 615acf0..0919f52 100644
--- a/src/msgcmp.c
+++ b/src/msgcmp.c
@@ -33,9 +33,12 @@
 #include "basename.h"
 #include "message.h"
 #include "exit.h"
-#include "gettext.h"
 #include "read-po.h"
 #include "po.h"
+#include "msgl-iconv.h"
+#include "strstr.h"
+#include "strcase.h"
+#include "gettext.h"
 
 #define _(str) gettext (str)
 
@@ -301,6 +304,40 @@ compare (fn1, fn2)
      the xgettext program.  */
   ref = remove_obsoletes (read_po_file (fn2));
 
+  /* The references file can be either in ASCII or in UTF-8.  If it is
+     in UTF-8, we have to convert the definitions to UTF-8 as well.  */
+  {
+    bool was_utf8 = false;
+    for (k = 0; k < ref->nitems; k++)
+      {
+	message_list_ty *mlp = ref->item[k]->messages;
+
+	for (j = 0; j < mlp->nitems; j++)
+	  if (mlp->item[j]->msgid[0] == '\0' /* && !mlp->item[j]->obsolete */)
+	    {
+	      const char *header = mlp->item[j]->msgstr; 
+
+	      if (header != NULL)
+		{
+		  const char *charsetstr = strstr (header, "charset=");
+
+		  if (charsetstr != NULL)
+		    {
+		      size_t len;
+
+		      charsetstr += strlen ("charset=");
+		      len = strcspn (charsetstr, " \t\n");
+		      if (len == strlen ("UTF-8")
+			  && strncasecmp (charsetstr, "UTF-8", len) == 0)
+			was_utf8 = true;
+		    }
+		}
+	    }
+	}
+    if (was_utf8)
+      def = iconv_msgdomain_list (def, "UTF-8", fn1);
+  }
+
   empty_list = message_list_alloc (false);
 
   /* Every entry in the xgettext generated file must be matched by a
diff --git a/src/msginit.c b/src/msginit.c
index d1c4536..d443c2a 100644
--- a/src/msginit.c
+++ b/src/msginit.c
@@ -69,6 +69,8 @@
 #include "progname.h"
 #include "basename.h"
 #include "strpbrk.h"
+#include "strstr.h"
+#include "strcase.h"
 #include "message.h"
 #include "read-po.h"
 #include "write-po.h"
@@ -145,7 +147,7 @@ static const char *last_translator PARAMS ((void));
 static const char *language_team_address PARAMS ((void));
 static const char *language_team PARAMS ((void));
 static const char *mime_version PARAMS ((void));
-static const char *content_type PARAMS ((void));
+static const char *content_type PARAMS ((const char *header));
 static const char *content_transfer_encoding PARAMS ((void));
 static const char *plural_forms PARAMS ((void));
 static char *get_field PARAMS ((const char *header, const char *field));
@@ -1212,9 +1214,30 @@ mime_version ()
 
 /* Construct the value for the Content-Type field.  */
 static const char *
-content_type ()
+content_type (header)
+     const char *header;
 {
-  return xasprintf ("text/plain; charset=%s", canonical_locale_charset ());
+  bool was_utf8 = false;
+  const char *old_field;
+
+  /* Keep UTF-8 if the POT file declares charset=UTF-8 (it then contains
+     non-ASCII characters); otherwise use the locale's encoding.  The name
+     ends at whitespace, as in the checks of msgmerge and msgcmp.  */
+  old_field = get_field (header, "Content-Type");
+  if (old_field != NULL)
+    {
+      const char *charsetstr = strstr (old_field, "charset=");
+
+      if (charsetstr != NULL)
+	{
+	  charsetstr += strlen ("charset=");
+	  was_utf8 =
+	    (strcspn (charsetstr, " \t\n") == strlen ("UTF-8")
+	     && strncasecmp (charsetstr, "UTF-8", strlen ("UTF-8")) == 0);
+	}
+    }
+  return xasprintf ("text/plain; charset=%s",
+		    was_utf8 ? "UTF-8" : canonical_locale_charset ());
 }
 
 
@@ -1259,7 +1282,7 @@ fields[] =
     { "Last-Translator", last_translator, NULL },
     { "Language-Team", language_team, NULL },
     { "MIME-Version", mime_version, NULL },
-    { "Content-Type", content_type, NULL },
+    { "Content-Type", NULL, content_type },
     { "Content-Transfer-Encoding", content_transfer_encoding, NULL },
     { "Plural-Forms", plural_forms, NULL }
   };
diff --git a/src/msgl-iconv.c b/src/msgl-iconv.c
index 6620cfd..61633b5 100644
--- a/src/msgl-iconv.c
+++ b/src/msgl-iconv.c
@@ -54,8 +54,8 @@
 static int iconv_string PARAMS ((iconv_t cd,
 				 const char *start, const char *end,
 				 char **resultp, size_t *lengthp));
-static const char *convert_string PARAMS ((iconv_t cd, const char *string));
 static void convert_string_list PARAMS ((iconv_t cd, string_list_ty *slp));
+static void convert_msgid PARAMS ((iconv_t cd, message_ty *mp));
 static void convert_msgstr PARAMS ((iconv_t cd, message_ty *mp));
 #endif
 
@@ -184,7 +184,7 @@ iconv_string (cd, start, end, resultp, lengthp)
 #undef tmpbufsize
 }
 
-static const char *
+char *
 convert_string (cd, string)
      iconv_t cd;
      const char *string;
@@ -217,6 +217,16 @@ convert_string_list (cd, slp)
 }
 
 static void
+convert_msgid (cd, mp)
+     iconv_t cd;
+     message_ty *mp;
+{
+  mp->msgid = convert_string (cd, mp->msgid);
+  if (mp->msgid_plural != NULL)
+    mp->msgid_plural = convert_string (cd, mp->msgid_plural);
+}
+
+static void
 convert_msgstr (cd, mp)
      iconv_t cd;
      message_ty *mp;
@@ -377,6 +387,7 @@ and iconv() does not support this conversion."),
 
 	  convert_string_list (cd, mp->comment);
 	  convert_string_list (cd, mp->comment_dot);
+	  convert_msgid (cd, mp);
 	  convert_msgstr (cd, mp);
 	}
 
diff --git a/src/msgl-iconv.h b/src/msgl-iconv.h
index 45cd5de..976427a 100644
--- a/src/msgl-iconv.h
+++ b/src/msgl-iconv.h
@@ -19,8 +19,17 @@
 #ifndef _MSGL_ICONV_H
 #define _MSGL_ICONV_H
 
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
+
 #include "message.h"
 
+#if HAVE_ICONV
+/* Converts the STRING through the conversion descriptor CD.  */
+extern char *convert_string PARAMS ((iconv_t cd, const char *string));
+#endif
+
 /* Converts the message list MLP to the (already canonicalized) encoding
    CANON_TO_CODE.  The (already canonicalized) encoding before conversion
    can be passed as CANON_FROM_CODE; if NULL is passed instead, the
diff --git a/src/msgmerge.c b/src/msgmerge.c
index dc71eb3..adae806 100644
--- a/src/msgmerge.c
+++ b/src/msgmerge.c
@@ -44,6 +44,7 @@
 #include "stpcpy.h"
 #include "stpncpy.h"
 #include "po.h"
+#include "msgl-iconv.h"
 #include "msgl-equal.h"
 #include "plural-exp.h"
 #include "backupfile.h"
@@ -1017,6 +1018,40 @@ merge (fn1, fn2, defp)
 	message_list_prepend (ref->item[k]->messages, refheader);
       }
 
+  /* The references file can be either in ASCII or in UTF-8.  If it is
+     in UTF-8, we have to convert the definitions to UTF-8 as well.  */
+  {
+    bool was_utf8 = false;
+    for (k = 0; k < ref->nitems; k++)
+      {
+	message_list_ty *mlp = ref->item[k]->messages;
+
+	for (j = 0; j < mlp->nitems; j++)
+	  if (mlp->item[j]->msgid[0] == '\0' && !mlp->item[j]->obsolete)
+	    {
+	      const char *header = mlp->item[j]->msgstr; 
+
+	      if (header != NULL)
+		{
+		  const char *charsetstr = strstr (header, "charset=");
+
+		  if (charsetstr != NULL)
+		    {
+		      size_t len;
+
+		      charsetstr += strlen ("charset=");
+		      len = strcspn (charsetstr, " \t\n");
+		      if (len == strlen ("UTF-8")
+			  && strncasecmp (charsetstr, "UTF-8", len) == 0)
+			was_utf8 = true;
+		    }
+		}
+	    }
+	}
+    if (was_utf8)
+      def = iconv_msgdomain_list (def, "UTF-8", fn1);
+  }
+
   result = msgdomain_list_alloc (false);
   processed = 0;
 
diff --git a/src/po-charset.c b/src/po-charset.c
index b759716..f3d7368 100644
--- a/src/po-charset.c
+++ b/src/po-charset.c
@@ -44,6 +44,11 @@ static const char ascii[] = "ASCII";
 /* The canonicalized encoding name for ASCII.  */
 const char *po_charset_ascii = ascii;
 
+static const char utf8[] = "UTF-8";
+
+/* The canonicalized encoding name for UTF-8.  */
+const char *po_charset_utf8 = utf8;
+
 /* Canonicalize an encoding name.  */
 const char *
 po_charset_canonicalize (charset)
@@ -96,7 +101,7 @@ po_charset_canonicalize (charset)
     "TIS-620",
     "VISCII",
     "GEORGIAN-PS",
-    "UTF-8"
+    utf8
   };
   size_t i;
 
diff --git a/src/po-charset.h b/src/po-charset.h
index 216ab33..dfc7f6f 100644
--- a/src/po-charset.h
+++ b/src/po-charset.h
@@ -33,6 +33,9 @@ extern const char *po_charset_canonicalize PARAMS ((const char *charset));
 /* The canonicalized encoding name for ASCII.  */
 extern const char *po_charset_ascii;
 
+/* The canonicalized encoding name for UTF-8.  */
+extern const char *po_charset_utf8;
+
 /* Test for ASCII compatibility.  */
 extern bool po_charset_ascii_compatible PARAMS ((const char *canon_charset));
 
diff --git a/src/write-po.c b/src/write-po.c
index 6feb002..5fec377 100644
--- a/src/write-po.c
+++ b/src/write-po.c
@@ -778,7 +778,8 @@ message_print (mp, fp, charset, blank_line, debug)
   /* Print each of the message components.  Wrap them nicely so they
      are as readable as possible.  If there is no recorded msgstr for
      this domain, emit an empty string.  */
-  if (!is_ascii_string (mp->msgid))
+  if (!is_ascii_string (mp->msgid)
+      && po_charset_canonicalize (charset) != po_charset_utf8)
     multiline_warning (xasprintf (_("warning: ")),
 		       xasprintf (_("\
 The following msgid contains non-ASCII characters.\n\
@@ -872,7 +873,8 @@ message_print_obsolete (mp, fp, charset, blank_line)
 
   /* Print each of the message components.  Wrap them nicely so they
      are as readable as possible.  */
-  if (!is_ascii_string (mp->msgid))
+  if (!is_ascii_string (mp->msgid)
+      && po_charset_canonicalize (charset) != po_charset_utf8)
     multiline_warning (xasprintf (_("warning: ")),
 		       xasprintf (_("\
 The following msgid contains non-ASCII characters.\n\
diff --git a/src/x-glade.c b/src/x-glade.c
index 715c059..369e077 100644
--- a/src/x-glade.c
+++ b/src/x-glade.c
@@ -372,6 +372,9 @@ do_extract_glade (fp, real_filename, logical_filename, mdlp)
 {
   mlp = mdlp->item[0]->messages;
 
+  /* expat feeds us strings in UTF-8 encoding.  */
+  xgettext_current_source_encoding = po_charset_utf8;
+
   logical_file_name = xstrdup (logical_filename);
 
   init_keywords ();
@@ -413,14 +416,6 @@ error while reading \"%s\""), real_filename);
 
   XML_ParserFree (parser);
 
-  /* expat feeds us strings in UTF-8 encoding.  If not all the strings
-     were plain ASCII, set the charset in the header to UTF-8.  */
-  if (!is_ascii_message_list (mlp))
-    {
-      const char *canon_utf_8 = po_charset_canonicalize ("UTF-8");
-      iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL);
-    }
-
   /* Close scanner.  */
   logical_file_name = NULL;
   parser = NULL;
diff --git a/src/x-python.c b/src/x-python.c
index 823e852..161a103 100644
--- a/src/x-python.c
+++ b/src/x-python.c
@@ -1159,6 +1159,9 @@ extract_python (f, real_filename, logical_filename, mdlp)
 {
   message_list_ty *mlp = mdlp->item[0]->messages;
 
+  /* We convert our strings to UTF-8 encoding.  */
+  xgettext_current_source_encoding = po_charset_utf8;
+
   fp = f;
   real_file_name = real_filename;
   logical_file_name = xstrdup (logical_filename);
@@ -1176,14 +1179,6 @@ extract_python (f, real_filename, logical_filename, mdlp)
   while (!extract_parenthesized (mlp, -1, 0))
     ;
 
-  /* We converted our strings to UTF-8 encoding.  If not all the strings
-     were plain ASCII, set the charset in the header to UTF-8.  */
-  if (!is_ascii_message_list (mlp))
-    {
-      const char *canon_utf_8 = po_charset_canonicalize ("UTF-8");
-      iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL);
-    }
-
   fp = NULL;
   real_file_name = NULL;
   logical_file_name = NULL;
diff --git a/src/x-tcl.c b/src/x-tcl.c
index a942796..b744632 100644
--- a/src/x-tcl.c
+++ b/src/x-tcl.c
@@ -1002,6 +1002,9 @@ extract_tcl (f, real_filename, logical_filename, mdlp)
 {
   mlp = mdlp->item[0]->messages;
 
+  /* We convert our strings to UTF-8 encoding.  */
+  xgettext_current_source_encoding = po_charset_utf8;
+
   fp = f;
   real_file_name = real_filename;
   logical_file_name = xstrdup (logical_filename);
@@ -1018,14 +1021,6 @@ extract_tcl (f, real_filename, logical_filename, mdlp)
   /* Eat tokens until eof is seen.  */
   read_command_list ('\0');
 
-  /* We converted our strings to UTF-8 encoding.  If not all the strings
-     were plain ASCII, set the charset in the header to UTF-8.  */
-  if (!is_ascii_message_list (mlp))
-    {
-      const char *canon_utf_8 = po_charset_canonicalize ("UTF-8");
-      iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL);
-    }
-
   fp = NULL;
   real_file_name = NULL;
   logical_file_name = NULL;
diff --git a/src/xgettext.c b/src/xgettext.c
index 4224c2e..52f6747 100644
--- a/src/xgettext.c
+++ b/src/xgettext.c
@@ -48,6 +48,9 @@
 #include "stpcpy.h"
 #include "po.h"
 #include "message.h"
+#include "po-charset.h"
+#include "msgl-iconv.h"
+#include "msgl-ascii.h"
 #include "po-time.h"
 #include "write-po.h"
 #include "format.h"
@@ -111,6 +114,24 @@ static char *output_dir;
 /* If nonzero omit header with information about this run.  */
 int xgettext_omit_header;
 
+/* Canonicalized encoding name for all input files.  */
+const char *xgettext_global_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_global_source_encoding to UTF-8 (except from
+   ASCII or UTF-8, when this conversion is a no-op).  */
+iconv_t xgettext_global_source_iconv;
+#endif
+
+/* Canonicalized encoding name for the current input file.  */
+const char *xgettext_current_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_current_source_encoding to UTF-8 (except from
+   ASCII or UTF-8, when this conversion is a no-op).  */
+iconv_t xgettext_current_source_iconv;
+#endif
+
 /* Long options.  */
 static const struct option long_options[] =
 {
@@ -127,6 +148,7 @@ static const struct option long_options[] =
   { "files-from", required_argument, NULL, 'f' },
   { "force-po", no_argument, &force_po, 1 },
   { "foreign-user", no_argument, NULL, CHAR_MAX + 2 },
+  { "from-code", required_argument, NULL, CHAR_MAX + 3 },
   { "help", no_argument, NULL, 'h' },
   { "indent", no_argument, NULL, 'i' },
   { "join-existing", no_argument, NULL, 'j' },
@@ -137,7 +159,7 @@ static const struct option long_options[] =
   { "msgstr-suffix", optional_argument, NULL, 'M' },
   { "no-escape", no_argument, NULL, 'e' },
   { "no-location", no_argument, &line_comment, 0 },
-  { "no-wrap", no_argument, NULL, CHAR_MAX + 3 },
+  { "no-wrap", no_argument, NULL, CHAR_MAX + 4 },
   { "omit-header", no_argument, &xgettext_omit_header, 1 },
   { "output", required_argument, NULL, 'o' },
   { "output-dir", required_argument, NULL, 'p' },
@@ -220,6 +242,7 @@ main (argc, argv)
 
   /* Set initial value of variables.  */
   default_domain = MESSAGE_DOMAIN_DEFAULT;
+  xgettext_global_source_encoding = po_charset_ascii;
 
   while ((optchar = getopt_long (argc, argv,
 				 "ac::Cd:D:eEf:Fhijk::l:L:m::M::no:p:sTVw:x:",
@@ -366,7 +389,12 @@ main (argc, argv)
       case CHAR_MAX + 2:	/* --foreign-user */
 	copyright_holder = "";
 	break;
-      case CHAR_MAX + 3:	/* --no-wrap */
+      case CHAR_MAX + 3:	/* --from-code */
+	xgettext_global_source_encoding = po_charset_canonicalize (optarg);
+	if (xgettext_global_source_encoding == NULL)
+	  xgettext_global_source_encoding = po_charset_ascii;
+	break;
+      case CHAR_MAX + 4:	/* --no-wrap */
 	message_page_width_ignore ();
 	break;
       default:
@@ -453,6 +481,37 @@ xgettext cannot work without keywords to look for"));
   for (cnt = optind; cnt < argc; ++cnt)
     string_list_append_unique (file_list, argv[cnt]);
 
+  /* Allocate converter from xgettext_global_source_encoding to UTF-8 (except
+     from ASCII or UTF-8, when this conversion is a no-op).  */
+  if (xgettext_global_source_encoding != po_charset_ascii
+      && xgettext_global_source_encoding != po_charset_utf8)
+    {
+#if HAVE_ICONV
+      iconv_t cd;
+
+      /* Avoid glibc-2.1 bug with EUC-KR.  */
+# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
+      if (strcmp (xgettext_global_source_encoding, "EUC-KR") == 0)
+	cd = (iconv_t)(-1);
+      else
+# endif
+      cd = iconv_open (po_charset_utf8, xgettext_global_source_encoding);
+      if (cd == (iconv_t)(-1))
+	error (EXIT_FAILURE, 0, _("\
+Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
+and iconv() does not support this conversion."),
+	       xgettext_global_source_encoding, po_charset_utf8,
+	       basename (program_name));
+      xgettext_global_source_iconv = cd;
+#else
+      error (EXIT_FAILURE, 0, _("\
+Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
+This version was built without iconv()."),
+	     xgettext_global_source_encoding, po_charset_utf8,
+	     basename (program_name));
+#endif
+    }
+
   /* Allocate a message list to remember all the messages.  */
   mdlp = msgdomain_list_alloc (true);
 
@@ -519,6 +578,13 @@ warning: file `%s' extension `%s' is unknown; will try C"), fname, extension);
   if (!xgettext_omit_header)
     finalize_header (mdlp);
 
+  /* Free the allocated converter.  */
+#if HAVE_ICONV
+  if (xgettext_global_source_encoding != po_charset_ascii
+      && xgettext_global_source_encoding != po_charset_utf8)
+    iconv_close (xgettext_global_source_iconv);
+#endif
+
   /* Sorting the list of messages.  */
   if (sort_by_filepos)
     msgdomain_list_sort_by_filepos (mdlp);
@@ -589,6 +655,14 @@ By default the language is guessed depending on the input file name extension.\n
       printf ("\n");
       /* xgettext: no-wrap */
       printf (_("\
+Input file interpretation:\n\
+      --from-code=NAME           encoding of input files\n\
+                                   (except for Python, Tcl, Glade)\n\
+By default the input files are assumed to be in ASCII.\n\
+"));
+      printf ("\n");
+      /* xgettext: no-wrap */
+      printf (_("\
 Operation mode:\n\
   -j, --join-existing            join messages with existing file\n\
   -x, --exclude-file=FILE.po     entries from FILE.po are not extracted\n\
@@ -890,6 +964,13 @@ extract_from_file (file_name, extractor, mdlp)
   char *real_file_name;
   FILE *fp = xgettext_open (file_name, &logical_file_name, &real_file_name);
 
+  /* Set the default for the source file encoding.  May be overridden by
+     the extractor function.  */
+  xgettext_current_source_encoding = xgettext_global_source_encoding;
+#if HAVE_ICONV
+  xgettext_current_source_iconv = xgettext_global_source_iconv;
+#endif
+
   extractor (fp, real_file_name, logical_file_name, mdlp);
 
   if (fp != stdin)
@@ -905,6 +986,36 @@ extract_from_file (file_name, extractor, mdlp)
 static struct formatstring_parser *current_formatstring_parser;
 
 
+/* Convert the given string, in place, from xgettext_current_source_encoding
+   to the output file encoding (i.e. ASCII or UTF-8).  Expects a variable
+   'pos' in scope; it locates the string in the fatal non-ASCII error.  */
+#define CONVERT_STRING(string) \
+  do {									\
+    if (xgettext_current_source_encoding == po_charset_ascii)		\
+      {									\
+	if (!is_ascii_string (string))					\
+	  {								\
+	    char buffer[21];						\
+	    if (pos->line_number == (size_t)(-1))			\
+	      buffer[0] = '\0';						\
+	    else							\
+	      sprintf (buffer, ":%ld", (long) pos->line_number);	\
+	    error (EXIT_FAILURE, 0, _("Non-ASCII string at %s%s.\nPlease specify the source encoding through --from-code."), pos->file_name, buffer); \
+	  }								\
+      }									\
+    else if (xgettext_current_source_encoding != po_charset_utf8)	\
+      string = convert_string (xgettext_current_source_iconv, string);	\
+  } while (0)
+
+#if !HAVE_ICONV
+/* If we don't have iconv(), the only supported values for
+   xgettext_global_source_encoding and thus also for
+   xgettext_current_source_encoding are ASCII and UTF-8.
+   convert_string() should not be called in this case.  */
+#define convert_string(cd,string) (abort (), (string))
+#endif
+
+
 message_ty *
 remember_a_message (mlp, string, pos)
      message_list_ty *mlp;
@@ -934,6 +1045,8 @@ remember_a_message (mlp, string, pos)
     is_format[i] = undecided;
   do_wrap = undecided;
 
+  CONVERT_STRING (msgid);
+
   if (msgid[0] == '\0' && !xgettext_omit_header)
     {
       char buffer[21];
@@ -999,6 +1112,8 @@ meta information, not the empty string.\n")));
 	  if (s == NULL)
 	    break;
 
+	  CONVERT_STRING (s);
+
 	  /* To reduce the possibility of unwanted matches be do a two
 	     step match: the line must contain `xgettext:' and one of
 	     the possible format description strings.  */
@@ -1102,6 +1217,8 @@ remember_a_message_plural (mp, string, pos)
 
   msgid_plural = string;
 
+  CONVERT_STRING (msgid_plural);
+
   /* See if the message is already a plural message.  */
   if (mp->msgid_plural == NULL)
     {
@@ -1205,53 +1322,78 @@ finalize_header (mdlp)
 {
   /* If the generated PO file has plural forms, add a Plural-Forms template
      to the constructed header.  */
-  bool has_plural;
-  size_t i, j;
+  {
+    bool has_plural;
+    size_t i, j;
 
-  has_plural = false;
-  for (i = 0; i < mdlp->nitems; i++)
-    {
-      message_list_ty *mlp = mdlp->item[i]->messages;
+    has_plural = false;
+    for (i = 0; i < mdlp->nitems; i++)
+      {
+	message_list_ty *mlp = mdlp->item[i]->messages;
 
-      for (j = 0; j < mlp->nitems; j++)
-	{
-	  message_ty *mp = mlp->item[j];
+	for (j = 0; j < mlp->nitems; j++)
+	  {
+	    message_ty *mp = mlp->item[j];
 
-	  if (mp->msgid_plural != NULL)
-	    {
-	      has_plural = true;
-	      break;
-	    }
-	}
-      if (has_plural)
-	break;
-    }
+	    if (mp->msgid_plural != NULL)
+	      {
+		has_plural = true;
+		break;
+	      }
+	  }
+	if (has_plural)
+	  break;
+      }
 
-  if (has_plural)
-    {
-      message_ty *header = message_list_search (mdlp->item[0]->messages, "");
-      if (header != NULL
-	  && strstr (header->msgstr, "Plural-Forms:") == NULL)
-	{
-	  size_t insertpos = strlen (header->msgstr);
-	  const char *suffix;
-	  size_t suffix_len;
-	  char *new_msgstr;
-
-	  suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n";
-	  if (insertpos == 0 || header->msgstr[insertpos-1] == '\n')
-	    suffix++;
-	  suffix_len = strlen (suffix);
-	  new_msgstr = (char *) xmalloc (header->msgstr_len + suffix_len);
-	  memcpy (new_msgstr, header->msgstr, insertpos);
-	  memcpy (new_msgstr + insertpos, suffix, suffix_len);
-	  memcpy (new_msgstr + insertpos + suffix_len,
-		  header->msgstr + insertpos,
-		  header->msgstr_len - insertpos);
-	  header->msgstr = new_msgstr;
-	  header->msgstr_len = header->msgstr_len + suffix_len;
-	}
-    }
+    if (has_plural)
+      {
+	message_ty *header = message_list_search (mdlp->item[0]->messages, "");
+	if (header != NULL
+	    && strstr (header->msgstr, "Plural-Forms:") == NULL)
+	  {
+	    size_t insertpos = strlen (header->msgstr);
+	    const char *suffix;
+	    size_t suffix_len;
+	    char *new_msgstr;
+
+	    suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n";
+	    if (insertpos == 0 || header->msgstr[insertpos-1] == '\n')
+	      suffix++;
+	    suffix_len = strlen (suffix);
+	    new_msgstr = (char *) xmalloc (header->msgstr_len + suffix_len);
+	    memcpy (new_msgstr, header->msgstr, insertpos);
+	    memcpy (new_msgstr + insertpos, suffix, suffix_len);
+	    memcpy (new_msgstr + insertpos + suffix_len,
+		    header->msgstr + insertpos,
+		    header->msgstr_len - insertpos);
+	    header->msgstr = new_msgstr;
+	    header->msgstr_len = header->msgstr_len + suffix_len;
+	  }
+      }
+  }
+
+  /* If not all the strings were plain ASCII, set the charset in the header
+     to UTF-8.  All messages have already been converted to UTF-8 in
+     remember_a_message and remember_a_message_plural.  */
+  {
+    bool has_nonascii = false;
+    size_t i;
+
+    for (i = 0; i < mdlp->nitems; i++)
+      {
+	message_list_ty *mlp = mdlp->item[i]->messages;
+
+	if (!is_ascii_message_list (mlp))
+	  has_nonascii = true;
+      }
+
+    if (has_nonascii)
+      {
+	message_list_ty *mlp = mdlp->item[0]->messages;
+
+	iconv_message_list (mlp, po_charset_utf8, po_charset_utf8, NULL);
+      }
+  }
 }
 
 
diff --git a/src/xgettext.h b/src/xgettext.h
index 6356449..ad36fbd 100644
--- a/src/xgettext.h
+++ b/src/xgettext.h
@@ -21,6 +21,11 @@
 #define _XGETTEXT_H
 
 #include <stddef.h>
+
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
+
 #include "message.h"
 #include "pos.h"
 
@@ -37,6 +42,24 @@ extern bool substring_match;
 extern void split_keywordspec PARAMS ((const char *spec, const char **endp,
 				       int *argnum1p, int *argnum2p));
 
+/* Canonicalized encoding name for all input files.  */
+extern const char *xgettext_global_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_global_source_encoding to UTF-8 (except from
+   ASCII or UTF-8, when this conversion is a no-op).  */
+extern iconv_t xgettext_global_source_iconv;
+#endif
+
+/* Canonicalized encoding name for the current input file.  */
+extern const char *xgettext_current_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_current_source_encoding to UTF-8 (except from
+   ASCII or UTF-8, when this conversion is a no-op).  */
+extern iconv_t xgettext_current_source_iconv;
+#endif
+
 /* List of messages whose msgids must not be extracted, or NULL.
    Used by remember_a_message().  */
 extern message_list_ty *exclude;
diff --git a/tests/ChangeLog b/tests/ChangeLog
index 031d109..5f76d0f 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,9 @@
+2002-11-05  Bruno Haible  <bruno@clisp.org>
+
+	* xgettext-23: New file.
+	* msgmerge-21: New file.
+	* Makefile.am (TESTS): Add them.
+
 2002-11-01  Bruno Haible  <bruno@clisp.org>
 
 	* format-awk-1, format-awk-2, format-c-1, format-c-2, format-elisp-1,
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 649ec7e..73dfdf8 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -39,13 +39,14 @@ TESTS = gettext-1 gettext-2 \
 	msgmerge-1 msgmerge-2 msgmerge-3 msgmerge-4 msgmerge-5 msgmerge-6 \
 	msgmerge-7 msgmerge-8 msgmerge-9 msgmerge-10 msgmerge-11 msgmerge-12 \
 	msgmerge-13 msgmerge-14 msgmerge-15 msgmerge-16 msgmerge-17 \
-	msgmerge-18 msgmerge-19 msgmerge-20 \
+	msgmerge-18 msgmerge-19 msgmerge-20 msgmerge-21 \
 	msgunfmt-1 msgunfmt-2 msgunfmt-3 \
 	msguniq-1 msguniq-2 msguniq-3 \
 	xgettext-1 xgettext-2 xgettext-3 xgettext-4 xgettext-5 xgettext-6 \
 	xgettext-7 xgettext-8 xgettext-9 xgettext-10 xgettext-11 xgettext-12 \
 	xgettext-13 xgettext-14 xgettext-15 xgettext-16 xgettext-17 \
 	xgettext-18 xgettext-19 xgettext-20 xgettext-21 xgettext-22 \
+	xgettext-23 \
 	format-awk-1 format-awk-2 \
 	format-c-1 format-c-2 format-c-3 format-c-4 \
 	format-elisp-1 format-elisp-2 \
diff --git a/tests/msgmerge-21 b/tests/msgmerge-21
new file mode 100755
index 0000000..11e52dc
--- /dev/null
+++ b/tests/msgmerge-21
@@ -0,0 +1,99 @@
+#! /bin/sh
+
+# Test merging of a ref.pot in UTF-8 encoding against a def.po in legacy
+# encoding (that was produced from an older version of ref.pot, in ASCII
+# encoding).
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles mm-test21-ru.po"
+cat <<\EOF > mm-test21-ru.po
+# Russian messages for CLISP
+# Copyright (C) 1998 Free Software Foundation, Inc.
+# Eduard Haritonov <hed@iis.nsk.su>, 1998.
+# Arseny Slobodjuck <ampy@ich.dvo.ru>, 2002.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: GNU elvis 1.7\n"
+"POT-Creation-Date: 2002-11-01 01:22+0100\n"
+"PO-Revision-Date: 2002-11-01 01:23+0100\n"
+"Last-Translator: Arseny Slobodjuck <ampy@ich.dvo.ru>\n"
+"Language-Team: Russian <ru@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=koi8-r\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#: arith.c:9
+msgid "Division durch Null"
+msgstr "������� �� ����"
+EOF
+
+tmpfiles="$tmpfiles mm-test21.pot"
+cat <<\EOF > mm-test21.pot
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2002-11-01 01:22+0100\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#: arith.c:10
+msgid "Division durch Null"
+msgstr ""
+
+#: arith.c:15
+msgid "Überlauf"
+msgstr ""
+EOF
+
+tmpfiles="$tmpfiles mm-test21.out"
+: ${MSGMERGE=msgmerge}
+${MSGMERGE} -q mm-test21-ru.po mm-test21.pot -o mm-test21.out
+test $? = 0 || { rm -fr $tmpfiles; exit 1; }
+
+tmpfiles="$tmpfiles mm-test21.ok"
+cat <<\EOF > mm-test21.ok
+# Russian messages for CLISP
+# Copyright (C) 1998 Free Software Foundation, Inc.
+# Eduard Haritonov <hed@iis.nsk.su>, 1998.
+# Arseny Slobodjuck <ampy@ich.dvo.ru>, 2002.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: GNU elvis 1.7\n"
+"POT-Creation-Date: 2002-11-01 01:22+0100\n"
+"PO-Revision-Date: 2002-11-01 01:23+0100\n"
+"Last-Translator: Arseny Slobodjuck <ampy@ich.dvo.ru>\n"
+"Language-Team: Russian <ru@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#: arith.c:10
+msgid "Division durch Null"
+msgstr "деление на нуль"
+
+#: arith.c:15
+msgid "Überlauf"
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} mm-test21.ok mm-test21.out
+result=$?
+
+rm -fr $tmpfiles
+
+exit $result
diff --git a/tests/xgettext-23 b/tests/xgettext-23
new file mode 100755
index 0000000..5928b6c
--- /dev/null
+++ b/tests/xgettext-23
@@ -0,0 +1,60 @@
+#! /bin/sh
+
+# Test extraction of non-ASCII msgids.
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles xg-test23.c"
+cat <<EOF > xg-test23.c
+void foo (int option)
+{
+  printf (_("%s: nezn�m� p�ep�na� -- %c\n"), option);
+  printf (_("%s: p�ep�na� vy�aduje argument -- %c\n"), option);
+}
+EOF
+
+tmpfiles="$tmpfiles xg-test23.po"
+: ${XGETTEXT=xgettext}
+${XGETTEXT} --no-location -k_ -o xg-test23.po xg-test23.c 2>/dev/null
+test $? = 1 || { rm -fr $tmpfiles; exit 1; }
+${XGETTEXT} --no-location -k_ --from-code=iso-8859-2 -o xg-test23.po xg-test23.c
+test $? = 0 || { rm -fr $tmpfiles; exit 1; }
+
+tmpfiles="$tmpfiles xg-test23.pot"
+sed -e '/POT-Creation-Date/d' < xg-test23.po > xg-test23.pot
+
+tmpfiles="$tmpfiles xg-test23.ok"
+cat <<EOF > xg-test23.ok
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#, c-format
+msgid "%s: neznámý přepínač -- %c\n"
+msgstr ""
+
+#, c-format
+msgid "%s: přepínač vyžaduje argument -- %c\n"
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} xg-test23.ok xg-test23.pot
+result=$?
+
+rm -fr $tmpfiles
+
+exit $result