summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- NEWS 4
-rw-r--r-- doc/ChangeLog 4
-rw-r--r-- doc/xgettext.texi 14
-rw-r--r-- src/ChangeLog 40
-rw-r--r-- src/msgcmp.c 39
-rw-r--r-- src/msginit.c 31
-rw-r--r-- src/msgl-iconv.c 15
-rw-r--r-- src/msgl-iconv.h 9
-rw-r--r-- src/msgmerge.c 35
-rw-r--r-- src/po-charset.c 7
-rw-r--r-- src/po-charset.h 3
-rw-r--r-- src/write-po.c 6
-rw-r--r-- src/x-glade.c 11
-rw-r--r-- src/x-python.c 11
-rw-r--r-- src/x-tcl.c 11
-rw-r--r-- src/xgettext.c 232
-rw-r--r-- src/xgettext.h 23
-rw-r--r-- tests/ChangeLog 6
-rw-r--r-- tests/Makefile.am 3
-rwxr-xr-x tests/msgmerge-21 99
-rwxr-xr-x tests/xgettext-23 60
21 files changed, 583 insertions, 80 deletions
diff --git a/NEWS b/NEWS
index 948f34f..f62f010 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,10 @@ Version 0.11.6 - October 2002
strings in C++. This is needed for proper internationalization of C++
programs.
+* xgettext now supports msgid strings in other encodings than ASCII.
+ xgettext has a new option --from-code that specifies the encoding of the
+ source files. The resulting POT files are UTF-8 encoded.
+
* Compatibility with automake-1.7.
Version 0.11.5 - August 2002
diff --git a/doc/ChangeLog b/doc/ChangeLog
index a506f7f..8cfe1f7 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2002-11-05 Bruno Haible <bruno@clisp.org>
+
+ * xgettext.texi: Document --from-code option.
+
2002-10-30 Bruno Haible <bruno@clisp.org>
* gettext.texi (C): Refer to node Top of autosprintf.info. Needed to
diff --git a/doc/xgettext.texi b/doc/xgettext.texi
index 7385d50..4c0ba4c 100644
--- a/doc/xgettext.texi
+++ b/doc/xgettext.texi
@@ -84,6 +84,20 @@ This is a shorthand for @code{--language=C++}.
By default the language is guessed depending on the input file name
extension.
+@subsection Input file interpretation
+
+@table @samp
+@item --from-code=@var{name}
+@opindex --from-code@r{, @code{xgettext} option}
+Specifies the encoding of the input files. This option is needed only
+if some untranslated message strings or their corresponding comments
+contain non-ASCII characters. Note that Python, Tcl, and Glade input
+files are always assumed to be in UTF-8, regardless of this option.
+
+@end table
+
+By default the input files are assumed to be in ASCII.
+
@subsection Operation mode
@table @samp
diff --git a/src/ChangeLog b/src/ChangeLog
index 21a8cd6..9c872b5 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,43 @@
+2002-11-05 Bruno Haible <bruno@clisp.org>
+
+ Allow non-ASCII msgids in POT files.
+ * po-charset.h (po_charset_utf8): New declaration.
+ * po-charset.c (utf8, po_charset_utf8): New variables.
+ (po_charset_canonicalize): Use po_charset_utf8.
+ * msgl-iconv.h: Include iconv.h.
+ (convert_string): New declaration.
+ * msgl-iconv.c (convert_string): Export function.
+ (convert_msgid): New function.
+ (iconv_message_list): Call it.
+ * xgettext.h: Include iconv.h.
+ (xgettext_global_source_encoding, xgettext_global_source_iconv,
+ xgettext_current_source_encoding, xgettext_current_source_iconv): New
+ declarations.
+ * xgettext.c (xgettext_global_source_encoding,
+ xgettext_global_source_iconv, xgettext_current_source_encoding,
+ xgettext_current_source_iconv): New variables.
+ (long_options): New option --from-code.
+ (main): Initialize xgettext_global_source_encoding. Handle option
+ --from-code. Initialize and destroy xgettext_global_source_iconv.
+ (usage): Document option --from-code.
+ (extract_from_file): Set xgettext_current_source_encoding and
+ xgettext_current_source_iconv.
+ (CONVERT_STRING): New macro.
+ (remember_a_message, remember_a_message_plural): Call CONVERT_STRING.
+ (finalize_header): Set the charset in the header here.
+ * x-glade.c (do_extract_glade): Set xgettext_current_source_encoding.
+ Don't set the result's header charset; this is now done in xgettext.c.
+ * x-python.c (extract_python): Likewise.
+ * x-tcl.c (extract_tcl): Likewise.
+ * write-po.c (message_print, message_print_obsolete): Don't warn about
+ non-ASCII msgids if the file's encoding is UTF-8.
+ * msginit.c (content_type): Add header argument. Use charset UTF-8
+ if that was already the POT file's encoding.
+ (fields): Update.
+ * msgmerge.c (merge): If the POT file was in UTF-8, convert the
+ definitions to UTF-8.
+ * msgcmp.c (compare): Likewise.
+
2002-11-01 Bruno Haible <bruno@clisp.org>
* msgcmp.c: Include read-po.h.
diff --git a/src/msgcmp.c b/src/msgcmp.c
index 615acf0..0919f52 100644
--- a/src/msgcmp.c
+++ b/src/msgcmp.c
@@ -33,9 +33,12 @@
#include "basename.h"
#include "message.h"
#include "exit.h"
-#include "gettext.h"
#include "read-po.h"
#include "po.h"
+#include "msgl-iconv.h"
+#include "strstr.h"
+#include "strcase.h"
+#include "gettext.h"
#define _(str) gettext (str)
@@ -301,6 +304,40 @@ compare (fn1, fn2)
the xgettext program. */
ref = remove_obsoletes (read_po_file (fn2));
+ /* The references file can be either in ASCII or in UTF-8. If it is
+ in UTF-8, we have to convert the definitions to UTF-8 as well. */
+ {
+ bool was_utf8 = false;
+ for (k = 0; k < ref->nitems; k++)
+ {
+ message_list_ty *mlp = ref->item[k]->messages;
+
+ for (j = 0; j < mlp->nitems; j++)
+ if (mlp->item[j]->msgid[0] == '\0' /* && !mlp->item[j]->obsolete */)
+ {
+ const char *header = mlp->item[j]->msgstr;
+
+ if (header != NULL)
+ {
+ const char *charsetstr = strstr (header, "charset=");
+
+ if (charsetstr != NULL)
+ {
+ size_t len;
+
+ charsetstr += strlen ("charset=");
+ len = strcspn (charsetstr, " \t\n");
+ if (len == strlen ("UTF-8")
+ && strncasecmp (charsetstr, "UTF-8", len) == 0)
+ was_utf8 = true;
+ }
+ }
+ }
+ }
+ if (was_utf8)
+ def = iconv_msgdomain_list (def, "UTF-8", fn1);
+ }
+
empty_list = message_list_alloc (false);
/* Every entry in the xgettext generated file must be matched by a
diff --git a/src/msginit.c b/src/msginit.c
index d1c4536..d443c2a 100644
--- a/src/msginit.c
+++ b/src/msginit.c
@@ -69,6 +69,8 @@
#include "progname.h"
#include "basename.h"
#include "strpbrk.h"
+#include "strstr.h"
+#include "strcase.h"
#include "message.h"
#include "read-po.h"
#include "write-po.h"
@@ -145,7 +147,7 @@ static const char *last_translator PARAMS ((void));
static const char *language_team_address PARAMS ((void));
static const char *language_team PARAMS ((void));
static const char *mime_version PARAMS ((void));
-static const char *content_type PARAMS ((void));
+static const char *content_type PARAMS ((const char *header));
static const char *content_transfer_encoding PARAMS ((void));
static const char *plural_forms PARAMS ((void));
static char *get_field PARAMS ((const char *header, const char *field));
@@ -1212,9 +1214,30 @@ mime_version ()
/* Construct the value for the Content-Type field. */
static const char *
-content_type ()
+content_type (header)
+ const char *header;
{
- return xasprintf ("text/plain; charset=%s", canonical_locale_charset ());
+ bool was_utf8;
+ const char *old_field;
+
+ /* If the POT file contains charset=UTF-8, it means that the POT file
+ contains non-ASCII characters, and we keep the UTF-8 encoding.
+ Otherwise, when the POT file is plain ASCII, we use the locale's
+ encoding. */
+ was_utf8 = false;
+ old_field = get_field (header, "Content-Type");
+ if (old_field != NULL)
+ {
+ const char *charsetstr = strstr (old_field, "charset=");
+
+ if (charsetstr != NULL)
+ {
+ charsetstr += strlen ("charset=");
+ was_utf8 = (strcasecmp (charsetstr, "UTF-8") == 0);
+ }
+ }
+ return xasprintf ("text/plain; charset=%s",
+ was_utf8 ? "UTF-8" : canonical_locale_charset ());
}
@@ -1259,7 +1282,7 @@ fields[] =
{ "Last-Translator", last_translator, NULL },
{ "Language-Team", language_team, NULL },
{ "MIME-Version", mime_version, NULL },
- { "Content-Type", content_type, NULL },
+ { "Content-Type", NULL, content_type },
{ "Content-Transfer-Encoding", content_transfer_encoding, NULL },
{ "Plural-Forms", plural_forms, NULL }
};
diff --git a/src/msgl-iconv.c b/src/msgl-iconv.c
index 6620cfd..61633b5 100644
--- a/src/msgl-iconv.c
+++ b/src/msgl-iconv.c
@@ -54,8 +54,8 @@
static int iconv_string PARAMS ((iconv_t cd,
const char *start, const char *end,
char **resultp, size_t *lengthp));
-static const char *convert_string PARAMS ((iconv_t cd, const char *string));
static void convert_string_list PARAMS ((iconv_t cd, string_list_ty *slp));
+static void convert_msgid PARAMS ((iconv_t cd, message_ty *mp));
static void convert_msgstr PARAMS ((iconv_t cd, message_ty *mp));
#endif
@@ -184,7 +184,7 @@ iconv_string (cd, start, end, resultp, lengthp)
#undef tmpbufsize
}
-static const char *
+char *
convert_string (cd, string)
iconv_t cd;
const char *string;
@@ -217,6 +217,16 @@ convert_string_list (cd, slp)
}
static void
+convert_msgid (cd, mp)
+ iconv_t cd;
+ message_ty *mp;
+{
+ mp->msgid = convert_string (cd, mp->msgid);
+ if (mp->msgid_plural != NULL)
+ mp->msgid_plural = convert_string (cd, mp->msgid_plural);
+}
+
+static void
convert_msgstr (cd, mp)
iconv_t cd;
message_ty *mp;
@@ -377,6 +387,7 @@ and iconv() does not support this conversion."),
convert_string_list (cd, mp->comment);
convert_string_list (cd, mp->comment_dot);
+ convert_msgid (cd, mp);
convert_msgstr (cd, mp);
}
diff --git a/src/msgl-iconv.h b/src/msgl-iconv.h
index 45cd5de..976427a 100644
--- a/src/msgl-iconv.h
+++ b/src/msgl-iconv.h
@@ -19,8 +19,17 @@
#ifndef _MSGL_ICONV_H
#define _MSGL_ICONV_H
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
+
#include "message.h"
+#if HAVE_ICONV
+/* Converts the STRING through the conversion descriptor CD. */
+extern char *convert_string PARAMS ((iconv_t cd, const char *string));
+#endif
+
/* Converts the message list MLP to the (already canonicalized) encoding
CANON_TO_CODE. The (already canonicalized) encoding before conversion
can be passed as CANON_FROM_CODE; if NULL is passed instead, the
diff --git a/src/msgmerge.c b/src/msgmerge.c
index dc71eb3..adae806 100644
--- a/src/msgmerge.c
+++ b/src/msgmerge.c
@@ -44,6 +44,7 @@
#include "stpcpy.h"
#include "stpncpy.h"
#include "po.h"
+#include "msgl-iconv.h"
#include "msgl-equal.h"
#include "plural-exp.h"
#include "backupfile.h"
@@ -1017,6 +1018,40 @@ merge (fn1, fn2, defp)
message_list_prepend (ref->item[k]->messages, refheader);
}
+ /* The references file can be either in ASCII or in UTF-8. If it is
+ in UTF-8, we have to convert the definitions to UTF-8 as well. */
+ {
+ bool was_utf8 = false;
+ for (k = 0; k < ref->nitems; k++)
+ {
+ message_list_ty *mlp = ref->item[k]->messages;
+
+ for (j = 0; j < mlp->nitems; j++)
+ if (mlp->item[j]->msgid[0] == '\0' && !mlp->item[j]->obsolete)
+ {
+ const char *header = mlp->item[j]->msgstr;
+
+ if (header != NULL)
+ {
+ const char *charsetstr = strstr (header, "charset=");
+
+ if (charsetstr != NULL)
+ {
+ size_t len;
+
+ charsetstr += strlen ("charset=");
+ len = strcspn (charsetstr, " \t\n");
+ if (len == strlen ("UTF-8")
+ && strncasecmp (charsetstr, "UTF-8", len) == 0)
+ was_utf8 = true;
+ }
+ }
+ }
+ }
+ if (was_utf8)
+ def = iconv_msgdomain_list (def, "UTF-8", fn1);
+ }
+
result = msgdomain_list_alloc (false);
processed = 0;
diff --git a/src/po-charset.c b/src/po-charset.c
index b759716..f3d7368 100644
--- a/src/po-charset.c
+++ b/src/po-charset.c
@@ -44,6 +44,11 @@ static const char ascii[] = "ASCII";
/* The canonicalized encoding name for ASCII. */
const char *po_charset_ascii = ascii;
+static const char utf8[] = "UTF-8";
+
+/* The canonicalized encoding name for UTF-8. */
+const char *po_charset_utf8 = utf8;
+
/* Canonicalize an encoding name. */
const char *
po_charset_canonicalize (charset)
@@ -96,7 +101,7 @@ po_charset_canonicalize (charset)
"TIS-620",
"VISCII",
"GEORGIAN-PS",
- "UTF-8"
+ utf8
};
size_t i;
diff --git a/src/po-charset.h b/src/po-charset.h
index 216ab33..dfc7f6f 100644
--- a/src/po-charset.h
+++ b/src/po-charset.h
@@ -33,6 +33,9 @@ extern const char *po_charset_canonicalize PARAMS ((const char *charset));
/* The canonicalized encoding name for ASCII. */
extern const char *po_charset_ascii;
+/* The canonicalized encoding name for UTF-8. */
+extern const char *po_charset_utf8;
+
/* Test for ASCII compatibility. */
extern bool po_charset_ascii_compatible PARAMS ((const char *canon_charset));
diff --git a/src/write-po.c b/src/write-po.c
index 6feb002..5fec377 100644
--- a/src/write-po.c
+++ b/src/write-po.c
@@ -778,7 +778,8 @@ message_print (mp, fp, charset, blank_line, debug)
/* Print each of the message components. Wrap them nicely so they
are as readable as possible. If there is no recorded msgstr for
this domain, emit an empty string. */
- if (!is_ascii_string (mp->msgid))
+ if (!is_ascii_string (mp->msgid)
+ && po_charset_canonicalize (charset) != po_charset_utf8)
multiline_warning (xasprintf (_("warning: ")),
xasprintf (_("\
The following msgid contains non-ASCII characters.\n\
@@ -872,7 +873,8 @@ message_print_obsolete (mp, fp, charset, blank_line)
/* Print each of the message components. Wrap them nicely so they
are as readable as possible. */
- if (!is_ascii_string (mp->msgid))
+ if (!is_ascii_string (mp->msgid)
+ && po_charset_canonicalize (charset) != po_charset_utf8)
multiline_warning (xasprintf (_("warning: ")),
xasprintf (_("\
The following msgid contains non-ASCII characters.\n\
diff --git a/src/x-glade.c b/src/x-glade.c
index 715c059..369e077 100644
--- a/src/x-glade.c
+++ b/src/x-glade.c
@@ -372,6 +372,9 @@ do_extract_glade (fp, real_filename, logical_filename, mdlp)
{
mlp = mdlp->item[0]->messages;
+ /* expat feeds us strings in UTF-8 encoding. */
+ xgettext_current_source_encoding = po_charset_utf8;
+
logical_file_name = xstrdup (logical_filename);
init_keywords ();
@@ -413,14 +416,6 @@ error while reading \"%s\""), real_filename);
XML_ParserFree (parser);
- /* expat feeds us strings in UTF-8 encoding. If not all the strings
- were plain ASCII, set the charset in the header to UTF-8. */
- if (!is_ascii_message_list (mlp))
- {
- const char *canon_utf_8 = po_charset_canonicalize ("UTF-8");
- iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL);
- }
-
/* Close scanner. */
logical_file_name = NULL;
parser = NULL;
diff --git a/src/x-python.c b/src/x-python.c
index 823e852..161a103 100644
--- a/src/x-python.c
+++ b/src/x-python.c
@@ -1159,6 +1159,9 @@ extract_python (f, real_filename, logical_filename, mdlp)
{
message_list_ty *mlp = mdlp->item[0]->messages;
+ /* We convert our strings to UTF-8 encoding. */
+ xgettext_current_source_encoding = po_charset_utf8;
+
fp = f;
real_file_name = real_filename;
logical_file_name = xstrdup (logical_filename);
@@ -1176,14 +1179,6 @@ extract_python (f, real_filename, logical_filename, mdlp)
while (!extract_parenthesized (mlp, -1, 0))
;
- /* We converted our strings to UTF-8 encoding. If not all the strings
- were plain ASCII, set the charset in the header to UTF-8. */
- if (!is_ascii_message_list (mlp))
- {
- const char *canon_utf_8 = po_charset_canonicalize ("UTF-8");
- iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL);
- }
-
fp = NULL;
real_file_name = NULL;
logical_file_name = NULL;
diff --git a/src/x-tcl.c b/src/x-tcl.c
index a942796..b744632 100644
--- a/src/x-tcl.c
+++ b/src/x-tcl.c
@@ -1002,6 +1002,9 @@ extract_tcl (f, real_filename, logical_filename, mdlp)
{
mlp = mdlp->item[0]->messages;
+ /* We convert our strings to UTF-8 encoding. */
+ xgettext_current_source_encoding = po_charset_utf8;
+
fp = f;
real_file_name = real_filename;
logical_file_name = xstrdup (logical_filename);
@@ -1018,14 +1021,6 @@ extract_tcl (f, real_filename, logical_filename, mdlp)
/* Eat tokens until eof is seen. */
read_command_list ('\0');
- /* We converted our strings to UTF-8 encoding. If not all the strings
- were plain ASCII, set the charset in the header to UTF-8. */
- if (!is_ascii_message_list (mlp))
- {
- const char *canon_utf_8 = po_charset_canonicalize ("UTF-8");
- iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL);
- }
-
fp = NULL;
real_file_name = NULL;
logical_file_name = NULL;
diff --git a/src/xgettext.c b/src/xgettext.c
index 4224c2e..52f6747 100644
--- a/src/xgettext.c
+++ b/src/xgettext.c
@@ -48,6 +48,9 @@
#include "stpcpy.h"
#include "po.h"
#include "message.h"
+#include "po-charset.h"
+#include "msgl-iconv.h"
+#include "msgl-ascii.h"
#include "po-time.h"
#include "write-po.h"
#include "format.h"
@@ -111,6 +114,24 @@ static char *output_dir;
/* If nonzero omit header with information about this run. */
int xgettext_omit_header;
+/* Canonicalized encoding name for all input files. */
+const char *xgettext_global_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_global_source_encoding to UTF-8 (except from
+ ASCII or UTF-8, when this conversion is a no-op). */
+iconv_t xgettext_global_source_iconv;
+#endif
+
+/* Canonicalized encoding name for the current input file. */
+const char *xgettext_current_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_current_source_encoding to UTF-8 (except from
+ ASCII or UTF-8, when this conversion is a no-op). */
+iconv_t xgettext_current_source_iconv;
+#endif
+
/* Long options. */
static const struct option long_options[] =
{
@@ -127,6 +148,7 @@ static const struct option long_options[] =
{ "files-from", required_argument, NULL, 'f' },
{ "force-po", no_argument, &force_po, 1 },
{ "foreign-user", no_argument, NULL, CHAR_MAX + 2 },
+ { "from-code", required_argument, NULL, CHAR_MAX + 3 },
{ "help", no_argument, NULL, 'h' },
{ "indent", no_argument, NULL, 'i' },
{ "join-existing", no_argument, NULL, 'j' },
@@ -137,7 +159,7 @@ static const struct option long_options[] =
{ "msgstr-suffix", optional_argument, NULL, 'M' },
{ "no-escape", no_argument, NULL, 'e' },
{ "no-location", no_argument, &line_comment, 0 },
- { "no-wrap", no_argument, NULL, CHAR_MAX + 3 },
+ { "no-wrap", no_argument, NULL, CHAR_MAX + 4 },
{ "omit-header", no_argument, &xgettext_omit_header, 1 },
{ "output", required_argument, NULL, 'o' },
{ "output-dir", required_argument, NULL, 'p' },
@@ -220,6 +242,7 @@ main (argc, argv)
/* Set initial value of variables. */
default_domain = MESSAGE_DOMAIN_DEFAULT;
+ xgettext_global_source_encoding = po_charset_ascii;
while ((optchar = getopt_long (argc, argv,
"ac::Cd:D:eEf:Fhijk::l:L:m::M::no:p:sTVw:x:",
@@ -366,7 +389,12 @@ main (argc, argv)
case CHAR_MAX + 2: /* --foreign-user */
copyright_holder = "";
break;
- case CHAR_MAX + 3: /* --no-wrap */
+ case CHAR_MAX + 3: /* --from-code */
+ xgettext_global_source_encoding = po_charset_canonicalize (optarg);
+ if (xgettext_global_source_encoding == NULL)
+ xgettext_global_source_encoding = po_charset_ascii;
+ break;
+ case CHAR_MAX + 4: /* --no-wrap */
message_page_width_ignore ();
break;
default:
@@ -453,6 +481,37 @@ xgettext cannot work without keywords to look for"));
for (cnt = optind; cnt < argc; ++cnt)
string_list_append_unique (file_list, argv[cnt]);
+ /* Allocate converter from xgettext_global_source_encoding to UTF-8 (except
+ from ASCII or UTF-8, when this conversion is a no-op). */
+ if (xgettext_global_source_encoding != po_charset_ascii
+ && xgettext_global_source_encoding != po_charset_utf8)
+ {
+#if HAVE_ICONV
+ iconv_t cd;
+
+ /* Avoid glibc-2.1 bug with EUC-KR. */
+# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
+ if (strcmp (xgettext_global_source_encoding, "EUC-KR") == 0)
+ cd = (iconv_t)(-1);
+ else
+# endif
+ cd = iconv_open (po_charset_utf8, xgettext_global_source_encoding);
+ if (cd == (iconv_t)(-1))
+ error (EXIT_FAILURE, 0, _("\
+Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
+and iconv() does not support this conversion."),
+ xgettext_global_source_encoding, po_charset_utf8,
+ basename (program_name));
+ xgettext_global_source_iconv = cd;
+#else
+ error (EXIT_FAILURE, 0, _("\
+Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
+This version was built without iconv()."),
+ xgettext_global_source_encoding, po_charset_utf8,
+ basename (program_name));
+#endif
+ }
+
/* Allocate a message list to remember all the messages. */
mdlp = msgdomain_list_alloc (true);
@@ -519,6 +578,13 @@ warning: file `%s' extension `%s' is unknown; will try C"), fname, extension);
if (!xgettext_omit_header)
finalize_header (mdlp);
+ /* Free the allocated converter. */
+#if HAVE_ICONV
+ if (xgettext_global_source_encoding != po_charset_ascii
+ && xgettext_global_source_encoding != po_charset_utf8)
+ iconv_close (xgettext_global_source_iconv);
+#endif
+
/* Sorting the list of messages. */
if (sort_by_filepos)
msgdomain_list_sort_by_filepos (mdlp);
@@ -589,6 +655,14 @@ By default the language is guessed depending on the input file name extension.\n
printf ("\n");
/* xgettext: no-wrap */
printf (_("\
+Input file interpretation:\n\
+ --from-code=NAME encoding of input files\n\
+ (except for Python, Tcl, Glade)\n\
+By default the input files are assumed to be in ASCII.\n\
+"));
+ printf ("\n");
+ /* xgettext: no-wrap */
+ printf (_("\
Operation mode:\n\
-j, --join-existing join messages with existing file\n\
-x, --exclude-file=FILE.po entries from FILE.po are not extracted\n\
@@ -890,6 +964,13 @@ extract_from_file (file_name, extractor, mdlp)
char *real_file_name;
FILE *fp = xgettext_open (file_name, &logical_file_name, &real_file_name);
+ /* Set the default for the source file encoding. May be overridden by
+ the extractor function. */
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+#if HAVE_ICONV
+ xgettext_current_source_iconv = xgettext_global_source_iconv;
+#endif
+
extractor (fp, real_file_name, logical_file_name, mdlp);
if (fp != stdin)
@@ -905,6 +986,36 @@ extract_from_file (file_name, extractor, mdlp)
static struct formatstring_parser *current_formatstring_parser;
+/* Convert the given string from xgettext_current_source_encoding to
+ the output file encoding (i.e. ASCII or UTF-8). */
+#define CONVERT_STRING(string) \
+ if (xgettext_current_source_encoding == po_charset_ascii) \
+ { \
+ if (!is_ascii_string (string)) \
+ { \
+ char buffer[21]; \
+ if (pos->line_number == (size_t)(-1)) \
+ buffer[0] = '\0'; \
+ else \
+ sprintf (buffer, ":%ld", (long) pos->line_number); \
+ error (EXIT_FAILURE, 0, _("Non-ASCII string at %s%s.\nPlease specify the source encoding through --from-code."), \
+ pos->file_name, buffer); \
+ } \
+ } \
+ else if (xgettext_current_source_encoding != po_charset_utf8) \
+ { \
+ string = convert_string (xgettext_current_source_iconv, string); \
+ }
+
+#if !HAVE_ICONV
+/* If we don't have iconv(), the only supported values for
+ xgettext_global_source_encoding and thus also for
+ xgettext_current_source_encoding are ASCII and UTF-8.
+ convert_string() should not be called in this case. */
+#define convert_string(cd,string) (abort (), (string))
+#endif
+
+
message_ty *
remember_a_message (mlp, string, pos)
message_list_ty *mlp;
@@ -934,6 +1045,8 @@ remember_a_message (mlp, string, pos)
is_format[i] = undecided;
do_wrap = undecided;
+ CONVERT_STRING (msgid);
+
if (msgid[0] == '\0' && !xgettext_omit_header)
{
char buffer[21];
@@ -999,6 +1112,8 @@ meta information, not the empty string.\n")));
if (s == NULL)
break;
+ CONVERT_STRING (s);
+
/* To reduce the possibility of unwanted matches be do a two
step match: the line must contain `xgettext:' and one of
the possible format description strings. */
@@ -1102,6 +1217,8 @@ remember_a_message_plural (mp, string, pos)
msgid_plural = string;
+ CONVERT_STRING (msgid_plural);
+
/* See if the message is already a plural message. */
if (mp->msgid_plural == NULL)
{
@@ -1205,53 +1322,78 @@ finalize_header (mdlp)
{
/* If the generated PO file has plural forms, add a Plural-Forms template
to the constructed header. */
- bool has_plural;
- size_t i, j;
+ {
+ bool has_plural;
+ size_t i, j;
- has_plural = false;
- for (i = 0; i < mdlp->nitems; i++)
- {
- message_list_ty *mlp = mdlp->item[i]->messages;
+ has_plural = false;
+ for (i = 0; i < mdlp->nitems; i++)
+ {
+ message_list_ty *mlp = mdlp->item[i]->messages;
- for (j = 0; j < mlp->nitems; j++)
- {
- message_ty *mp = mlp->item[j];
+ for (j = 0; j < mlp->nitems; j++)
+ {
+ message_ty *mp = mlp->item[j];
- if (mp->msgid_plural != NULL)
- {
- has_plural = true;
- break;
- }
- }
- if (has_plural)
- break;
- }
+ if (mp->msgid_plural != NULL)
+ {
+ has_plural = true;
+ break;
+ }
+ }
+ if (has_plural)
+ break;
+ }
- if (has_plural)
- {
- message_ty *header = message_list_search (mdlp->item[0]->messages, "");
- if (header != NULL
- && strstr (header->msgstr, "Plural-Forms:") == NULL)
- {
- size_t insertpos = strlen (header->msgstr);
- const char *suffix;
- size_t suffix_len;
- char *new_msgstr;
-
- suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n";
- if (insertpos == 0 || header->msgstr[insertpos-1] == '\n')
- suffix++;
- suffix_len = strlen (suffix);
- new_msgstr = (char *) xmalloc (header->msgstr_len + suffix_len);
- memcpy (new_msgstr, header->msgstr, insertpos);
- memcpy (new_msgstr + insertpos, suffix, suffix_len);
- memcpy (new_msgstr + insertpos + suffix_len,
- header->msgstr + insertpos,
- header->msgstr_len - insertpos);
- header->msgstr = new_msgstr;
- header->msgstr_len = header->msgstr_len + suffix_len;
- }
- }
+ if (has_plural)
+ {
+ message_ty *header = message_list_search (mdlp->item[0]->messages, "");
+ if (header != NULL
+ && strstr (header->msgstr, "Plural-Forms:") == NULL)
+ {
+ size_t insertpos = strlen (header->msgstr);
+ const char *suffix;
+ size_t suffix_len;
+ char *new_msgstr;
+
+ suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n";
+ if (insertpos == 0 || header->msgstr[insertpos-1] == '\n')
+ suffix++;
+ suffix_len = strlen (suffix);
+ new_msgstr = (char *) xmalloc (header->msgstr_len + suffix_len);
+ memcpy (new_msgstr, header->msgstr, insertpos);
+ memcpy (new_msgstr + insertpos, suffix, suffix_len);
+ memcpy (new_msgstr + insertpos + suffix_len,
+ header->msgstr + insertpos,
+ header->msgstr_len - insertpos);
+ header->msgstr = new_msgstr;
+ header->msgstr_len = header->msgstr_len + suffix_len;
+ }
+ }
+ }
+
+ /* If not all the strings were plain ASCII, set the charset in the header
+ to UTF-8. All messages have already been converted to UTF-8 in
+ remember_a_message and remember_a_message_plural. */
+ {
+ bool has_nonascii = false;
+ size_t i;
+
+ for (i = 0; i < mdlp->nitems; i++)
+ {
+ message_list_ty *mlp = mdlp->item[i]->messages;
+
+ if (!is_ascii_message_list (mlp))
+ has_nonascii = true;
+ }
+
+ if (has_nonascii)
+ {
+ message_list_ty *mlp = mdlp->item[0]->messages;
+
+ iconv_message_list (mlp, po_charset_utf8, po_charset_utf8, NULL);
+ }
+ }
}
diff --git a/src/xgettext.h b/src/xgettext.h
index 6356449..ad36fbd 100644
--- a/src/xgettext.h
+++ b/src/xgettext.h
@@ -21,6 +21,11 @@
#define _XGETTEXT_H
#include <stddef.h>
+
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
+
#include "message.h"
#include "pos.h"
@@ -37,6 +42,24 @@ extern bool substring_match;
extern void split_keywordspec PARAMS ((const char *spec, const char **endp,
int *argnum1p, int *argnum2p));
+/* Canonicalized encoding name for all input files. */
+extern const char *xgettext_global_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_global_source_encoding to UTF-8 (except from
+ ASCII or UTF-8, when this conversion is a no-op). */
+extern iconv_t xgettext_global_source_iconv;
+#endif
+
+/* Canonicalized encoding name for the current input file. */
+extern const char *xgettext_current_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_current_source_encoding to UTF-8 (except from
+ ASCII or UTF-8, when this conversion is a no-op). */
+extern iconv_t xgettext_current_source_iconv;
+#endif
+
/* List of messages whose msgids must not be extracted, or NULL.
Used by remember_a_message(). */
extern message_list_ty *exclude;
diff --git a/tests/ChangeLog b/tests/ChangeLog
index 031d109..5f76d0f 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,9 @@
+2002-11-05 Bruno Haible <bruno@clisp.org>
+
+ * xgettext-23: New file.
+ * msgmerge-21: New file.
+ * Makefile.am (TESTS): Add them.
+
2002-11-01 Bruno Haible <bruno@clisp.org>
* format-awk-1, format-awk-2, format-c-1, format-c-2, format-elisp-1,
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 649ec7e..73dfdf8 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -39,13 +39,14 @@ TESTS = gettext-1 gettext-2 \
msgmerge-1 msgmerge-2 msgmerge-3 msgmerge-4 msgmerge-5 msgmerge-6 \
msgmerge-7 msgmerge-8 msgmerge-9 msgmerge-10 msgmerge-11 msgmerge-12 \
msgmerge-13 msgmerge-14 msgmerge-15 msgmerge-16 msgmerge-17 \
- msgmerge-18 msgmerge-19 msgmerge-20 \
+ msgmerge-18 msgmerge-19 msgmerge-20 msgmerge-21 \
msgunfmt-1 msgunfmt-2 msgunfmt-3 \
msguniq-1 msguniq-2 msguniq-3 \
xgettext-1 xgettext-2 xgettext-3 xgettext-4 xgettext-5 xgettext-6 \
xgettext-7 xgettext-8 xgettext-9 xgettext-10 xgettext-11 xgettext-12 \
xgettext-13 xgettext-14 xgettext-15 xgettext-16 xgettext-17 \
xgettext-18 xgettext-19 xgettext-20 xgettext-21 xgettext-22 \
+ xgettext-23 \
format-awk-1 format-awk-2 \
format-c-1 format-c-2 format-c-3 format-c-4 \
format-elisp-1 format-elisp-2 \
diff --git a/tests/msgmerge-21 b/tests/msgmerge-21
new file mode 100755
index 0000000..11e52dc
--- /dev/null
+++ b/tests/msgmerge-21
@@ -0,0 +1,99 @@
+#! /bin/sh
+
+# Test merging of a ref.pot in UTF-8 encoding against a def.po in legacy
+# encoding (that was produced from an older version of ref.pot, in ASCII
+# encoding).
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles mm-test21-ru.po"
+cat <<\EOF > mm-test21-ru.po
+# Russian messages for CLISP
+# Copyright (C) 1998 Free Software Foundation, Inc.
+# Eduard Haritonov <hed@iis.nsk.su>, 1998.
+# Arseny Slobodjuck <ampy@ich.dvo.ru>, 2002.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: GNU elvis 1.7\n"
+"POT-Creation-Date: 2002-11-01 01:22+0100\n"
+"PO-Revision-Date: 2002-11-01 01:23+0100\n"
+"Last-Translator: Arseny Slobodjuck <ampy@ich.dvo.ru>\n"
+"Language-Team: Russian <ru@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=koi8-r\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#: arith.c:9
+msgid "Division durch Null"
+msgstr "деление на нуль"
+EOF
+
+tmpfiles="$tmpfiles mm-test21.pot"
+cat <<\EOF > mm-test21.pot
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2002-11-01 01:22+0100\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#: arith.c:10
+msgid "Division durch Null"
+msgstr ""
+
+#: arith.c:15
+msgid "Überlauf"
+msgstr ""
+EOF
+
+tmpfiles="$tmpfiles mm-test21.out"
+: ${MSGMERGE=msgmerge}
+${MSGMERGE} -q mm-test21-ru.po mm-test21.pot -o mm-test21.out
+test $? = 0 || { rm -fr $tmpfiles; exit 1; }
+
+tmpfiles="$tmpfiles mm-test21.ok"
+cat <<\EOF > mm-test21.ok
+# Russian messages for CLISP
+# Copyright (C) 1998 Free Software Foundation, Inc.
+# Eduard Haritonov <hed@iis.nsk.su>, 1998.
+# Arseny Slobodjuck <ampy@ich.dvo.ru>, 2002.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: GNU elvis 1.7\n"
+"POT-Creation-Date: 2002-11-01 01:22+0100\n"
+"PO-Revision-Date: 2002-11-01 01:23+0100\n"
+"Last-Translator: Arseny Slobodjuck <ampy@ich.dvo.ru>\n"
+"Language-Team: Russian <ru@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#: arith.c:10
+msgid "Division durch Null"
+msgstr "деление на нуль"
+
+#: arith.c:15
+msgid "Überlauf"
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} mm-test21.ok mm-test21.out
+result=$?
+
+rm -fr $tmpfiles
+
+exit $result
diff --git a/tests/xgettext-23 b/tests/xgettext-23
new file mode 100755
index 0000000..5928b6c
--- /dev/null
+++ b/tests/xgettext-23
@@ -0,0 +1,60 @@
+#! /bin/sh
+
+# Test extraction of non-ASCII msgids.
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles xg-test23.c"
+cat <<EOF > xg-test23.c
+void foo (int option)
+{
+ printf (_("%s: neznámý přepínač -- %c\n"), option);
+ printf (_("%s: přepínač vyžaduje argument -- %c\n"), option);
+}
+EOF
+
+tmpfiles="$tmpfiles xg-test23.po"
+: ${XGETTEXT=xgettext}
+${XGETTEXT} --no-location -k_ -o xg-test23.po xg-test23.c 2>/dev/null
+test $? = 1 || { rm -fr $tmpfiles; exit 1; }
+${XGETTEXT} --no-location -k_ --from-code=iso-8859-2 -o xg-test23.po xg-test23.c
+test $? = 0 || { rm -fr $tmpfiles; exit 1; }
+
+tmpfiles="$tmpfiles xg-test23.pot"
+sed -e '/POT-Creation-Date/d' < xg-test23.po > xg-test23.pot
+
+tmpfiles="$tmpfiles xg-test23.ok"
+cat <<EOF > xg-test23.ok
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#, c-format
+msgid "%s: neznámý přepínač -- %c\n"
+msgstr ""
+
+#, c-format
+msgid "%s: přepínač vyžaduje argument -- %c\n"
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} xg-test23.ok xg-test23.pot
+result=$?
+
+rm -fr $tmpfiles
+
+exit $result