diff options
-rw-r--r-- | NEWS | 4 | ||||
-rw-r--r-- | doc/ChangeLog | 4 | ||||
-rw-r--r-- | doc/xgettext.texi | 14 | ||||
-rw-r--r-- | src/ChangeLog | 40 | ||||
-rw-r--r-- | src/msgcmp.c | 39 | ||||
-rw-r--r-- | src/msginit.c | 31 | ||||
-rw-r--r-- | src/msgl-iconv.c | 15 | ||||
-rw-r--r-- | src/msgl-iconv.h | 9 | ||||
-rw-r--r-- | src/msgmerge.c | 35 | ||||
-rw-r--r-- | src/po-charset.c | 7 | ||||
-rw-r--r-- | src/po-charset.h | 3 | ||||
-rw-r--r-- | src/write-po.c | 6 | ||||
-rw-r--r-- | src/x-glade.c | 11 | ||||
-rw-r--r-- | src/x-python.c | 11 | ||||
-rw-r--r-- | src/x-tcl.c | 11 | ||||
-rw-r--r-- | src/xgettext.c | 232 | ||||
-rw-r--r-- | src/xgettext.h | 23 | ||||
-rw-r--r-- | tests/ChangeLog | 6 | ||||
-rw-r--r-- | tests/Makefile.am | 3 | ||||
-rwxr-xr-x | tests/msgmerge-21 | 99 | ||||
-rwxr-xr-x | tests/xgettext-23 | 60 |
21 files changed, 583 insertions, 80 deletions
@@ -6,6 +6,10 @@ Version 0.11.6 - October 2002 strings in C++. This is needed for proper internationalization of C++ programs. +* xgettext now supports msgid strings in other encodings than ASCII. + xgettext has a new option --from-code that specifies the encoding of the + source files. The resulting POT files are UTF-8 encoded. + * Compatibility with automake-1.7. Version 0.11.5 - August 2002 diff --git a/doc/ChangeLog b/doc/ChangeLog index a506f7f..8cfe1f7 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,7 @@ +2002-11-05 Bruno Haible <bruno@clisp.org> + + * xgettext.texi: Document --from-code option. + 2002-10-30 Bruno Haible <bruno@clisp.org> * gettext.texi (C): Refer to node Top of autosprintf.info. Needed to diff --git a/doc/xgettext.texi b/doc/xgettext.texi index 7385d50..4c0ba4c 100644 --- a/doc/xgettext.texi +++ b/doc/xgettext.texi @@ -84,6 +84,20 @@ This is a shorthand for @code{--language=C++}. By default the language is guessed depending on the input file name extension. +@subsection Input file interpretation + +@table @samp +@item --from-code=@var{name} +@opindex --from-code@r{, @code{xgettext} option} +Specifies the encoding of the input files. This option is needed only +if some untranslated message strings or their corresponding comments +contain non-ASCII characters. Note that Python, Tcl, and Glade input +files are always assumed to be in UTF-8, regardless of this option. + +@end table + +By default the input files are assumed to be in ASCII. + @subsection Operation mode @table @samp diff --git a/src/ChangeLog b/src/ChangeLog index 21a8cd6..9c872b5 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,43 @@ +2002-11-05 Bruno Haible <bruno@clisp.org> + + Allow non-ASCII msgids in POT files. + * po-charset.h (po_charset_utf8): New declaration. + * po-charset.c (utf8, po_charset_utf8): New variables. + (po_charset_canonicalize): Use po_charset_utf8. + * msgl-iconv.h: Include iconv.h. + (convert_string): New declaration. 
+ * msgl-iconv.c (convert_string): Export function. + (convert_msgid): New function. + (iconv_message_list): Call it. + * xgettext.h: Include iconv.h. + (xgettext_global_source_encoding, xgettext_global_source_iconv, + xgettext_current_source_encoding, xgettext_current_source_iconv): New + declarations. + * xgettext.c (xgettext_global_source_encoding, + xgettext_global_source_iconv, xgettext_current_source_encoding, + xgettext_current_source_iconv): New variables. + (long_options): New option --from-code. + (main): Initialize xgettext_global_source_encoding. Handle option + --from-code. Initialize and destroy xgettext_global_source_iconv. + (usage): Document option --from-code. + (extract_from_file): Set xgettext_current_source_encoding and + xgettext_current_source_iconv. + (CONVERT_STRING): New macro. + (remember_a_message, remember_a_message_plural): Call CONVERT_STRING. + (finalize_header): Set the charset in the header here. + * x-glade.c (do_extract_glade): Set xgettext_current_source_encoding. + Don't set the result's header charset; this is now done in xgettext.c. + * x-python.c (extract_python): Likewise. + * x-tcl.c (extract_tcl): Likewise. + * write-po.c (message_print, message_print_obsolete): Don't warn about + non-ASCII msgids if the file's encoding is UTF-8. + * msginit.c (content_type): Add header argument. Use charset UTF-8 + if that was already the POT file's encoding. + (fields): Update. + * msgmerge.c (merge): If the POT file was in UTF-8, convert the + definitions to UTF-8. + * msgcmp.c (compare): Likewise. + 2002-11-01 Bruno Haible <bruno@clisp.org> * msgcmp.c: Include read-po.h. 
diff --git a/src/msgcmp.c b/src/msgcmp.c index 615acf0..0919f52 100644 --- a/src/msgcmp.c +++ b/src/msgcmp.c @@ -33,9 +33,12 @@ #include "basename.h" #include "message.h" #include "exit.h" -#include "gettext.h" #include "read-po.h" #include "po.h" +#include "msgl-iconv.h" +#include "strstr.h" +#include "strcase.h" +#include "gettext.h" #define _(str) gettext (str) @@ -301,6 +304,40 @@ compare (fn1, fn2) the xgettext program. */ ref = remove_obsoletes (read_po_file (fn2)); + /* The references file can be either in ASCII or in UTF-8. If it is + in UTF-8, we have to convert the definitions to UTF-8 as well. */ + { + bool was_utf8 = false; + for (k = 0; k < ref->nitems; k++) + { + message_list_ty *mlp = ref->item[k]->messages; + + for (j = 0; j < mlp->nitems; j++) + if (mlp->item[j]->msgid[0] == '\0' /* && !mlp->item[j]->obsolete */) + { + const char *header = mlp->item[j]->msgstr; + + if (header != NULL) + { + const char *charsetstr = strstr (header, "charset="); + + if (charsetstr != NULL) + { + size_t len; + + charsetstr += strlen ("charset="); + len = strcspn (charsetstr, " \t\n"); + if (len == strlen ("UTF-8") + && strncasecmp (charsetstr, "UTF-8", len) == 0) + was_utf8 = true; + } + } + } + } + if (was_utf8) + def = iconv_msgdomain_list (def, "UTF-8", fn1); + } + empty_list = message_list_alloc (false); /* Every entry in the xgettext generated file must be matched by a diff --git a/src/msginit.c b/src/msginit.c index d1c4536..d443c2a 100644 --- a/src/msginit.c +++ b/src/msginit.c @@ -69,6 +69,8 @@ #include "progname.h" #include "basename.h" #include "strpbrk.h" +#include "strstr.h" +#include "strcase.h" #include "message.h" #include "read-po.h" #include "write-po.h" @@ -145,7 +147,7 @@ static const char *last_translator PARAMS ((void)); static const char *language_team_address PARAMS ((void)); static const char *language_team PARAMS ((void)); static const char *mime_version PARAMS ((void)); -static const char *content_type PARAMS ((void)); +static const char 
*content_type PARAMS ((const char *header)); static const char *content_transfer_encoding PARAMS ((void)); static const char *plural_forms PARAMS ((void)); static char *get_field PARAMS ((const char *header, const char *field)); @@ -1212,9 +1214,30 @@ mime_version () /* Construct the value for the Content-Type field. */ static const char * -content_type () +content_type (header) + const char *header; { - return xasprintf ("text/plain; charset=%s", canonical_locale_charset ()); + bool was_utf8; + const char *old_field; + + /* If the POT file contains charset=UTF-8, it means that the POT file + contains non-ASCII characters, and we keep the UTF-8 encoding. + Otherwise, when the POT file is plain ASCII, we use the locale's + encoding. */ + was_utf8 = false; + old_field = get_field (header, "Content-Type"); + if (old_field != NULL) + { + const char *charsetstr = strstr (old_field, "charset="); + + if (charsetstr != NULL) + { + charsetstr += strlen ("charset="); + was_utf8 = (strcasecmp (charsetstr, "UTF-8") == 0); + } + } + return xasprintf ("text/plain; charset=%s", + was_utf8 ? 
"UTF-8" : canonical_locale_charset ()); } @@ -1259,7 +1282,7 @@ fields[] = { "Last-Translator", last_translator, NULL }, { "Language-Team", language_team, NULL }, { "MIME-Version", mime_version, NULL }, - { "Content-Type", content_type, NULL }, + { "Content-Type", NULL, content_type }, { "Content-Transfer-Encoding", content_transfer_encoding, NULL }, { "Plural-Forms", plural_forms, NULL } }; diff --git a/src/msgl-iconv.c b/src/msgl-iconv.c index 6620cfd..61633b5 100644 --- a/src/msgl-iconv.c +++ b/src/msgl-iconv.c @@ -54,8 +54,8 @@ static int iconv_string PARAMS ((iconv_t cd, const char *start, const char *end, char **resultp, size_t *lengthp)); -static const char *convert_string PARAMS ((iconv_t cd, const char *string)); static void convert_string_list PARAMS ((iconv_t cd, string_list_ty *slp)); +static void convert_msgid PARAMS ((iconv_t cd, message_ty *mp)); static void convert_msgstr PARAMS ((iconv_t cd, message_ty *mp)); #endif @@ -184,7 +184,7 @@ iconv_string (cd, start, end, resultp, lengthp) #undef tmpbufsize } -static const char * +char * convert_string (cd, string) iconv_t cd; const char *string; @@ -217,6 +217,16 @@ convert_string_list (cd, slp) } static void +convert_msgid (cd, mp) + iconv_t cd; + message_ty *mp; +{ + mp->msgid = convert_string (cd, mp->msgid); + if (mp->msgid_plural != NULL) + mp->msgid_plural = convert_string (cd, mp->msgid_plural); +} + +static void convert_msgstr (cd, mp) iconv_t cd; message_ty *mp; @@ -377,6 +387,7 @@ and iconv() does not support this conversion."), convert_string_list (cd, mp->comment); convert_string_list (cd, mp->comment_dot); + convert_msgid (cd, mp); convert_msgstr (cd, mp); } diff --git a/src/msgl-iconv.h b/src/msgl-iconv.h index 45cd5de..976427a 100644 --- a/src/msgl-iconv.h +++ b/src/msgl-iconv.h @@ -19,8 +19,17 @@ #ifndef _MSGL_ICONV_H #define _MSGL_ICONV_H +#if HAVE_ICONV +#include <iconv.h> +#endif + #include "message.h" +#if HAVE_ICONV +/* Converts the STRING through the conversion descriptor CD. 
*/ +extern char *convert_string PARAMS ((iconv_t cd, const char *string)); +#endif + /* Converts the message list MLP to the (already canonicalized) encoding CANON_TO_CODE. The (already canonicalized) encoding before conversion can be passed as CANON_FROM_CODE; if NULL is passed instead, the diff --git a/src/msgmerge.c b/src/msgmerge.c index dc71eb3..adae806 100644 --- a/src/msgmerge.c +++ b/src/msgmerge.c @@ -44,6 +44,7 @@ #include "stpcpy.h" #include "stpncpy.h" #include "po.h" +#include "msgl-iconv.h" #include "msgl-equal.h" #include "plural-exp.h" #include "backupfile.h" @@ -1017,6 +1018,40 @@ merge (fn1, fn2, defp) message_list_prepend (ref->item[k]->messages, refheader); } + /* The references file can be either in ASCII or in UTF-8. If it is + in UTF-8, we have to convert the definitions to UTF-8 as well. */ + { + bool was_utf8 = false; + for (k = 0; k < ref->nitems; k++) + { + message_list_ty *mlp = ref->item[k]->messages; + + for (j = 0; j < mlp->nitems; j++) + if (mlp->item[j]->msgid[0] == '\0' && !mlp->item[j]->obsolete) + { + const char *header = mlp->item[j]->msgstr; + + if (header != NULL) + { + const char *charsetstr = strstr (header, "charset="); + + if (charsetstr != NULL) + { + size_t len; + + charsetstr += strlen ("charset="); + len = strcspn (charsetstr, " \t\n"); + if (len == strlen ("UTF-8") + && strncasecmp (charsetstr, "UTF-8", len) == 0) + was_utf8 = true; + } + } + } + } + if (was_utf8) + def = iconv_msgdomain_list (def, "UTF-8", fn1); + } + result = msgdomain_list_alloc (false); processed = 0; diff --git a/src/po-charset.c b/src/po-charset.c index b759716..f3d7368 100644 --- a/src/po-charset.c +++ b/src/po-charset.c @@ -44,6 +44,11 @@ static const char ascii[] = "ASCII"; /* The canonicalized encoding name for ASCII. */ const char *po_charset_ascii = ascii; +static const char utf8[] = "UTF-8"; + +/* The canonicalized encoding name for UTF-8. */ +const char *po_charset_utf8 = utf8; + /* Canonicalize an encoding name. 
*/ const char * po_charset_canonicalize (charset) @@ -96,7 +101,7 @@ po_charset_canonicalize (charset) "TIS-620", "VISCII", "GEORGIAN-PS", - "UTF-8" + utf8 }; size_t i; diff --git a/src/po-charset.h b/src/po-charset.h index 216ab33..dfc7f6f 100644 --- a/src/po-charset.h +++ b/src/po-charset.h @@ -33,6 +33,9 @@ extern const char *po_charset_canonicalize PARAMS ((const char *charset)); /* The canonicalized encoding name for ASCII. */ extern const char *po_charset_ascii; +/* The canonicalized encoding name for UTF-8. */ +extern const char *po_charset_utf8; + /* Test for ASCII compatibility. */ extern bool po_charset_ascii_compatible PARAMS ((const char *canon_charset)); diff --git a/src/write-po.c b/src/write-po.c index 6feb002..5fec377 100644 --- a/src/write-po.c +++ b/src/write-po.c @@ -778,7 +778,8 @@ message_print (mp, fp, charset, blank_line, debug) /* Print each of the message components. Wrap them nicely so they are as readable as possible. If there is no recorded msgstr for this domain, emit an empty string. */ - if (!is_ascii_string (mp->msgid)) + if (!is_ascii_string (mp->msgid) + && po_charset_canonicalize (charset) != po_charset_utf8) multiline_warning (xasprintf (_("warning: ")), xasprintf (_("\ The following msgid contains non-ASCII characters.\n\ @@ -872,7 +873,8 @@ message_print_obsolete (mp, fp, charset, blank_line) /* Print each of the message components. Wrap them nicely so they are as readable as possible. */ - if (!is_ascii_string (mp->msgid)) + if (!is_ascii_string (mp->msgid) + && po_charset_canonicalize (charset) != po_charset_utf8) multiline_warning (xasprintf (_("warning: ")), xasprintf (_("\ The following msgid contains non-ASCII characters.\n\ diff --git a/src/x-glade.c b/src/x-glade.c index 715c059..369e077 100644 --- a/src/x-glade.c +++ b/src/x-glade.c @@ -372,6 +372,9 @@ do_extract_glade (fp, real_filename, logical_filename, mdlp) { mlp = mdlp->item[0]->messages; + /* expat feeds us strings in UTF-8 encoding. 
*/ + xgettext_current_source_encoding = po_charset_utf8; + logical_file_name = xstrdup (logical_filename); init_keywords (); @@ -413,14 +416,6 @@ error while reading \"%s\""), real_filename); XML_ParserFree (parser); - /* expat feeds us strings in UTF-8 encoding. If not all the strings - were plain ASCII, set the charset in the header to UTF-8. */ - if (!is_ascii_message_list (mlp)) - { - const char *canon_utf_8 = po_charset_canonicalize ("UTF-8"); - iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL); - } - /* Close scanner. */ logical_file_name = NULL; parser = NULL; diff --git a/src/x-python.c b/src/x-python.c index 823e852..161a103 100644 --- a/src/x-python.c +++ b/src/x-python.c @@ -1159,6 +1159,9 @@ extract_python (f, real_filename, logical_filename, mdlp) { message_list_ty *mlp = mdlp->item[0]->messages; + /* We convert our strings to UTF-8 encoding. */ + xgettext_current_source_encoding = po_charset_utf8; + fp = f; real_file_name = real_filename; logical_file_name = xstrdup (logical_filename); @@ -1176,14 +1179,6 @@ extract_python (f, real_filename, logical_filename, mdlp) while (!extract_parenthesized (mlp, -1, 0)) ; - /* We converted our strings to UTF-8 encoding. If not all the strings - were plain ASCII, set the charset in the header to UTF-8. */ - if (!is_ascii_message_list (mlp)) - { - const char *canon_utf_8 = po_charset_canonicalize ("UTF-8"); - iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL); - } - fp = NULL; real_file_name = NULL; logical_file_name = NULL; diff --git a/src/x-tcl.c b/src/x-tcl.c index a942796..b744632 100644 --- a/src/x-tcl.c +++ b/src/x-tcl.c @@ -1002,6 +1002,9 @@ extract_tcl (f, real_filename, logical_filename, mdlp) { mlp = mdlp->item[0]->messages; + /* We convert our strings to UTF-8 encoding. 
*/ + xgettext_current_source_encoding = po_charset_utf8; + fp = f; real_file_name = real_filename; logical_file_name = xstrdup (logical_filename); @@ -1018,14 +1021,6 @@ extract_tcl (f, real_filename, logical_filename, mdlp) /* Eat tokens until eof is seen. */ read_command_list ('\0'); - /* We converted our strings to UTF-8 encoding. If not all the strings - were plain ASCII, set the charset in the header to UTF-8. */ - if (!is_ascii_message_list (mlp)) - { - const char *canon_utf_8 = po_charset_canonicalize ("UTF-8"); - iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL); - } - fp = NULL; real_file_name = NULL; logical_file_name = NULL; diff --git a/src/xgettext.c b/src/xgettext.c index 4224c2e..52f6747 100644 --- a/src/xgettext.c +++ b/src/xgettext.c @@ -48,6 +48,9 @@ #include "stpcpy.h" #include "po.h" #include "message.h" +#include "po-charset.h" +#include "msgl-iconv.h" +#include "msgl-ascii.h" #include "po-time.h" #include "write-po.h" #include "format.h" @@ -111,6 +114,24 @@ static char *output_dir; /* If nonzero omit header with information about this run. */ int xgettext_omit_header; +/* Canonicalized encoding name for all input files. */ +const char *xgettext_global_source_encoding; + +#if HAVE_ICONV +/* Converter from xgettext_global_source_encoding to UTF-8 (except from + ASCII or UTF-8, when this conversion is a no-op). */ +iconv_t xgettext_global_source_iconv; +#endif + +/* Canonicalized encoding name for the current input file. */ +const char *xgettext_current_source_encoding; + +#if HAVE_ICONV +/* Converter from xgettext_current_source_encoding to UTF-8 (except from + ASCII or UTF-8, when this conversion is a no-op). */ +iconv_t xgettext_current_source_iconv; +#endif + /* Long options. 
*/ static const struct option long_options[] = { @@ -127,6 +148,7 @@ static const struct option long_options[] = { "files-from", required_argument, NULL, 'f' }, { "force-po", no_argument, &force_po, 1 }, { "foreign-user", no_argument, NULL, CHAR_MAX + 2 }, + { "from-code", required_argument, NULL, CHAR_MAX + 3 }, { "help", no_argument, NULL, 'h' }, { "indent", no_argument, NULL, 'i' }, { "join-existing", no_argument, NULL, 'j' }, @@ -137,7 +159,7 @@ static const struct option long_options[] = { "msgstr-suffix", optional_argument, NULL, 'M' }, { "no-escape", no_argument, NULL, 'e' }, { "no-location", no_argument, &line_comment, 0 }, - { "no-wrap", no_argument, NULL, CHAR_MAX + 3 }, + { "no-wrap", no_argument, NULL, CHAR_MAX + 4 }, { "omit-header", no_argument, &xgettext_omit_header, 1 }, { "output", required_argument, NULL, 'o' }, { "output-dir", required_argument, NULL, 'p' }, @@ -220,6 +242,7 @@ main (argc, argv) /* Set initial value of variables. */ default_domain = MESSAGE_DOMAIN_DEFAULT; + xgettext_global_source_encoding = po_charset_ascii; while ((optchar = getopt_long (argc, argv, "ac::Cd:D:eEf:Fhijk::l:L:m::M::no:p:sTVw:x:", @@ -366,7 +389,12 @@ main (argc, argv) case CHAR_MAX + 2: /* --foreign-user */ copyright_holder = ""; break; - case CHAR_MAX + 3: /* --no-wrap */ + case CHAR_MAX + 3: /* --from-code */ + xgettext_global_source_encoding = po_charset_canonicalize (optarg); + if (xgettext_global_source_encoding == NULL) + xgettext_global_source_encoding = po_charset_ascii; + break; + case CHAR_MAX + 4: /* --no-wrap */ message_page_width_ignore (); break; default: @@ -453,6 +481,37 @@ xgettext cannot work without keywords to look for")); for (cnt = optind; cnt < argc; ++cnt) string_list_append_unique (file_list, argv[cnt]); + /* Allocate converter from xgettext_global_source_encoding to UTF-8 (except + from ASCII or UTF-8, when this conversion is a no-op). 
*/ + if (xgettext_global_source_encoding != po_charset_ascii + && xgettext_global_source_encoding != po_charset_utf8) + { +#if HAVE_ICONV + iconv_t cd; + + /* Avoid glibc-2.1 bug with EUC-KR. */ +# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION + if (strcmp (xgettext_global_source_encoding, "EUC-KR") == 0) + cd = (iconv_t)(-1); + else +# endif + cd = iconv_open (po_charset_utf8, xgettext_global_source_encoding); + if (cd == (iconv_t)(-1)) + error (EXIT_FAILURE, 0, _("\ +Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \ +and iconv() does not support this conversion."), + xgettext_global_source_encoding, po_charset_utf8, + basename (program_name)); + xgettext_global_source_iconv = cd; +#else + error (EXIT_FAILURE, 0, _("\ +Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \ +This version was built without iconv()."), + xgettext_global_source_encoding, po_charset_utf8, + basename (program_name)); +#endif + } + /* Allocate a message list to remember all the messages. */ mdlp = msgdomain_list_alloc (true); @@ -519,6 +578,13 @@ warning: file `%s' extension `%s' is unknown; will try C"), fname, extension); if (!xgettext_omit_header) finalize_header (mdlp); + /* Free the allocated converter. */ +#if HAVE_ICONV + if (xgettext_global_source_encoding != po_charset_ascii + && xgettext_global_source_encoding != po_charset_utf8) + iconv_close (xgettext_global_source_iconv); +#endif + /* Sorting the list of messages. 
*/ if (sort_by_filepos) msgdomain_list_sort_by_filepos (mdlp); @@ -589,6 +655,14 @@ By default the language is guessed depending on the input file name extension.\n printf ("\n"); /* xgettext: no-wrap */ printf (_("\ +Input file interpretation:\n\ + --from-code=NAME encoding of input files\n\ + (except for Python, Tcl, Glade)\n\ +By default the input files are assumed to be in ASCII.\n\ +")); + printf ("\n"); + /* xgettext: no-wrap */ + printf (_("\ Operation mode:\n\ -j, --join-existing join messages with existing file\n\ -x, --exclude-file=FILE.po entries from FILE.po are not extracted\n\ @@ -890,6 +964,13 @@ extract_from_file (file_name, extractor, mdlp) char *real_file_name; FILE *fp = xgettext_open (file_name, &logical_file_name, &real_file_name); + /* Set the default for the source file encoding. May be overridden by + the extractor function. */ + xgettext_current_source_encoding = xgettext_global_source_encoding; +#if HAVE_ICONV + xgettext_current_source_iconv = xgettext_global_source_iconv; +#endif + extractor (fp, real_file_name, logical_file_name, mdlp); if (fp != stdin) @@ -905,6 +986,36 @@ extract_from_file (file_name, extractor, mdlp) static struct formatstring_parser *current_formatstring_parser; +/* Convert the given string from xgettext_current_source_encoding to + the output file encoding (i.e. ASCII or UTF-8). 
*/ +#define CONVERT_STRING(string) \ + if (xgettext_current_source_encoding == po_charset_ascii) \ + { \ + if (!is_ascii_string (string)) \ + { \ + char buffer[21]; \ + if (pos->line_number == (size_t)(-1)) \ + buffer[0] = '\0'; \ + else \ + sprintf (buffer, ":%ld", (long) pos->line_number); \ + error (EXIT_FAILURE, 0, _("Non-ASCII string at %s%s.\nPlease specify the source encoding through --from-code."), \ + pos->file_name, buffer); \ + } \ + } \ + else if (xgettext_current_source_encoding != po_charset_utf8) \ + { \ + string = convert_string (xgettext_current_source_iconv, string); \ + } + +#if !HAVE_ICONV +/* If we don't have iconv(), the only supported values for + xgettext_global_source_encoding and thus also for + xgettext_current_source_encoding are ASCII and UTF-8. + convert_string() should not be called in this case. */ +#define convert_string(cd,string) (abort (), (string)) +#endif + + message_ty * remember_a_message (mlp, string, pos) message_list_ty *mlp; @@ -934,6 +1045,8 @@ remember_a_message (mlp, string, pos) is_format[i] = undecided; do_wrap = undecided; + CONVERT_STRING (msgid); + if (msgid[0] == '\0' && !xgettext_omit_header) { char buffer[21]; @@ -999,6 +1112,8 @@ meta information, not the empty string.\n"))); if (s == NULL) break; + CONVERT_STRING (s); + /* To reduce the possibility of unwanted matches be do a two step match: the line must contain `xgettext:' and one of the possible format description strings. */ @@ -1102,6 +1217,8 @@ remember_a_message_plural (mp, string, pos) msgid_plural = string; + CONVERT_STRING (msgid_plural); + /* See if the message is already a plural message. */ if (mp->msgid_plural == NULL) { @@ -1205,53 +1322,78 @@ finalize_header (mdlp) { /* If the generated PO file has plural forms, add a Plural-Forms template to the constructed header. 
*/ - bool has_plural; - size_t i, j; + { + bool has_plural; + size_t i, j; - has_plural = false; - for (i = 0; i < mdlp->nitems; i++) - { - message_list_ty *mlp = mdlp->item[i]->messages; + has_plural = false; + for (i = 0; i < mdlp->nitems; i++) + { + message_list_ty *mlp = mdlp->item[i]->messages; - for (j = 0; j < mlp->nitems; j++) - { - message_ty *mp = mlp->item[j]; + for (j = 0; j < mlp->nitems; j++) + { + message_ty *mp = mlp->item[j]; - if (mp->msgid_plural != NULL) - { - has_plural = true; - break; - } - } - if (has_plural) - break; - } + if (mp->msgid_plural != NULL) + { + has_plural = true; + break; + } + } + if (has_plural) + break; + } - if (has_plural) - { - message_ty *header = message_list_search (mdlp->item[0]->messages, ""); - if (header != NULL - && strstr (header->msgstr, "Plural-Forms:") == NULL) - { - size_t insertpos = strlen (header->msgstr); - const char *suffix; - size_t suffix_len; - char *new_msgstr; - - suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n"; - if (insertpos == 0 || header->msgstr[insertpos-1] == '\n') - suffix++; - suffix_len = strlen (suffix); - new_msgstr = (char *) xmalloc (header->msgstr_len + suffix_len); - memcpy (new_msgstr, header->msgstr, insertpos); - memcpy (new_msgstr + insertpos, suffix, suffix_len); - memcpy (new_msgstr + insertpos + suffix_len, - header->msgstr + insertpos, - header->msgstr_len - insertpos); - header->msgstr = new_msgstr; - header->msgstr_len = header->msgstr_len + suffix_len; - } - } + if (has_plural) + { + message_ty *header = message_list_search (mdlp->item[0]->messages, ""); + if (header != NULL + && strstr (header->msgstr, "Plural-Forms:") == NULL) + { + size_t insertpos = strlen (header->msgstr); + const char *suffix; + size_t suffix_len; + char *new_msgstr; + + suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n"; + if (insertpos == 0 || header->msgstr[insertpos-1] == '\n') + suffix++; + suffix_len = strlen (suffix); + new_msgstr = (char *) xmalloc 
(header->msgstr_len + suffix_len); + memcpy (new_msgstr, header->msgstr, insertpos); + memcpy (new_msgstr + insertpos, suffix, suffix_len); + memcpy (new_msgstr + insertpos + suffix_len, + header->msgstr + insertpos, + header->msgstr_len - insertpos); + header->msgstr = new_msgstr; + header->msgstr_len = header->msgstr_len + suffix_len; + } + } + } + + /* If not all the strings were plain ASCII, set the charset in the header + to UTF-8. All messages have already been converted to UTF-8 in + remember_a_message and remember_a_message_plural. */ + { + bool has_nonascii = false; + size_t i; + + for (i = 0; i < mdlp->nitems; i++) + { + message_list_ty *mlp = mdlp->item[i]->messages; + + if (!is_ascii_message_list (mlp)) + has_nonascii = true; + } + + if (has_nonascii) + { + message_list_ty *mlp = mdlp->item[0]->messages; + + iconv_message_list (mlp, po_charset_utf8, po_charset_utf8, NULL); + } + } } diff --git a/src/xgettext.h b/src/xgettext.h index 6356449..ad36fbd 100644 --- a/src/xgettext.h +++ b/src/xgettext.h @@ -21,6 +21,11 @@ #define _XGETTEXT_H #include <stddef.h> + +#if HAVE_ICONV +#include <iconv.h> +#endif + #include "message.h" #include "pos.h" @@ -37,6 +42,24 @@ extern bool substring_match; extern void split_keywordspec PARAMS ((const char *spec, const char **endp, int *argnum1p, int *argnum2p)); +/* Canonicalized encoding name for all input files. */ +extern const char *xgettext_global_source_encoding; + +#if HAVE_ICONV +/* Converter from xgettext_global_source_encoding to UTF-8 (except from + ASCII or UTF-8, when this conversion is a no-op). */ +extern iconv_t xgettext_global_source_iconv; +#endif + +/* Canonicalized encoding name for the current input file. */ +extern const char *xgettext_current_source_encoding; + +#if HAVE_ICONV +/* Converter from xgettext_current_source_encoding to UTF-8 (except from + ASCII or UTF-8, when this conversion is a no-op). 
*/ +extern iconv_t xgettext_current_source_iconv; +#endif + /* List of messages whose msgids must not be extracted, or NULL. Used by remember_a_message(). */ extern message_list_ty *exclude; diff --git a/tests/ChangeLog b/tests/ChangeLog index 031d109..5f76d0f 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,9 @@ +2002-11-05 Bruno Haible <bruno@clisp.org> + + * xgettext-23: New file. + * msgmerge-21: New file. + * Makefile.am (TESTS): Add them. + 2002-11-01 Bruno Haible <bruno@clisp.org> * format-awk-1, format-awk-2, format-c-1, format-c-2, format-elisp-1, diff --git a/tests/Makefile.am b/tests/Makefile.am index 649ec7e..73dfdf8 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -39,13 +39,14 @@ TESTS = gettext-1 gettext-2 \ msgmerge-1 msgmerge-2 msgmerge-3 msgmerge-4 msgmerge-5 msgmerge-6 \ msgmerge-7 msgmerge-8 msgmerge-9 msgmerge-10 msgmerge-11 msgmerge-12 \ msgmerge-13 msgmerge-14 msgmerge-15 msgmerge-16 msgmerge-17 \ - msgmerge-18 msgmerge-19 msgmerge-20 \ + msgmerge-18 msgmerge-19 msgmerge-20 msgmerge-21 \ msgunfmt-1 msgunfmt-2 msgunfmt-3 \ msguniq-1 msguniq-2 msguniq-3 \ xgettext-1 xgettext-2 xgettext-3 xgettext-4 xgettext-5 xgettext-6 \ xgettext-7 xgettext-8 xgettext-9 xgettext-10 xgettext-11 xgettext-12 \ xgettext-13 xgettext-14 xgettext-15 xgettext-16 xgettext-17 \ xgettext-18 xgettext-19 xgettext-20 xgettext-21 xgettext-22 \ + xgettext-23 \ format-awk-1 format-awk-2 \ format-c-1 format-c-2 format-c-3 format-c-4 \ format-elisp-1 format-elisp-2 \ diff --git a/tests/msgmerge-21 b/tests/msgmerge-21 new file mode 100755 index 0000000..11e52dc --- /dev/null +++ b/tests/msgmerge-21 @@ -0,0 +1,99 @@ +#! /bin/sh + +# Test merging of a ref.pot in UTF-8 encoding against a def.po in legacy +# encoding (that was produced from an older version of ref.pot, in ASCII +# encoding). 
+ +tmpfiles="" +trap 'rm -fr $tmpfiles' 1 2 3 15 + +tmpfiles="$tmpfiles mm-test21-ru.po" +cat <<\EOF > mm-test21-ru.po +# Russian messages for CLISP +# Copyright (C) 1998 Free Software Foundation, Inc. +# Eduard Haritonov <hed@iis.nsk.su>, 1998. +# Arseny Slobodjuck <ampy@ich.dvo.ru>, 2002. +# +msgid "" +msgstr "" +"Project-Id-Version: GNU elvis 1.7\n" +"POT-Creation-Date: 2002-11-01 01:22+0100\n" +"PO-Revision-Date: 2002-11-01 01:23+0100\n" +"Last-Translator: Arseny Slobodjuck <ampy@ich.dvo.ru>\n" +"Language-Team: Russian <ru@li.org>\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=koi8-r\n" +"Content-Transfer-Encoding: 8bit\n" + +#: arith.c:9 +msgid "Division durch Null" +msgstr "деление на нуль" +EOF + +tmpfiles="$tmpfiles mm-test21.pot" +cat <<\EOF > mm-test21.pot +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2002-11-01 01:22+0100\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" +"Language-Team: LANGUAGE <LL@li.org>\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#: arith.c:10 +msgid "Division durch Null" +msgstr "" + +#: arith.c:15 +msgid "Überlauf" +msgstr "" +EOF + +tmpfiles="$tmpfiles mm-test21.out" +: ${MSGMERGE=msgmerge} +${MSGMERGE} -q mm-test21-ru.po mm-test21.pot -o mm-test21.out +test $? = 0 || { rm -fr $tmpfiles; exit 1; } + +tmpfiles="$tmpfiles mm-test21.ok" +cat <<\EOF > mm-test21.ok +# Russian messages for CLISP +# Copyright (C) 1998 Free Software Foundation, Inc. +# Eduard Haritonov <hed@iis.nsk.su>, 1998. +# Arseny Slobodjuck <ampy@ich.dvo.ru>, 2002. 
+# +msgid "" +msgstr "" +"Project-Id-Version: GNU elvis 1.7\n" +"POT-Creation-Date: 2002-11-01 01:22+0100\n" +"PO-Revision-Date: 2002-11-01 01:23+0100\n" +"Last-Translator: Arseny Slobodjuck <ampy@ich.dvo.ru>\n" +"Language-Team: Russian <ru@li.org>\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#: arith.c:10 +msgid "Division durch Null" +msgstr "деление на нуль" + +#: arith.c:15 +msgid "Überlauf" +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} mm-test21.ok mm-test21.out +result=$? + +rm -fr $tmpfiles + +exit $result diff --git a/tests/xgettext-23 b/tests/xgettext-23 new file mode 100755 index 0000000..5928b6c --- /dev/null +++ b/tests/xgettext-23 @@ -0,0 +1,60 @@ +#! /bin/sh + +# Test extraction of non-ASCII msgids. + +tmpfiles="" +trap 'rm -fr $tmpfiles' 1 2 3 15 + +tmpfiles="$tmpfiles xg-test23.c" +cat <<EOF > xg-test23.c +void foo (int option) +{ + printf (_("%s: neznámý přepínač -- %c\n"), option); + printf (_("%s: přepínač vyžaduje argument -- %c\n"), option); +} +EOF + +tmpfiles="$tmpfiles xg-test23.po" +: ${XGETTEXT=xgettext} +${XGETTEXT} --no-location -k_ -o xg-test23.po xg-test23.c 2>/dev/null +test $? = 1 || { rm -fr $tmpfiles; exit 1; } +${XGETTEXT} --no-location -k_ --from-code=iso-8859-2 -o xg-test23.po xg-test23.c +test $? = 0 || { rm -fr $tmpfiles; exit 1; } + +tmpfiles="$tmpfiles xg-test23.pot" +sed -e '/POT-Creation-Date/d' < xg-test23.po > xg-test23.pot + +tmpfiles="$tmpfiles xg-test23.ok" +cat <<EOF > xg-test23.ok +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" +"Language-Team: LANGUAGE <LL@li.org>\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#, c-format +msgid "%s: neznámý přepínač -- %c\n" +msgstr "" + +#, c-format +msgid "%s: přepínač vyžaduje argument -- %c\n" +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} xg-test23.ok xg-test23.pot +result=$? + +rm -fr $tmpfiles + +exit $result |