diff options
Diffstat (limited to 'src/xgettext.c')
-rw-r--r-- | src/xgettext.c | 232 |
1 files changed, 187 insertions, 45 deletions
/* Review summary for this patch to src/xgettext.c.  The diff text below is
   unchanged; this comment sits before the "diff --git" header, where patch
   tools skip leading text.

   What the patch does, as far as the hunks below show:
   * Adds includes of "po-charset.h", "msgl-iconv.h" and "msgl-ascii.h".
   * Adds the globals xgettext_global_source_encoding and
     xgettext_current_source_encoding and, under #if HAVE_ICONV, the
     iconv_t converters xgettext_global_source_iconv and
     xgettext_current_source_iconv.
   * Adds the long option "from-code" as CHAR_MAX + 3 and renumbers
     "no-wrap" to CHAR_MAX + 4.  main initialises the global encoding to
     po_charset_ascii; the option stores po_charset_canonicalize (optarg)
     and replaces a NULL result by po_charset_ascii.
   * In main, when the global encoding is neither po_charset_ascii nor
     po_charset_utf8, opens a converter with
     iconv_open (po_charset_utf8, encoding) and calls
     error (EXIT_FAILURE, ...) if that yields (iconv_t)(-1), or
     unconditionally when HAVE_ICONV is not set.  The converter is closed
     with iconv_close after the finalize_header call, under the same
     encoding test.
   * Adds a help paragraph ("Input file interpretation") for --from-code.
   * extract_from_file copies the global encoding (and converter) into the
     "current" variables before calling the extractor.
   * Defines CONVERT_STRING (string): for an ASCII source it calls
     error (EXIT_FAILURE, ...) when is_ascii_string (string) is false; for
     any other encoding except UTF-8 it assigns
     string = convert_string (xgettext_current_source_iconv, string).
     The macro reads a variable named pos at the expansion site.  It is
     applied to msgid and to s in remember_a_message and to msgid_plural
     in remember_a_message_plural.
   * When HAVE_ICONV is not set, convert_string is a macro that calls
     abort ().
   * In finalize_header, wraps the existing Plural-Forms code in its own
     brace block (the removed and re-added statements appear to be the
     same) and adds a second block that calls
     iconv_message_list (mlp, po_charset_utf8, po_charset_utf8, NULL) on
     mdlp->item[0]->messages when any message list fails
     is_ascii_message_list.

   NOTE(review): an argument to --from-code for which
   po_charset_canonicalize returns NULL silently selects ASCII; no
   diagnostic is emitted in this hunk.  Confirm that this is intended.
   NOTE(review): encodings are compared by pointer (== / !=) against
   po_charset_ascii and po_charset_utf8; presumably
   po_charset_canonicalize returns exactly those pointers for these
   encodings.  Verify against po-charset.
   NOTE(review): CONVERT_STRING expands to a bare if / else if statement,
   not a do { } while (0) block.  Each of the three uses shown is a
   complete statement, but a use directly before an "else" would bind
   that else to the macro's own if.
   NOTE(review): the macro formats (long) pos->line_number with sprintf
   into char buffer[21]; ":" plus the digits of a positive 64-bit long
   plus the terminator just fits.  Confirm no negative value can reach
   it.
   NOTE(review): the non-ASCII scan in finalize_header keeps looping
   after has_nonascii becomes true; this looks harmless.
   NOTE(review): this text is a web rendering in which each hunk's line
   breaks and indentation are collapsed, so it cannot be applied with
   patch as it stands.  */
diff --git a/src/xgettext.c b/src/xgettext.c index 4224c2e..52f6747 100644 --- a/src/xgettext.c +++ b/src/xgettext.c @@ -48,6 +48,9 @@ #include "stpcpy.h" #include "po.h" #include "message.h" +#include "po-charset.h" +#include "msgl-iconv.h" +#include "msgl-ascii.h" #include "po-time.h" #include "write-po.h" #include "format.h" @@ -111,6 +114,24 @@ static char *output_dir; /* If nonzero omit header with information about this run. */ int xgettext_omit_header; +/* Canonicalized encoding name for all input files. */ +const char *xgettext_global_source_encoding; + +#if HAVE_ICONV +/* Converter from xgettext_global_source_encoding to UTF-8 (except from + ASCII or UTF-8, when this conversion is a no-op). */ +iconv_t xgettext_global_source_iconv; +#endif + +/* Canonicalized encoding name for the current input file. */ +const char *xgettext_current_source_encoding; + +#if HAVE_ICONV +/* Converter from xgettext_current_source_encoding to UTF-8 (except from + ASCII or UTF-8, when this conversion is a no-op). */ +iconv_t xgettext_current_source_iconv; +#endif + /* Long options. 
*/ static const struct option long_options[] = { @@ -127,6 +148,7 @@ static const struct option long_options[] = { "files-from", required_argument, NULL, 'f' }, { "force-po", no_argument, &force_po, 1 }, { "foreign-user", no_argument, NULL, CHAR_MAX + 2 }, + { "from-code", required_argument, NULL, CHAR_MAX + 3 }, { "help", no_argument, NULL, 'h' }, { "indent", no_argument, NULL, 'i' }, { "join-existing", no_argument, NULL, 'j' }, @@ -137,7 +159,7 @@ static const struct option long_options[] = { "msgstr-suffix", optional_argument, NULL, 'M' }, { "no-escape", no_argument, NULL, 'e' }, { "no-location", no_argument, &line_comment, 0 }, - { "no-wrap", no_argument, NULL, CHAR_MAX + 3 }, + { "no-wrap", no_argument, NULL, CHAR_MAX + 4 }, { "omit-header", no_argument, &xgettext_omit_header, 1 }, { "output", required_argument, NULL, 'o' }, { "output-dir", required_argument, NULL, 'p' }, @@ -220,6 +242,7 @@ main (argc, argv) /* Set initial value of variables. */ default_domain = MESSAGE_DOMAIN_DEFAULT; + xgettext_global_source_encoding = po_charset_ascii; while ((optchar = getopt_long (argc, argv, "ac::Cd:D:eEf:Fhijk::l:L:m::M::no:p:sTVw:x:", @@ -366,7 +389,12 @@ main (argc, argv) case CHAR_MAX + 2: /* --foreign-user */ copyright_holder = ""; break; - case CHAR_MAX + 3: /* --no-wrap */ + case CHAR_MAX + 3: /* --from-code */ + xgettext_global_source_encoding = po_charset_canonicalize (optarg); + if (xgettext_global_source_encoding == NULL) + xgettext_global_source_encoding = po_charset_ascii; + break; + case CHAR_MAX + 4: /* --no-wrap */ message_page_width_ignore (); break; default: @@ -453,6 +481,37 @@ xgettext cannot work without keywords to look for")); for (cnt = optind; cnt < argc; ++cnt) string_list_append_unique (file_list, argv[cnt]); + /* Allocate converter from xgettext_global_source_encoding to UTF-8 (except + from ASCII or UTF-8, when this conversion is a no-op). 
*/ + if (xgettext_global_source_encoding != po_charset_ascii + && xgettext_global_source_encoding != po_charset_utf8) + { +#if HAVE_ICONV + iconv_t cd; + + /* Avoid glibc-2.1 bug with EUC-KR. */ +# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION + if (strcmp (xgettext_global_source_encoding, "EUC-KR") == 0) + cd = (iconv_t)(-1); + else +# endif + cd = iconv_open (po_charset_utf8, xgettext_global_source_encoding); + if (cd == (iconv_t)(-1)) + error (EXIT_FAILURE, 0, _("\ +Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \ +and iconv() does not support this conversion."), + xgettext_global_source_encoding, po_charset_utf8, + basename (program_name)); + xgettext_global_source_iconv = cd; +#else + error (EXIT_FAILURE, 0, _("\ +Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \ +This version was built without iconv()."), + xgettext_global_source_encoding, po_charset_utf8, + basename (program_name)); +#endif + } + /* Allocate a message list to remember all the messages. */ mdlp = msgdomain_list_alloc (true); @@ -519,6 +578,13 @@ warning: file `%s' extension `%s' is unknown; will try C"), fname, extension); if (!xgettext_omit_header) finalize_header (mdlp); + /* Free the allocated converter. */ +#if HAVE_ICONV + if (xgettext_global_source_encoding != po_charset_ascii + && xgettext_global_source_encoding != po_charset_utf8) + iconv_close (xgettext_global_source_iconv); +#endif + /* Sorting the list of messages. 
*/ if (sort_by_filepos) msgdomain_list_sort_by_filepos (mdlp); @@ -589,6 +655,14 @@ By default the language is guessed depending on the input file name extension.\n printf ("\n"); /* xgettext: no-wrap */ printf (_("\ +Input file interpretation:\n\ + --from-code=NAME encoding of input files\n\ + (except for Python, Tcl, Glade)\n\ +By default the input files are assumed to be in ASCII.\n\ +")); + printf ("\n"); + /* xgettext: no-wrap */ + printf (_("\ Operation mode:\n\ -j, --join-existing join messages with existing file\n\ -x, --exclude-file=FILE.po entries from FILE.po are not extracted\n\ @@ -890,6 +964,13 @@ extract_from_file (file_name, extractor, mdlp) char *real_file_name; FILE *fp = xgettext_open (file_name, &logical_file_name, &real_file_name); + /* Set the default for the source file encoding. May be overridden by + the extractor function. */ + xgettext_current_source_encoding = xgettext_global_source_encoding; +#if HAVE_ICONV + xgettext_current_source_iconv = xgettext_global_source_iconv; +#endif + extractor (fp, real_file_name, logical_file_name, mdlp); if (fp != stdin) @@ -905,6 +986,36 @@ extract_from_file (file_name, extractor, mdlp) static struct formatstring_parser *current_formatstring_parser; +/* Convert the given string from xgettext_current_source_encoding to + the output file encoding (i.e. ASCII or UTF-8). 
*/ +#define CONVERT_STRING(string) \ + if (xgettext_current_source_encoding == po_charset_ascii) \ + { \ + if (!is_ascii_string (string)) \ + { \ + char buffer[21]; \ + if (pos->line_number == (size_t)(-1)) \ + buffer[0] = '\0'; \ + else \ + sprintf (buffer, ":%ld", (long) pos->line_number); \ + error (EXIT_FAILURE, 0, _("Non-ASCII string at %s%s.\nPlease specify the source encoding through --from-code."), \ + pos->file_name, buffer); \ + } \ + } \ + else if (xgettext_current_source_encoding != po_charset_utf8) \ + { \ + string = convert_string (xgettext_current_source_iconv, string); \ + } + +#if !HAVE_ICONV +/* If we don't have iconv(), the only supported values for + xgettext_global_source_encoding and thus also for + xgettext_current_source_encoding are ASCII and UTF-8. + convert_string() should not be called in this case. */ +#define convert_string(cd,string) (abort (), (string)) +#endif + + message_ty * remember_a_message (mlp, string, pos) message_list_ty *mlp; @@ -934,6 +1045,8 @@ remember_a_message (mlp, string, pos) is_format[i] = undecided; do_wrap = undecided; + CONVERT_STRING (msgid); + if (msgid[0] == '\0' && !xgettext_omit_header) { char buffer[21]; @@ -999,6 +1112,8 @@ meta information, not the empty string.\n"))); if (s == NULL) break; + CONVERT_STRING (s); + /* To reduce the possibility of unwanted matches be do a two step match: the line must contain `xgettext:' and one of the possible format description strings. */ @@ -1102,6 +1217,8 @@ remember_a_message_plural (mp, string, pos) msgid_plural = string; + CONVERT_STRING (msgid_plural); + /* See if the message is already a plural message. */ if (mp->msgid_plural == NULL) { @@ -1205,53 +1322,78 @@ finalize_header (mdlp) { /* If the generated PO file has plural forms, add a Plural-Forms template to the constructed header. 
*/ - bool has_plural; - size_t i, j; + { + bool has_plural; + size_t i, j; - has_plural = false; - for (i = 0; i < mdlp->nitems; i++) - { - message_list_ty *mlp = mdlp->item[i]->messages; + has_plural = false; + for (i = 0; i < mdlp->nitems; i++) + { + message_list_ty *mlp = mdlp->item[i]->messages; - for (j = 0; j < mlp->nitems; j++) - { - message_ty *mp = mlp->item[j]; + for (j = 0; j < mlp->nitems; j++) + { + message_ty *mp = mlp->item[j]; - if (mp->msgid_plural != NULL) - { - has_plural = true; - break; - } - } - if (has_plural) - break; - } + if (mp->msgid_plural != NULL) + { + has_plural = true; + break; + } + } + if (has_plural) + break; + } - if (has_plural) - { - message_ty *header = message_list_search (mdlp->item[0]->messages, ""); - if (header != NULL - && strstr (header->msgstr, "Plural-Forms:") == NULL) - { - size_t insertpos = strlen (header->msgstr); - const char *suffix; - size_t suffix_len; - char *new_msgstr; - - suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n"; - if (insertpos == 0 || header->msgstr[insertpos-1] == '\n') - suffix++; - suffix_len = strlen (suffix); - new_msgstr = (char *) xmalloc (header->msgstr_len + suffix_len); - memcpy (new_msgstr, header->msgstr, insertpos); - memcpy (new_msgstr + insertpos, suffix, suffix_len); - memcpy (new_msgstr + insertpos + suffix_len, - header->msgstr + insertpos, - header->msgstr_len - insertpos); - header->msgstr = new_msgstr; - header->msgstr_len = header->msgstr_len + suffix_len; - } - } + if (has_plural) + { + message_ty *header = message_list_search (mdlp->item[0]->messages, ""); + if (header != NULL + && strstr (header->msgstr, "Plural-Forms:") == NULL) + { + size_t insertpos = strlen (header->msgstr); + const char *suffix; + size_t suffix_len; + char *new_msgstr; + + suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n"; + if (insertpos == 0 || header->msgstr[insertpos-1] == '\n') + suffix++; + suffix_len = strlen (suffix); + new_msgstr = (char *) xmalloc 
(header->msgstr_len + suffix_len); + memcpy (new_msgstr, header->msgstr, insertpos); + memcpy (new_msgstr + insertpos, suffix, suffix_len); + memcpy (new_msgstr + insertpos + suffix_len, + header->msgstr + insertpos, + header->msgstr_len - insertpos); + header->msgstr = new_msgstr; + header->msgstr_len = header->msgstr_len + suffix_len; + } + } + } + + /* If not all the strings were plain ASCII, set the charset in the header + to UTF-8. All messages have already been converted to UTF-8 in + remember_a_message and remember_a_message_plural. */ + { + bool has_nonascii = false; + size_t i; + + for (i = 0; i < mdlp->nitems; i++) + { + message_list_ty *mlp = mdlp->item[i]->messages; + + if (!is_ascii_message_list (mlp)) + has_nonascii = true; + } + + if (has_nonascii) + { + message_list_ty *mlp = mdlp->item[0]->messages; + + iconv_message_list (mlp, po_charset_utf8, po_charset_utf8, NULL); + } + } } |