summaryrefslogtreecommitdiffstats
path: root/src/xgettext.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/xgettext.c')
-rw-r--r--src/xgettext.c232
1 files changed, 187 insertions, 45 deletions
diff --git a/src/xgettext.c b/src/xgettext.c
index 4224c2e..52f6747 100644
--- a/src/xgettext.c
+++ b/src/xgettext.c
@@ -48,6 +48,9 @@
#include "stpcpy.h"
#include "po.h"
#include "message.h"
+#include "po-charset.h"
+#include "msgl-iconv.h"
+#include "msgl-ascii.h"
#include "po-time.h"
#include "write-po.h"
#include "format.h"
@@ -111,6 +114,24 @@ static char *output_dir;
/* If nonzero omit header with information about this run. */
int xgettext_omit_header;
+/* Canonicalized encoding name for all input files. */
+const char *xgettext_global_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_global_source_encoding to UTF-8 (except from
+ ASCII or UTF-8, when this conversion is a no-op). */
+iconv_t xgettext_global_source_iconv;
+#endif
+
+/* Canonicalized encoding name for the current input file. */
+const char *xgettext_current_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_current_source_encoding to UTF-8 (except from
+ ASCII or UTF-8, when this conversion is a no-op). */
+iconv_t xgettext_current_source_iconv;
+#endif
+
/* Long options. */
static const struct option long_options[] =
{
@@ -127,6 +148,7 @@ static const struct option long_options[] =
{ "files-from", required_argument, NULL, 'f' },
{ "force-po", no_argument, &force_po, 1 },
{ "foreign-user", no_argument, NULL, CHAR_MAX + 2 },
+ { "from-code", required_argument, NULL, CHAR_MAX + 3 },
{ "help", no_argument, NULL, 'h' },
{ "indent", no_argument, NULL, 'i' },
{ "join-existing", no_argument, NULL, 'j' },
@@ -137,7 +159,7 @@ static const struct option long_options[] =
{ "msgstr-suffix", optional_argument, NULL, 'M' },
{ "no-escape", no_argument, NULL, 'e' },
{ "no-location", no_argument, &line_comment, 0 },
- { "no-wrap", no_argument, NULL, CHAR_MAX + 3 },
+ { "no-wrap", no_argument, NULL, CHAR_MAX + 4 },
{ "omit-header", no_argument, &xgettext_omit_header, 1 },
{ "output", required_argument, NULL, 'o' },
{ "output-dir", required_argument, NULL, 'p' },
@@ -220,6 +242,7 @@ main (argc, argv)
/* Set initial value of variables. */
default_domain = MESSAGE_DOMAIN_DEFAULT;
+ xgettext_global_source_encoding = po_charset_ascii;
while ((optchar = getopt_long (argc, argv,
"ac::Cd:D:eEf:Fhijk::l:L:m::M::no:p:sTVw:x:",
@@ -366,7 +389,12 @@ main (argc, argv)
case CHAR_MAX + 2: /* --foreign-user */
copyright_holder = "";
break;
- case CHAR_MAX + 3: /* --no-wrap */
+ case CHAR_MAX + 3: /* --from-code */
+ xgettext_global_source_encoding = po_charset_canonicalize (optarg);
+ if (xgettext_global_source_encoding == NULL)
+ xgettext_global_source_encoding = po_charset_ascii;
+ break;
+ case CHAR_MAX + 4: /* --no-wrap */
message_page_width_ignore ();
break;
default:
@@ -453,6 +481,37 @@ xgettext cannot work without keywords to look for"));
for (cnt = optind; cnt < argc; ++cnt)
string_list_append_unique (file_list, argv[cnt]);
+ /* Allocate converter from xgettext_global_source_encoding to UTF-8 (except
+ from ASCII or UTF-8, when this conversion is a no-op). */
+ if (xgettext_global_source_encoding != po_charset_ascii
+ && xgettext_global_source_encoding != po_charset_utf8)
+ {
+#if HAVE_ICONV
+ iconv_t cd;
+
+ /* Avoid glibc-2.1 bug with EUC-KR. */
+# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
+ if (strcmp (xgettext_global_source_encoding, "EUC-KR") == 0)
+ cd = (iconv_t)(-1);
+ else
+# endif
+ cd = iconv_open (po_charset_utf8, xgettext_global_source_encoding);
+ if (cd == (iconv_t)(-1))
+ error (EXIT_FAILURE, 0, _("\
+Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
+and iconv() does not support this conversion."),
+ xgettext_global_source_encoding, po_charset_utf8,
+ basename (program_name));
+ xgettext_global_source_iconv = cd;
+#else
+ error (EXIT_FAILURE, 0, _("\
+Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
+This version was built without iconv()."),
+ xgettext_global_source_encoding, po_charset_utf8,
+ basename (program_name));
+#endif
+ }
+
/* Allocate a message list to remember all the messages. */
mdlp = msgdomain_list_alloc (true);
@@ -519,6 +578,13 @@ warning: file `%s' extension `%s' is unknown; will try C"), fname, extension);
if (!xgettext_omit_header)
finalize_header (mdlp);
+ /* Free the allocated converter. */
+#if HAVE_ICONV
+ if (xgettext_global_source_encoding != po_charset_ascii
+ && xgettext_global_source_encoding != po_charset_utf8)
+ iconv_close (xgettext_global_source_iconv);
+#endif
+
/* Sorting the list of messages. */
if (sort_by_filepos)
msgdomain_list_sort_by_filepos (mdlp);
@@ -589,6 +655,14 @@ By default the language is guessed depending on the input file name extension.\n
printf ("\n");
/* xgettext: no-wrap */
printf (_("\
+Input file interpretation:\n\
+ --from-code=NAME encoding of input files\n\
+ (except for Python, Tcl, Glade)\n\
+By default the input files are assumed to be in ASCII.\n\
+"));
+ printf ("\n");
+ /* xgettext: no-wrap */
+ printf (_("\
Operation mode:\n\
-j, --join-existing join messages with existing file\n\
-x, --exclude-file=FILE.po entries from FILE.po are not extracted\n\
@@ -890,6 +964,13 @@ extract_from_file (file_name, extractor, mdlp)
char *real_file_name;
FILE *fp = xgettext_open (file_name, &logical_file_name, &real_file_name);
+ /* Set the default for the source file encoding. May be overridden by
+ the extractor function. */
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+#if HAVE_ICONV
+ xgettext_current_source_iconv = xgettext_global_source_iconv;
+#endif
+
extractor (fp, real_file_name, logical_file_name, mdlp);
if (fp != stdin)
@@ -905,6 +986,36 @@ extract_from_file (file_name, extractor, mdlp)
static struct formatstring_parser *current_formatstring_parser;
+/* Convert the given string from xgettext_current_source_encoding to
+ the output file encoding (i.e. ASCII or UTF-8). */
+#define CONVERT_STRING(string) \
+ if (xgettext_current_source_encoding == po_charset_ascii) \
+ { \
+ if (!is_ascii_string (string)) \
+ { \
+ char buffer[21]; \
+ if (pos->line_number == (size_t)(-1)) \
+ buffer[0] = '\0'; \
+ else \
+ sprintf (buffer, ":%ld", (long) pos->line_number); \
+ error (EXIT_FAILURE, 0, _("Non-ASCII string at %s%s.\nPlease specify the source encoding through --from-code."), \
+ pos->file_name, buffer); \
+ } \
+ } \
+ else if (xgettext_current_source_encoding != po_charset_utf8) \
+ { \
+ string = convert_string (xgettext_current_source_iconv, string); \
+ }
+
+#if !HAVE_ICONV
+/* If we don't have iconv(), the only supported values for
+ xgettext_global_source_encoding and thus also for
+ xgettext_current_source_encoding are ASCII and UTF-8.
+ convert_string() should not be called in this case. */
+#define convert_string(cd,string) (abort (), (string))
+#endif
+
+
message_ty *
remember_a_message (mlp, string, pos)
message_list_ty *mlp;
@@ -934,6 +1045,8 @@ remember_a_message (mlp, string, pos)
is_format[i] = undecided;
do_wrap = undecided;
+ CONVERT_STRING (msgid);
+
if (msgid[0] == '\0' && !xgettext_omit_header)
{
char buffer[21];
@@ -999,6 +1112,8 @@ meta information, not the empty string.\n")));
if (s == NULL)
break;
+ CONVERT_STRING (s);
+
/* To reduce the possibility of unwanted matches be do a two
step match: the line must contain `xgettext:' and one of
the possible format description strings. */
@@ -1102,6 +1217,8 @@ remember_a_message_plural (mp, string, pos)
msgid_plural = string;
+ CONVERT_STRING (msgid_plural);
+
/* See if the message is already a plural message. */
if (mp->msgid_plural == NULL)
{
@@ -1205,53 +1322,78 @@ finalize_header (mdlp)
{
/* If the generated PO file has plural forms, add a Plural-Forms template
to the constructed header. */
- bool has_plural;
- size_t i, j;
+ {
+ bool has_plural;
+ size_t i, j;
- has_plural = false;
- for (i = 0; i < mdlp->nitems; i++)
- {
- message_list_ty *mlp = mdlp->item[i]->messages;
+ has_plural = false;
+ for (i = 0; i < mdlp->nitems; i++)
+ {
+ message_list_ty *mlp = mdlp->item[i]->messages;
- for (j = 0; j < mlp->nitems; j++)
- {
- message_ty *mp = mlp->item[j];
+ for (j = 0; j < mlp->nitems; j++)
+ {
+ message_ty *mp = mlp->item[j];
- if (mp->msgid_plural != NULL)
- {
- has_plural = true;
- break;
- }
- }
- if (has_plural)
- break;
- }
+ if (mp->msgid_plural != NULL)
+ {
+ has_plural = true;
+ break;
+ }
+ }
+ if (has_plural)
+ break;
+ }
- if (has_plural)
- {
- message_ty *header = message_list_search (mdlp->item[0]->messages, "");
- if (header != NULL
- && strstr (header->msgstr, "Plural-Forms:") == NULL)
- {
- size_t insertpos = strlen (header->msgstr);
- const char *suffix;
- size_t suffix_len;
- char *new_msgstr;
-
- suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n";
- if (insertpos == 0 || header->msgstr[insertpos-1] == '\n')
- suffix++;
- suffix_len = strlen (suffix);
- new_msgstr = (char *) xmalloc (header->msgstr_len + suffix_len);
- memcpy (new_msgstr, header->msgstr, insertpos);
- memcpy (new_msgstr + insertpos, suffix, suffix_len);
- memcpy (new_msgstr + insertpos + suffix_len,
- header->msgstr + insertpos,
- header->msgstr_len - insertpos);
- header->msgstr = new_msgstr;
- header->msgstr_len = header->msgstr_len + suffix_len;
- }
- }
+ if (has_plural)
+ {
+ message_ty *header = message_list_search (mdlp->item[0]->messages, "");
+ if (header != NULL
+ && strstr (header->msgstr, "Plural-Forms:") == NULL)
+ {
+ size_t insertpos = strlen (header->msgstr);
+ const char *suffix;
+ size_t suffix_len;
+ char *new_msgstr;
+
+ suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n";
+ if (insertpos == 0 || header->msgstr[insertpos-1] == '\n')
+ suffix++;
+ suffix_len = strlen (suffix);
+ new_msgstr = (char *) xmalloc (header->msgstr_len + suffix_len);
+ memcpy (new_msgstr, header->msgstr, insertpos);
+ memcpy (new_msgstr + insertpos, suffix, suffix_len);
+ memcpy (new_msgstr + insertpos + suffix_len,
+ header->msgstr + insertpos,
+ header->msgstr_len - insertpos);
+ header->msgstr = new_msgstr;
+ header->msgstr_len = header->msgstr_len + suffix_len;
+ }
+ }
+ }
+
+ /* If not all the strings were plain ASCII, set the charset in the header
+ to UTF-8. All messages have already been converted to UTF-8 in
+ remember_a_message and remember_a_message_plural. */
+ {
+ bool has_nonascii = false;
+ size_t i;
+
+ for (i = 0; i < mdlp->nitems; i++)
+ {
+ message_list_ty *mlp = mdlp->item[i]->messages;
+
+ if (!is_ascii_message_list (mlp))
+ has_nonascii = true;
+ }
+
+ if (has_nonascii)
+ {
+ message_list_ty *mlp = mdlp->item[0]->messages;
+
+ iconv_message_list (mlp, po_charset_utf8, po_charset_utf8, NULL);
+ }
+ }
}