diff options
author | Bruno Haible <bruno@clisp.org> | 2005-09-26 09:21:16 +0000 |
---|---|---|
committer | Bruno Haible <bruno@clisp.org> | 2009-06-23 12:12:51 +0200 |
commit | 720753e99a1f57d7403e4f3c4f61ad294d77fd40 (patch) | |
tree | c9d40946612b7b8a06fdcb3c0a0a7459d221bcc5 /gettext-tools/src | |
parent | 728072fe2137715eebd6314a79947c770a5b3ba1 (diff) | |
download | external_gettext-720753e99a1f57d7403e4f3c4f61ad294d77fd40.zip external_gettext-720753e99a1f57d7403e4f3c4f61ad294d77fd40.tar.gz external_gettext-720753e99a1f57d7403e4f3c4f61ad294d77fd40.tar.bz2 |
Support for Python source encodings (PEP 0263).
Diffstat (limited to 'gettext-tools/src')
-rw-r--r-- | gettext-tools/src/ChangeLog | 45 | ||||
-rw-r--r-- | gettext-tools/src/x-python.c | 1010 |
2 files changed, 872 insertions, 183 deletions
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index bc859f6..eb744df 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,5 +1,50 @@ 2005-09-25 Bruno Haible <bruno@clisp.org> + Support for Python source encodings (PEP 0263). + * x-python.c: Include progname.h, basename.h, xerror.h, strstr.h, + c-ctype.h, utf8-ucs4.h. + (phase1_pushback): Reduce size. + (UEOF): New macro. + (phase2_pushback, phase2_pushback_length): New variables. + (phase2_getc, phase2_ungetc): New functions. + (struct unicode_string_buffer): New structure type. + (init_unicode_string_buffer, unicode_string_buffer_append_unicode_grow, + unicode_string_buffer_append_unicode, unicode_string_buffer_result, + free_unicode_string_buffer): New functions. + (comment_buffer): New variable. + (buffer, bufmax, buflen): Remove variables. + (comment_start, comment_add, comment_line_end): Rewritten. + (comment_at_start): New function. + (xgettext_current_file_source_encoding): New variable. + (xgettext_current_file_source_iconv): New variable. + (set_current_file_source_encoding, try_to_extract_coding): New + functions. + (continuation_or_nonblank_line): New variable. + (phase3_getc): Renamed from phase2_getc. Use phase2_getc instead of + phase1_getc. Return a Unicode character. Call try_to_extract_coding + when seeing a comment among the first two lines. + (phase3_ungetc): Renamed from phase2_ungetc. Use phase2_ungetc instead + of phase1_ungetc. + (UNICODE, IS_UNICODE, UNICODE_VALUE): New macros. + (struct mixed_string_buffer): New structure type. + (init_mixed_string_buffer, mixed_string_buffer_append_byte, + mixed_string_buffer_append_unicode_grow, + mixed_string_buffer_append_unicode, + mixed_string_buffer_flush_utf16_surr, + mixed_string_buffer_flush_curr_buffer, mixed_string_buffer_append, + mixed_string_buffer_result, free_mixed_string_buffer): New functions. + (phase7_getuc): Use phase2_getc instead of phase1_getc. Return a + Unicode character except for \ooo and \xnn. + (phase5_get): Operate on the level of Unicode characters instead of + at the level of bytes. Use a mixed_string_buffer to accumulate a + string literal. + (extract_parenthesized): Set xgettext_current_source_encoding to UTF-8 + while passing UTF-8 strings to the xgettext main code. + (extract_python): Initialize xgettext_current_file_source_encoding and + xgettext_current_source_encoding. + +2005-09-25 Bruno Haible <bruno@clisp.org> + * x-csharp.c (phase2_getc): Fix mis-use of iconv() when the source encoding is neither ASCII nor UTF-8. diff --git a/gettext-tools/src/x-python.c b/gettext-tools/src/x-python.c index 768a7c2..21c686a 100644 --- a/gettext-tools/src/x-python.c +++ b/gettext-tools/src/x-python.c @@ -1,5 +1,5 @@ /* xgettext Python backend. - Copyright (C) 2002-2003 Free Software Foundation, Inc. + Copyright (C) 2002-2003, 2005 Free Software Foundation, Inc. This file was written by Bruno Haible <haible@clisp.cons.org>, 2002. @@ -33,11 +33,17 @@ #include "x-python.h" #include "error.h" #include "error-progname.h" +#include "progname.h" +#include "basename.h" +#include "xerror.h" #include "xalloc.h" #include "exit.h" +#include "strstr.h" +#include "c-ctype.h" #include "po-charset.h" #include "uniname.h" #include "utf16-ucs4.h" +#include "utf8-ucs4.h" #include "ucs4-utf8.h" #include "gettext.h" @@ -148,11 +154,14 @@ static int line_number; static FILE *fp; -/* 1. line_number handling. Also allow a lookahead. */ +/* 1. line_number handling. */ -static unsigned char phase1_pushback[max (9, UNINAME_MAX + 3)]; +/* Maximum used, roughly a safer MB_LEN_MAX. */ +#define MAX_PHASE1_PUSHBACK 16 +static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK]; static int phase1_pushback_length; +/* Read the next single byte from the input file. */ static int phase1_getc () { @@ -174,12 +183,12 @@ phase1_getc () } if (c == '\n') - line_number++; + ++line_number; return c; } -/* Supports max (9, UNINAME_MAX + 3) characters of pushback. */ +/* Supports MAX_PHASE1_PUSHBACK characters of pushback. */ static void phase1_ungetc (int c) { @@ -195,110 +204,770 @@ phase1_ungetc (int c) } -/* Accumulating comments. */ +/* Phase 2: Conversion to Unicode. + This is done early because PEP 0263 specifies that conversion to Unicode + conceptually occurs before tokenization. A test case where it matters + is with encodings like BIG5: when a double-byte character ending in 0x5C + is followed by '\' or 'u0021', the tokenizer must not treat the second + half of the double-byte character as a backslash. */ -static char *buffer; -static size_t bufmax; -static size_t buflen; +/* End-of-file indicator for functions returning an UCS-4 character. */ +#define UEOF -1 -static inline void -comment_start () +static int phase2_pushback[max (9, UNINAME_MAX + 3)]; +static int phase2_pushback_length; + +/* Read the next Unicode UCS-4 character from the input file. */ +static int +phase2_getc () { - buflen = 0; + if (phase2_pushback_length) + return phase2_pushback[--phase2_pushback_length]; + + if (xgettext_current_source_encoding == po_charset_ascii) + { + int c = phase1_getc (); + if (c == EOF) + return UEOF; + if (!c_isascii (c)) + { + char buffer[21]; + sprintf (buffer, ":%ld", (long) line_number); + multiline_error (xstrdup (""), + xasprintf (_("\ +Non-ASCII string at %s%s.\n\ +Please specify the source encoding through --from-code or through a comment\n\ +as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, buffer)); + exit (EXIT_FAILURE); + } + return c; + } + else if (xgettext_current_source_encoding != po_charset_utf8) + { +#if HAVE_ICONV + /* Use iconv on an increasing number of bytes. Read only as many bytes + through phase1_getc as needed. This is needed to give reasonable + interactive behaviour when fp is connected to an interactive tty. */ + unsigned char buf[MAX_PHASE1_PUSHBACK]; + size_t bufcount; + int c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[0] = (unsigned char) c; + bufcount = 1; + + for (;;) + { + unsigned char scratchbuf[6]; + const char *inptr = (const char *) &buf[0]; + size_t insize = bufcount; + char *outptr = (char *) &scratchbuf[0]; + size_t outsize = sizeof (scratchbuf); + + size_t res = iconv (xgettext_current_source_iconv, + (ICONV_CONST char **) &inptr, &insize, + &outptr, &outsize); + /* We expect that a character has been produced if and only if + some input bytes have been consumed. */ + if ((insize < bufcount) != (outsize < sizeof (scratchbuf))) + abort (); + if (outsize == sizeof (scratchbuf)) + { + /* No character has been produced. Must be an error. */ + if (res != (size_t)(-1)) + abort (); + + if (errno == EILSEQ) + { + /* An invalid multibyte sequence was encountered. */ + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Invalid multibyte sequence.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number)); + exit (EXIT_FAILURE); + } + else if (errno == EINVAL) + { + /* An incomplete multibyte character. */ + int c; + + if (bufcount == MAX_PHASE1_PUSHBACK) + { + /* An overlong incomplete multibyte sequence was + encountered. */ + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Long incomplete multibyte sequence.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number)); + exit (EXIT_FAILURE); + } + + /* Read one more byte and retry iconv. */ + c = phase1_getc (); + if (c == EOF) + { + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Incomplete multibyte sequence at end of file.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number)); + exit (EXIT_FAILURE); + } + if (c == '\n') + { + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Incomplete multibyte sequence at end of line.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number - 1)); + exit (EXIT_FAILURE); + } + buf[bufcount++] = (unsigned char) c; + } + else + error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"), + real_file_name, line_number); + } + else + { + size_t outbytes = sizeof (scratchbuf) - outsize; + size_t bytes = bufcount - insize; + unsigned int uc; + + /* We expect that one character has been produced. */ + if (bytes == 0) + abort (); + if (outbytes == 0) + abort (); + /* Push back the unused bytes. */ + while (insize > 0) + phase1_ungetc (buf[--insize]); + /* Convert the character from UTF-8 to UCS-4. */ + if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes) + { + /* scratchbuf contains an out-of-range Unicode character + (> 0x10ffff). */ + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Invalid multibyte sequence.\n\ +Please specify the source encoding through --from-code or through a comment\n\ +as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number)); + exit (EXIT_FAILURE); + } + return uc; + } + } +#else + /* If we don't have iconv(), the only supported values for + xgettext_global_source_encoding and thus also for + xgettext_current_source_encoding are ASCII and UTF-8. */ + abort (); +#endif + } + else + { + /* Read an UTF-8 encoded character. */ + unsigned char buf[6]; + unsigned int count; + int c; + unsigned int uc; + + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[0] = c; + count = 1; + + if (buf[0] >= 0xc0) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[1] = c; + count = 2; + } + + if (buf[0] >= 0xe0 + && ((buf[1] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[2] = c; + count = 3; + } + + if (buf[0] >= 0xf0 + && ((buf[1] ^ 0x80) < 0x40) + && ((buf[2] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[3] = c; + count = 4; + } + + if (buf[0] >= 0xf8 + && ((buf[1] ^ 0x80) < 0x40) + && ((buf[2] ^ 0x80) < 0x40) + && ((buf[3] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[4] = c; + count = 5; + } + + if (buf[0] >= 0xfc + && ((buf[1] ^ 0x80) < 0x40) + && ((buf[2] ^ 0x80) < 0x40) + && ((buf[3] ^ 0x80) < 0x40) + && ((buf[4] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[5] = c; + count = 6; + } + + u8_mbtouc (&uc, buf, count); + return uc; + } } -static inline void -comment_add (int c) +/* Supports max (9, UNINAME_MAX + 3) pushback characters. */ +static void +phase2_ungetc (int c) { - /* We assume the program source is in ISO-8859-1 (for consistency with - Python's \ooo and \xnn syntax inside strings), but we produce a POT - file in UTF-8 encoding. */ - size_t len = ((unsigned char) c < 0x80 ? 1 : 2); - if (buflen + len > bufmax) + if (c != UEOF) { - bufmax = 2 * bufmax + 10; - buffer = xrealloc (buffer, bufmax); + if (phase2_pushback_length == SIZEOF (phase2_pushback)) + abort (); + phase2_pushback[phase2_pushback_length++] = c; } - if ((unsigned char) c < 0x80) - buffer[buflen++] = c; - else +} + + +/* ========================= Accumulating strings. ======================== */ + +/* A string buffer type that allows appending Unicode characters. + Returns the entire string in UTF-8 encoding. */ + +struct unicode_string_buffer +{ + /* The part of the string that has already been converted to UTF-8. */ + char *utf8_buffer; + size_t utf8_buflen; + size_t utf8_allocated; +}; + +/* Initialize a 'struct unicode_string_buffer' to empty. */ +static inline void +init_unicode_string_buffer (struct unicode_string_buffer *bp) +{ + bp->utf8_buffer = NULL; + bp->utf8_buflen = 0; + bp->utf8_allocated = 0; +} + +/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ +static inline void +unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp, + size_t count) +{ + if (bp->utf8_buflen + count > bp->utf8_allocated) { - buffer[buflen++] = 0xc0 | ((unsigned char) c >> 6); - buffer[buflen++] = 0x80 | ((unsigned char) c & 0x3f); + size_t new_allocated = 2 * bp->utf8_allocated + 10; + if (new_allocated < bp->utf8_buflen + count) + new_allocated = bp->utf8_buflen + count; + bp->utf8_allocated = new_allocated; + bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); } } +/* Auxiliary function: Append a Unicode character to bp->utf8. + uc must be < 0x110000. */ +static inline void +unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp, + unsigned int uc) +{ + unsigned char utf8buf[6]; + int count = u8_uctomb (utf8buf, uc, 6); + + if (count < 0) + /* The caller should have ensured that uc is not out-of-range. */ + abort (); + + unicode_string_buffer_append_unicode_grow (bp, count); + memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); + bp->utf8_buflen += count; +} + +/* Return the string buffer's contents. */ +static char * +unicode_string_buffer_result (struct unicode_string_buffer *bp) +{ + /* NUL-terminate it. */ + unicode_string_buffer_append_unicode_grow (bp, 1); + bp->utf8_buffer[bp->utf8_buflen] = '\0'; + /* Return it. */ + return bp->utf8_buffer; +} + +/* Free the memory pointed to by a 'struct unicode_string_buffer'. */ +static inline void +free_unicode_string_buffer (struct unicode_string_buffer *bp) +{ + free (bp->utf8_buffer); +} + + +/* ======================== Accumulating comments. ======================== */ + + +/* Accumulating a single comment line. */ + +static struct unicode_string_buffer comment_buffer; + +static inline void +comment_start () +{ + comment_buffer.utf8_buflen = 0; +} + +static inline bool +comment_at_start () +{ + return (comment_buffer.utf8_buflen == 0); +} + static inline void +comment_add (int c) +{ + unicode_string_buffer_append_unicode (&comment_buffer, c); +} + +static inline const char * comment_line_end () { + char *buffer = unicode_string_buffer_result (&comment_buffer); + size_t buflen = strlen (buffer); + while (buflen >= 1 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) --buflen; - if (buflen >= bufmax) - { - bufmax = 2 * bufmax + 10; - buffer = xrealloc (buffer, bufmax); - } buffer[buflen] = '\0'; savable_comment_add (buffer); + return buffer; } + /* These are for tracking whether comments count as immediately before keyword. */ static int last_comment_line; static int last_non_comment_line; -/* 2. Outside strings, replace backslash-newline with nothing and a comment - with nothing. */ +/* ======================== Recognizing comments. ======================== */ + + +/* Recognizing the "coding" comment. + As specified in PEP 0263, it takes the form + "coding" [":"|"="] {alphanumeric or "-" or "_" or "*"}* + and is located in a comment in a line that + - is either the first or second line, + - is not a continuation line, + - contains no other tokens except this comment. */ + +/* Canonicalized encoding name for the current input file. */ +static const char *xgettext_current_file_source_encoding; + +#if HAVE_ICONV +/* Converter from xgettext_current_file_source_encoding to UTF-8 (except from + ASCII or UTF-8, when this conversion is a no-op). */ +static iconv_t xgettext_current_file_source_iconv; +#endif + +static inline void +set_current_file_source_encoding (const char *canon_encoding) +{ + xgettext_current_file_source_encoding = canon_encoding; + + if (xgettext_current_file_source_encoding != po_charset_ascii + && xgettext_current_file_source_encoding != po_charset_utf8) + { +#if HAVE_ICONV + iconv_t cd; + + /* Avoid glibc-2.1 bug with EUC-KR. */ +# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION + if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0) + cd = (iconv_t)(-1); + else +# endif + cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding); + if (cd == (iconv_t)(-1)) + error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\ +Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \ +and iconv() does not support this conversion."), + xgettext_current_file_source_encoding, po_charset_utf8, + basename (program_name)); + xgettext_current_file_source_iconv = cd; +#else + error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\ +Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \ +This version was built without iconv()."), + xgettext_global_source_encoding, po_charset_utf8, + basename (program_name)); +#endif + } + + xgettext_current_source_encoding = xgettext_current_file_source_encoding; +#if HAVE_ICONV + xgettext_current_source_iconv = xgettext_current_file_source_iconv; +#endif +} + +static inline void +try_to_extract_coding (const char *comment) +{ + const char *p = strstr (comment, "coding"); + + if (p != NULL) + { + p += 6; + if (*p == ':' || *p == '=') + { + p++; + while (*p == ' ' || *p == '\t') + p++; + { + const char *encoding_start = p; + + while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.') + p++; + { + const char *encoding_end = p; + + if (encoding_end > encoding_start) + { + /* Extract the encoding string. */ + size_t encoding_len = encoding_end - encoding_start; + char *encoding = (char *) xmalloc (encoding_len + 1); + + memcpy (encoding, encoding_start, encoding_len); + encoding[encoding_len] = '\0'; + + { + /* Canonicalize it. */ + const char *canon_encoding = po_charset_canonicalize (encoding); + if (canon_encoding == NULL) + { + error_at_line (0, 0, + logical_file_name, line_number - 1, _("\ +Unknown encoding \"%s\". Proceeding with ASCII instead."), + encoding); + canon_encoding = po_charset_ascii; + } + + /* Activate it. */ + set_current_file_source_encoding (canon_encoding); + } + + free (encoding); + } + } + } + } + } +} + +/* Tracking whether the current line is a continuation line or contains a + non-blank character. */ +static bool continuation_or_nonblank_line = false; + + +/* Phase 3: Outside strings, replace backslash-newline with nothing and a + comment with nothing. */ static int -phase2_getc () +phase3_getc () { int c; for (;;) { - c = phase1_getc (); + c = phase2_getc (); if (c == '\\') { - c = phase1_getc (); + c = phase2_getc (); if (c != '\n') { - phase1_ungetc (c); + phase2_ungetc (c); /* This shouldn't happen usually, because "A backslash is illegal elsewhere on a line outside a string literal." */ return '\\'; } /* Eat backslash-newline. */ + continuation_or_nonblank_line = true; } else if (c == '#') { /* Eat a comment. */ + const char *comment; + last_comment_line = line_number; comment_start (); for (;;) { - c = phase1_getc (); - if (c == EOF || c == '\n') + c = phase2_getc (); + if (c == UEOF || c == '\n') break; /* We skip all leading white space, but not EOLs. */ - if (!(buflen == 0 && (c == ' ' || c == '\t'))) + if (!(comment_at_start () && (c == ' ' || c == '\t'))) comment_add (c); } - comment_line_end (); + comment = comment_line_end (); + if (line_number - 1 <= 2 && !continuation_or_nonblank_line) + try_to_extract_coding (comment); + continuation_or_nonblank_line = false; return c; } else - return c; + { + if (c == '\n') + continuation_or_nonblank_line = false; + else if (!(c == ' ' || c == '\t' || c == '\f')) + continuation_or_nonblank_line = true; + return c; + } } } /* Supports only one pushback character. */ static void -phase2_ungetc (int c) +phase3_ungetc (int c) +{ + phase2_ungetc (c); +} + + +/* ========================= Accumulating strings. ======================== */ + +/* Return value of phase7_getuc when EOF is reached. */ +#define P7_EOF (-1) +#define P7_STRING_END (-2) + +/* Convert an UTF-16 or UTF-32 code point to a return value that can be + distinguished from a single-byte return value. */ +#define UNICODE(code) (0x100 + (code)) + +/* Test a return value of phase7_getuc whether it designates an UTF-16 or + UTF-32 code point. */ +#define IS_UNICODE(p7_result) ((p7_result) >= 0x100) + +/* Extract the UTF-16 or UTF-32 code of a return value that satisfies + IS_UNICODE. */ +#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100) + +/* A string buffer type that allows appending bytes (in the + xgettext_current_source_encoding) or Unicode characters. + Returns the entire string in UTF-8 encoding. */ + +struct mixed_string_buffer +{ + /* The part of the string that has already been converted to UTF-8. */ + char *utf8_buffer; + size_t utf8_buflen; + size_t utf8_allocated; + /* The first half of an UTF-16 surrogate character. */ + unsigned short utf16_surr; + /* The part of the string that is still in the source encoding. */ + char *curr_buffer; + size_t curr_buflen; + size_t curr_allocated; +}; + +/* Initialize a 'struct mixed_string_buffer' to empty. */ +static inline void +init_mixed_string_buffer (struct mixed_string_buffer *bp) +{ + bp->utf8_buffer = NULL; + bp->utf8_buflen = 0; + bp->utf8_allocated = 0; + bp->utf16_surr = 0; + bp->curr_buffer = NULL; + bp->curr_buflen = 0; + bp->curr_allocated = 0; +} + +/* Auxiliary function: Append a byte to bp->curr. */ +static inline void +mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c) +{ + if (bp->curr_buflen == bp->curr_allocated) + { + bp->curr_allocated = 2 * bp->curr_allocated + 10; + bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated); + } + bp->curr_buffer[bp->curr_buflen++] = c; +} + +/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ +static inline void +mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count) +{ + if (bp->utf8_buflen + count > bp->utf8_allocated) + { + size_t new_allocated = 2 * bp->utf8_allocated + 10; + if (new_allocated < bp->utf8_buflen + count) + new_allocated = bp->utf8_buflen + count; + bp->utf8_allocated = new_allocated; + bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); + } +} + +/* Auxiliary function: Append a Unicode character to bp->utf8. + uc must be < 0x110000. */ +static inline void +mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, unsigned int uc) +{ + unsigned char utf8buf[6]; + int count = u8_uctomb (utf8buf, uc, 6); + + if (count < 0) + /* The caller should have ensured that uc is not out-of-range. */ + abort (); + + mixed_string_buffer_append_unicode_grow (bp, count); + memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); + bp->utf8_buflen += count; +} + +/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */ +static inline void +mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp) +{ + if (bp->utf16_surr != 0) + { + /* A half surrogate is invalid, therefore use U+FFFD instead. */ + mixed_string_buffer_append_unicode (bp, 0xfffd); + bp->utf16_surr = 0; + } +} + +/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */ +static inline void +mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno) +{ + if (bp->curr_buflen > 0) + { + char *curr; + size_t count; + + mixed_string_buffer_append_byte (bp, '\0'); + + /* Convert from the source encoding to UTF-8. */ + curr = from_current_source_encoding (bp->curr_buffer, + logical_file_name, lineno); + + /* Append it to bp->utf8_buffer. */ + count = strlen (curr); + mixed_string_buffer_append_unicode_grow (bp, count); + memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count); + bp->utf8_buflen += count; + + if (curr != bp->curr_buffer) + free (curr); + bp->curr_buflen = 0; + } +} + +/* Append a character or Unicode character to a 'struct mixed_string_buffer'. */ +static void +mixed_string_buffer_append (struct mixed_string_buffer *bp, int c) +{ + if (IS_UNICODE (c)) + { + /* Append a Unicode character. */ + + /* Switch from multibyte character mode to Unicode character mode. */ + mixed_string_buffer_flush_curr_buffer (bp, line_number); + + /* Test whether this character and the previous one form a Unicode + surrogate character pair. */ + if (bp->utf16_surr != 0 + && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))) + { + unsigned short utf16buf[2]; + unsigned int uc; + + utf16buf[0] = bp->utf16_surr; + utf16buf[1] = UNICODE_VALUE (c); + if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2) + abort (); + + mixed_string_buffer_append_unicode (bp, uc); + bp->utf16_surr = 0; + } + else + { + mixed_string_buffer_flush_utf16_surr (bp); + + if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00)) + bp->utf16_surr = UNICODE_VALUE (c); + else + mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c)); + } + } + else + { + /* Append a single byte. */ + + /* Switch from Unicode character mode to multibyte character mode. */ + mixed_string_buffer_flush_utf16_surr (bp); + + /* When a newline is seen, convert the accumulated multibyte sequence. + This ensures a correct line number in the error message in case of + a conversion error. The "- 1" is to account for the newline. */ + if (c == '\n') + mixed_string_buffer_flush_curr_buffer (bp, line_number - 1); + + mixed_string_buffer_append_byte (bp, (unsigned char) c); + } +} + +/* Return the string buffer's contents. */ +static char * +mixed_string_buffer_result (struct mixed_string_buffer *bp) +{ + /* Flush all into bp->utf8_buffer. */ + mixed_string_buffer_flush_utf16_surr (bp); + mixed_string_buffer_flush_curr_buffer (bp, line_number); + /* NUL-terminate it. */ + mixed_string_buffer_append_unicode_grow (bp, 1); + bp->utf8_buffer[bp->utf8_buflen] = '\0'; + /* Return it. */ + return bp->utf8_buffer; +} + +/* Free the memory pointed to by a 'struct mixed_string_buffer'. */ +static inline void +free_mixed_string_buffer (struct mixed_string_buffer *bp) { - phase1_ungetc (c); + free (bp->utf8_buffer); + free (bp->curr_buffer); } @@ -336,11 +1005,8 @@ struct token_ty u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...} ur"abc" \unnnn The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two - \unnnn items. The \ooo and \xnn values are ISO-8859-1 values: u"\xff" and - u"\u00ff" are the same. */ - -#define P7_EOF (-1) -#define P7_STRING_END (-2) + \unnnn items. The \ooo and \xnn values are in the current source encoding. + */ static int phase7_getuc (int quote_char, @@ -351,26 +1017,26 @@ phase7_getuc (int quote_char, for (;;) { - /* Use phase 1, because phase 2 elides comments. */ - c = phase1_getc (); + /* Use phase 2, because phase 3 elides comments. */ + c = phase2_getc (); - if (c == EOF) + if (c == UEOF) return P7_EOF; if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0)) { if (triple) { - int c1 = phase1_getc (); + int c1 = phase2_getc (); if (c1 == quote_char) { - int c2 = phase1_getc (); + int c2 = phase2_getc (); if (c2 == quote_char) return P7_STRING_END; - phase1_ungetc (c2); + phase2_ungetc (c2); } - phase1_ungetc (c1); - return c; + phase2_ungetc (c1); + return UNICODE (c); } else return P7_STRING_END; @@ -381,7 +1047,7 @@ phase7_getuc (int quote_char, if (triple) { *backslash_counter = 0; - return '\n'; + return UNICODE ('\n'); } /* In r"..." and ur"..." strings, newline is only allowed immediately after an odd number of backslashes (although the @@ -389,9 +1055,9 @@ phase7_getuc (int quote_char, if (!(interpret_ansic || (*backslash_counter & 1) == 0)) { *backslash_counter = 0; - return '\n'; + return UNICODE ('\n'); } - phase1_ungetc (c); + phase2_ungetc (c); error_with_progname = false; error (0, 0, _("%s:%d: warning: unterminated string"), logical_file_name, line_number); @@ -402,7 +1068,7 @@ phase7_getuc (int quote_char, if (c != '\\') { *backslash_counter = 0; - return c; + return UNICODE (c); } /* Backslash handling. */ @@ -410,15 +1076,15 @@ phase7_getuc (int quote_char, if (!interpret_ansic && !interpret_unicode) { ++*backslash_counter; - return '\\'; + return UNICODE ('\\'); } /* Dispatch according to the character following the backslash. */ - c = phase1_getc (); - if (c == EOF) + c = phase2_getc (); + if (c == UEOF) { ++*backslash_counter; - return '\\'; + return UNICODE ('\\'); } if (interpret_ansic) @@ -428,60 +1094,60 @@ phase7_getuc (int quote_char, continue; case '\\': ++*backslash_counter; - return c; + return UNICODE (c); case '\'': case '"': *backslash_counter = 0; - return c; + return UNICODE (c); case 'a': *backslash_counter = 0; - return '\a'; + return UNICODE ('\a'); case 'b': *backslash_counter = 0; - return '\b'; + return UNICODE ('\b'); case 'f': *backslash_counter = 0; - return '\f'; + return UNICODE ('\f'); case 'n': *backslash_counter = 0; - return '\n'; + return UNICODE ('\n'); case 'r': *backslash_counter = 0; - return '\r'; + return UNICODE ('\r'); case 't': *backslash_counter = 0; - return '\t'; + return UNICODE ('\t'); case 'v': *backslash_counter = 0; - return '\v'; + return UNICODE ('\v'); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { int n = c - '0'; - c = phase1_getc (); - if (c != EOF) + c = phase2_getc (); + if (c != UEOF) { if (c >= '0' && c <= '7') { n = (n << 3) + (c - '0'); - c = phase1_getc (); - if (c != EOF) + c = phase2_getc (); + if (c != UEOF) { if (c >= '0' && c <= '7') n = (n << 3) + (c - '0'); else - phase1_ungetc (c); + phase2_ungetc (c); } } else - phase1_ungetc (c); + phase2_ungetc (c); } *backslash_counter = 0; return (unsigned char) n; } case 'x': { - int c1 = phase1_getc (); + int c1 = phase2_getc (); int n1; if (c1 >= '0' && c1 <= '9') @@ -495,7 +1161,7 @@ phase7_getuc (int quote_char, if (n1 >= 0) { - int c2 = phase1_getc (); + int c2 = phase2_getc (); int n2; if (c2 >= '0' && c2 <= '9') @@ -513,12 +1179,12 @@ phase7_getuc (int quote_char, return (unsigned char) ((n1 << 4) + n2); } - phase1_ungetc (c2); + phase2_ungetc (c2); } - phase1_ungetc (c1); - phase1_ungetc (c); + phase2_ungetc (c1); + phase2_ungetc (c); ++*backslash_counter; - return '\\'; + return UNICODE ('\\'); } } @@ -532,7 +1198,7 @@ phase7_getuc (int quote_char, for (i = 0; i < 4; i++) { - int c1 = phase1_getc (); + int c1 = phase2_getc (); if (c1 >= '0' && c1 <= '9') n = (n << 4) + (c1 - '0'); @@ -542,18 +1208,18 @@ phase7_getuc (int quote_char, n = (n << 4) + (c1 - 'a' + 10); else { - phase1_ungetc (c1); + phase2_ungetc (c1); while (--i >= 0) - phase1_ungetc (buf[i]); - phase1_ungetc (c); + phase2_ungetc (buf[i]); + phase2_ungetc (c); ++*backslash_counter; - return '\\'; + return UNICODE ('\\'); } buf[i] = c1; } *backslash_counter = 0; - return n; + return UNICODE (n); } if (interpret_ansic) @@ -566,7 +1232,7 @@ phase7_getuc (int quote_char, for (i = 0; i < 8; i++) { - int c1 = phase1_getc (); + int c1 = phase2_getc (); if (c1 >= '0' && c1 <= '9') n = (n << 4) + (c1 - '0'); @@ -576,12 +1242,12 @@ phase7_getuc (int quote_char, n = (n << 4) + (c1 - 'a' + 10); else { - phase1_ungetc (c1); + phase2_ungetc (c1); while (--i >= 0) - phase1_ungetc (buf[i]); - phase1_ungetc (c); + phase2_ungetc (buf[i]); + phase2_ungetc (c); ++*backslash_counter; - return '\\'; + return UNICODE ('\\'); } buf[i] = c1; @@ -589,7 +1255,7 @@ phase7_getuc (int quote_char, if (n < 0x110000) { *backslash_counter = 0; - return n; + return UNICODE (n); } error_with_progname = false; @@ -598,15 +1264,15 @@ phase7_getuc (int quote_char, error_with_progname = true; while (--i >= 0) - phase1_ungetc (buf[i]); - phase1_ungetc (c); + phase2_ungetc (buf[i]); + phase2_ungetc (c); ++*backslash_counter; - return '\\'; + return UNICODE ('\\'); } if (c == 'N') { - int c1 = phase1_getc (); + int c1 = phase2_getc (); if (c1 == '{') { unsigned char buf[UNINAME_MAX + 1]; @@ -615,16 +1281,16 @@ phase7_getuc (int quote_char, for (i = 0; i < UNINAME_MAX; i++) { - int c2 = phase1_getc (); + int c2 = phase2_getc (); if (!(c2 >= ' ' && c2 <= '~')) { - phase1_ungetc (c2); + phase2_ungetc (c2); while (--i >= 0) - phase1_ungetc (buf[i]); - phase1_ungetc (c1); - phase1_ungetc (c); + phase2_ungetc (buf[i]); + phase2_ungetc (c1); + phase2_ungetc (c); ++*backslash_counter; - return '\\'; + return UNICODE ('\\'); } if (c2 == '}') break; @@ -636,24 +1302,24 @@ phase7_getuc (int quote_char, if (n != UNINAME_INVALID) { *backslash_counter = 0; - return n; + return UNICODE (n); } - phase1_ungetc ('}'); + phase2_ungetc ('}'); while (--i >= 0) - phase1_ungetc (buf[i]); + phase2_ungetc (buf[i]); } - phase1_ungetc (c1); - phase1_ungetc (c); + phase2_ungetc (c1); + phase2_ungetc (c); ++*backslash_counter; - return '\\'; + return UNICODE ('\\'); } } } - phase1_ungetc (c); + phase2_ungetc (c); ++*backslash_counter; - return '\\'; + return UNICODE ('\\'); } } @@ -681,11 +1347,11 @@ phase5_get (token_ty *tp) for (;;) { tp->line_number = line_number; - c = phase2_getc (); + c = phase3_getc (); switch (c) { - case EOF: + case UEOF: tp->type = token_type_eof; return; @@ -712,8 +1378,8 @@ phase5_get (token_ty *tp) { case '.': { - int c1 = phase2_getc (); - phase2_ungetc (c1); + int c1 = phase3_getc (); + phase3_ungetc (c1); if (!(c1 >= '0' && c1 <= '9')) { @@ -751,7 +1417,7 @@ phase5_get (token_ty *tp) buffer = xrealloc (buffer, bufmax); } buffer[bufpos++] = c; - c = phase2_getc (); + c = phase3_getc (); switch (c) { case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': @@ -769,7 +1435,7 @@ phase5_get (token_ty *tp) case '5': case '6': case '7': case '8': case '9': continue; default: - phase2_ungetc (c); + phase3_ungetc (c); break; } break; @@ -787,9 +1453,7 @@ phase5_get (token_ty *tp) /* Strings. */ { - static unsigned short *buffer; - static int bufmax; - int bufpos; + struct mixed_string_buffer literal; int quote_char; bool interpret_ansic; bool interpret_unicode; @@ -798,7 +1462,7 @@ phase5_get (token_ty *tp) case 'R': case 'r': { - int c1 = phase1_getc (); + int c1 = phase2_getc (); if (c1 == '"' || c1 == '\'') { quote_char = c1; @@ -806,13 +1470,13 @@ phase5_get (token_ty *tp) interpret_unicode = false; goto string; } - phase1_ungetc (c1); + phase2_ungetc (c1); goto symbol; } case 'U': case 'u': { - int c1 = phase1_getc (); + int c1 = phase2_getc (); if (c1 == '"' || c1 == '\'') { quote_char = c1; @@ -822,7 +1486,7 @@ phase5_get (token_ty *tp) } if (c1 == 'R' || c1 == 'r') { - int c2 = phase1_getc (); + int c2 = phase2_getc (); if (c2 == '"' || c2 == '\'') { quote_char = c2; @@ -830,9 +1494,9 @@ phase5_get (token_ty *tp) interpret_unicode = true; goto string; } - phase1_ungetc (c2); + phase2_ungetc (c2); } - phase1_ungetc (c1); + phase2_ungetc (c1); goto symbol; } @@ -843,75 +1507,40 @@ phase5_get (token_ty *tp) string: triple = false; { - int c1 = phase1_getc (); + int c1 = phase2_getc (); if (c1 == quote_char) { - int c2 = phase1_getc (); + int c2 = phase2_getc (); if (c2 == quote_char) triple = true; else { - phase1_ungetc (c2); - phase1_ungetc (c1); + phase2_ungetc (c2); + phase2_ungetc (c1); } } else - phase1_ungetc (c1); + phase2_ungetc (c1); } backslash_counter = 0; - /* Start accumulating the string. We store the string in - UTF-16 before converting it to UTF-8. Why not converting - every character directly to UTF-8? Because a string can - contain surrogates like u"\uD800\uDF00", and we must - combine them to a single UTF-8 character. */ - bufpos = 0; + /* Start accumulating the string. */ + init_mixed_string_buffer (&literal); for (;;) { int uc = phase7_getuc (quote_char, triple, interpret_ansic, interpret_unicode, &backslash_counter); - unsigned int len; if (uc == P7_EOF || uc == P7_STRING_END) break; - assert (uc >= 0 && uc < 0x110000); - len = (uc < 0x10000 ? 1 : 2); - if (bufpos + len > bufmax) - { - bufmax = 2 * bufmax + 10; - buffer = - xrealloc (buffer, bufmax * sizeof (unsigned short)); - } - if (uc < 0x10000) - buffer[bufpos++] = uc; - else - { - buffer[bufpos++] = 0xd800 + ((uc - 0x10000) >> 10); - buffer[bufpos++] = 0xdc00 + ((uc - 0x10000) & 0x3ff); - } - } - /* Now convert from UTF-16 to UTF-8. */ - { - int pos; - unsigned char *utf8_string; - unsigned char *q; - - /* Each UTF-16 word needs 3 bytes at worst. */ - utf8_string = (unsigned char *) xmalloc (3 * bufpos + 1); - for (pos = 0, q = utf8_string; pos < bufpos; ) - { - unsigned int uc; - int n; + if (IS_UNICODE (uc)) + assert (UNICODE_VALUE (uc) >= 0 + && UNICODE_VALUE (uc) < 0x110000); - pos += u16_mbtouc (&uc, buffer + pos, bufpos - pos); - n = u8_uctomb (q, uc, 6); - assert (n > 0); - q += n; - } - *q = '\0'; - assert (q - utf8_string <= 3 * bufpos); - tp->string = (char *) utf8_string; - } + mixed_string_buffer_append (&literal, uc); + } + tp->string = xstrdup (mixed_string_buffer_result (&literal)); + free_mixed_string_buffer (&literal); tp->comment = add_reference (savable_comment); tp->type = token_type_string; return; @@ -1124,9 +1753,11 @@ extract_parenthesized (message_list_ty *mlp, if (extract_all) { + xgettext_current_source_encoding = po_charset_utf8; savable_comment_to_xgettext_comment (token.comment); remember_a_message (mlp, token.string, inner_context, &pos); savable_comment_reset (); + xgettext_current_source_encoding = xgettext_current_file_source_encoding; } else { @@ -1137,18 +1768,22 @@ extract_parenthesized (message_list_ty *mlp, /* Seen an msgid. */ message_ty *mp; + xgettext_current_source_encoding = po_charset_utf8; savable_comment_to_xgettext_comment (token.comment); mp = remember_a_message (mlp, token.string, inner_context, &pos); savable_comment_reset (); + xgettext_current_source_encoding = xgettext_current_file_source_encoding; if (plural_commas > 0) plural_mp = mp; } else { /* Seen an msgid_plural. */ + xgettext_current_source_encoding = po_charset_utf8; remember_a_message_plural (plural_mp, token.string, inner_context, &pos); + xgettext_current_source_encoding = xgettext_current_file_source_encoding; plural_mp = NULL; } } @@ -1184,9 +1819,6 @@ extract_python (FILE *f, { message_list_ty *mlp = mdlp->item[0]->messages; - /* We convert our strings to UTF-8 encoding. */ - xgettext_current_source_encoding = po_charset_utf8; - fp = f; real_file_name = real_filename; logical_file_name = xstrdup (logical_filename); @@ -1195,6 +1827,18 @@ extract_python (FILE *f, last_comment_line = -1; last_non_comment_line = -1; + xgettext_current_file_source_encoding = xgettext_global_source_encoding; +#if HAVE_ICONV + xgettext_current_file_source_iconv = xgettext_global_source_iconv; +#endif + + xgettext_current_source_encoding = xgettext_current_file_source_encoding; +#if HAVE_ICONV + xgettext_current_source_iconv = xgettext_current_file_source_iconv; +#endif + + continuation_or_nonblank_line = false; + open_pbb = 0; flag_context_list_table = flag_table; |