diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/ChangeLog | 36 | ||||
-rw-r--r-- | src/msgcomm.c | 3 | ||||
-rw-r--r-- | src/po-charset.c | 2 | ||||
-rw-r--r-- | src/po-charset.h | 3 | ||||
-rw-r--r-- | src/po-lex.c | 1272 | ||||
-rw-r--r-- | src/po-lex.h | 18 | ||||
-rw-r--r-- | src/xgettext.c | 6 |
7 files changed, 940 insertions, 400 deletions
diff --git a/src/ChangeLog b/src/ChangeLog index 7738401..8279689 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,39 @@ +2001-07-01 Bruno Haible <haible@clisp.cons.org> + + * po-charset.h (po_lex_charset): New declaration. + * po-charset.c (po_lex_charset): Export variable. + * po-lex.h: Include xerror.h. + (gram_pos_column): New declaration. + (po_gram_error): Also output current column number. + (po_gram_error_at_line): Work around ## problem in gcc. + * po-lex.c: Include linebreak.h and utf8-ucs4.h. + (gram_pos_column): New variable. + (po_gram_error): Also output current column number. + (MBCHAR_BUF_SIZE): New macro. + (struct mbchar, mbchar_t): New types. + (memcpy_small, mb_iseof, mb_ptr, mb_len, mb_iseq, mb_isnul, mb_cmp, + mb_equal, mb_isascii): New functions. + (MB_UNPRINTABLE_WIDTH): New macro. + (mb_width, mb_putc, mb_setascii, mb_copy): New functions. + (NPUSHBACK): New macro. + (struct mbfile, mbfile_t): New types. + (signal_eilseq): New variable. + (mbfile_init, mbfile_getc, mbfile_ungetc): New functions. + (mbf): New variable. + (fp): Remove variable. + (lex_open): Initialize mbf, gram_pos_column, signal_eilseq. + (lex_close): Reset mbf, gram_pos_column, signal_eilseq. + (lex_getc): Return a multibyte character. Update gram_pos_column. + (lex_ungetc): Take a multibyte character. Update gram_pos_column. + (keyword_p): Use po_gram_error_at_line instead of po_gram_error. + No column number needed here. + (control_sequence): Read multibyte characters instead of bytes. + (po_gram_lex): Likewise. + * xgettext.c (exclude_directive_domain): Use po_gram_error_at_line + instead of po_gram_error. No column number needed here. + (extract_directive_domain): Likewise. + * msgcomm.c (extract_directive_domain): Likewise. + 2001-06-30 Bruno Haible <haible@clisp.cons.org> * message.h: Include stdbool.h. diff --git a/src/msgcomm.c b/src/msgcomm.c index 3c87336..b5b33ed 100644 --- a/src/msgcomm.c +++ b/src/msgcomm.c @@ -508,7 +508,8 @@ extract_directive_domain (that, name) po_ty *that; char *name; { - po_gram_error (_("this file may not contain domain directives")); + po_gram_error_at_line (&gram_pos, + _("this file may not contain domain directives")); } diff --git a/src/po-charset.c b/src/po-charset.c index 13cb87a..84ac634 100644 --- a/src/po-charset.c +++ b/src/po-charset.c @@ -92,7 +92,7 @@ po_charset_canonicalize (charset) } /* The PO file's encoding, as specified in the header entry. */ -static const char *po_lex_charset; +const char *po_lex_charset; #if HAVE_ICONV /* Converter from the PO file's encoding to UTF-8. */ diff --git a/src/po-charset.h b/src/po-charset.h index 440e909..cf3481e 100644 --- a/src/po-charset.h +++ b/src/po-charset.h @@ -28,6 +28,9 @@ compared using ==. */ extern const char *po_charset_canonicalize PARAMS ((const char *charset)); +/* The PO file's encoding, as specified in the header entry. */ +extern const char *po_lex_charset; + #if HAVE_ICONV /* Converter from the PO file's encoding to UTF-8. */ extern iconv_t po_lex_iconv; diff --git a/src/po-lex.c b/src/po-lex.c index 25060ec..c9f8fa6 100644 --- a/src/po-lex.c +++ b/src/po-lex.c @@ -1,7 +1,8 @@ /* GNU gettext - internationalization aids Copyright (C) 1995-1999, 2000, 2001 Free Software Foundation, Inc. - This file was written by Peter Miller <millerp@canb.auug.org.au> + This file was written by Peter Miller <millerp@canb.auug.org.au>. + Multibyte character handling by Bruno Haible <haible@clisp.cons.org>. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -24,9 +25,17 @@ #include <ctype.h> #include <errno.h> +#include <limits.h> #include <stdio.h> +#include <stdlib.h> +#include <string.h> #include <sys/types.h> +#if HAVE_ICONV +# include <iconv.h> +#endif + +#include "linebreak.h" #include "libgettext.h" #define _(str) gettext(str) @@ -51,6 +60,10 @@ #include "open-po.h" #include "po-gram-gen2.h" +#if HAVE_ICONV +# include "utf8-ucs4.h" +#endif + #if HAVE_C_BACKSLASH_A # define ALERT_CHAR '\a' #else @@ -58,63 +71,21 @@ #endif -static FILE *fp; +/* Current position within the PO file. */ lex_pos_ty gram_pos; -unsigned int gram_max_allowed_errors = 20; -static bool po_lex_obsolete; -static bool pass_comments = false; -bool pass_obsolete_entries = false; - - -/* Prototypes for local functions. Needed to ensure compiler checking of - function argument counts despite of K&R C function definition syntax. */ -static int lex_getc PARAMS ((void)); -static void lex_ungetc PARAMS ((int ch)); -static int keyword_p PARAMS ((const char *s)); -static int control_sequence PARAMS ((void)); - - -/* Open the PO file FNAME and prepare its lexical analysis. */ -void -lex_open (fname) - const char *fname; -{ - fp = open_po_file (fname, &gram_pos.file_name); - if (!fp) - error (EXIT_FAILURE, errno, - _("error while opening \"%s\" for reading"), fname); +int gram_pos_column; - gram_pos.line_number = 1; - po_lex_obsolete = false; - po_lex_charset_init (); -} - - -/* Terminate lexical analysis and close the current PO file. */ -void -lex_close () -{ - if (error_message_count > 0) - error (EXIT_FAILURE, 0, - ngettext ("found %d fatal error", "found %d fatal errors", - error_message_count), - error_message_count); - if (fp != stdin) - fclose (fp); - fp = NULL; - gram_pos.file_name = NULL; - gram_pos.line_number = 0; - error_message_count = 0; - po_lex_obsolete = false; - po_lex_charset_close (); -} +/* Error handling during the parsing of a PO file. + These functions can access gram_pos and gram_pos_column. */ +#if !(__STDC__ && \ + ((defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) \ + || (defined __GNUC__ && __GNUC__ >= 2))) /* CAUTION: If you change this function, you must also make identical - changes to the macros of the same name in src/po-lex.h */ + changes to the macro of the same name in src/po-lex.h */ -#if !__STDC__ || !defined __GNUC__ || __GNUC__ == 1 /* VARARGS1 */ void # if defined VA_START && __STDC__ @@ -130,17 +101,20 @@ po_gram_error (fmt, va_alist) char *buffer; VA_START (ap, fmt); - vasprintf (&buffer, fmt, ap); va_end (ap); error_with_progname = false; - error_at_line (0, 0, gram_pos.file_name, gram_pos.line_number, "%s", buffer); + error (0, 0, "%s:%d:%d: %s" gram_pos.file_name, gram_pos.line_number, + gram_pos_column + 1, buffer); error_with_progname = true; # else + char *totalfmt = xasprintf ("%s%s", "%s:%d:%d: ", fmt); + error_with_progname = false; - error_at_line (0, 0, gram_pos.file_name, gram_pos.line_number, fmt, - a1, a2, a3, a4, a5, a6, a7, a8); + error (0, 0, totalfmt, gram_pos.file_name, gram_pos.line_number, + gram_pos_column + 1, a1, a2, a3, a4, a5, a6, a7, a8); error_with_progname = true; + free (totalfmt); # endif /* Some messages need more than one line. Continuation lines are @@ -152,7 +126,6 @@ po_gram_error (fmt, va_alist) error (EXIT_FAILURE, 0, _("too many errors, aborting")); } - /* CAUTION: If you change this function, you must also make identical changes to the macro of the same name in src/po-lex.h */ @@ -172,7 +145,6 @@ po_gram_error_at_line (pp, fmt, va_alist) char *buffer; VA_START (ap, fmt); - vasprintf (&buffer, fmt, ap); va_end (ap); error_with_progname = false; @@ -194,64 +166,606 @@ po_gram_error_at_line (pp, fmt, va_alist) else if (error_message_count >= gram_max_allowed_errors) error (EXIT_FAILURE, 0, _("too many errors, aborting")); } + +#endif + + +/* The lowest level of PO file parsing converts bytes to multibyte characters. + This is needed + 1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first + translation phase maps bytes to characters. + 2. to keep track of the current column, for the sake of precise error + location. Emacs compile.el interprets the column in error messages + by default as a screen column number, not as character number. + 3. to avoid skipping backslash-newline in the midst of a multibyte + character. If XY is a multibyte character, X \ newline Y is invalid. + */ + +/* Multibyte character data type. */ +/* Note this depends on po_lex_charset and po_lex_iconv, which get set + while the file is being parsed. */ + +#define MBCHAR_BUF_SIZE 24 + +struct mbchar +{ + size_t bytes; /* number of bytes of current character, > 0 */ +#if HAVE_ICONV + bool uc_valid; /* true if uc is a valid Unicode character */ + unsigned int uc; /* if uc_valid: the current character */ +#endif + char buf[MBCHAR_BUF_SIZE]; /* room for the bytes */ +}; + +/* We want to pass multibyte characters by reference automatically, + therefore we use an array type. */ +typedef struct mbchar mbchar_t[1]; + +/* Prototypes for local functions. Needed to ensure compiler checking of + function argument counts despite of K&R C function definition syntax. */ +static inline void memcpy_small PARAMS ((void *dst, const void *src, size_t n)); +static inline bool mb_iseof PARAMS ((const mbchar_t mbc)); +static inline const char *mb_ptr PARAMS ((const mbchar_t mbc)); +static inline size_t mb_len PARAMS ((const mbchar_t mbc)); +static inline bool mb_iseq PARAMS ((const mbchar_t mbc, char sc)); +static inline bool mb_isnul PARAMS ((const mbchar_t mbc)); +static inline int mb_cmp PARAMS ((const mbchar_t mbc1, const mbchar_t mbc2)); +static inline bool mb_equal PARAMS ((const mbchar_t mbc1, const mbchar_t mbc2)); +static inline bool mb_isascii PARAMS ((const mbchar_t mbc)); +static int mb_width PARAMS ((const mbchar_t mbc)); +static inline void mb_putc PARAMS ((const mbchar_t mbc, FILE *stream)); +static inline void mb_setascii PARAMS ((mbchar_t mbc, char sc)); +static inline void mb_copy PARAMS ((mbchar_t new, const mbchar_t old)); + +/* A version of memcpy optimized for the case n <= 1. */ +static inline void +memcpy_small (dst, src, n) + void *dst; + const void *src; + size_t n; +{ + if (n > 0) + { + char *q = (char *) dst; + const char *p = (const char *) src; + + *q = *p; + if (--n > 0) + do *++q = *++p; while (--n > 0); + } +} + +/* EOF (not a real character) is represented with bytes = 0 and + uc_valid = false. */ +static inline bool +mb_iseof (mbc) + const mbchar_t mbc; +{ + return (mbc->bytes == 0); +} + +/* Access the current character. */ +static inline const char * +mb_ptr (mbc) + const mbchar_t mbc; +{ + return mbc->buf; +} +static inline size_t +mb_len (mbc) + const mbchar_t mbc; +{ + return mbc->bytes; +} + +/* Comparison of characters. */ + +static inline bool +mb_iseq (mbc, sc) + const mbchar_t mbc; + char sc; +{ +#if HAVE_ICONV + if (mbc->uc_valid) + return (mbc->uc == sc); + else +#endif + return (mbc->bytes == 1 && mbc->buf[0] == sc); +} + +static inline bool +mb_isnul (mbc) + const mbchar_t mbc; +{ +#if HAVE_ICONV + if (mbc->uc_valid) + return (mbc->uc == 0); + else +#endif + return (mbc->bytes == 1 && mbc->buf[0] == 0); +} + +static inline int +mb_cmp (mbc1, mbc2) + const mbchar_t mbc1; + const mbchar_t mbc2; +{ +#if HAVE_ICONV + if (mbc1->uc_valid && mbc2->uc_valid) + return (int) mbc1->uc - (int) mbc2->uc; + else #endif + return (mbc1->bytes == mbc2->bytes + ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) + : mbc1->bytes < mbc2->bytes + ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1) + : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1)); +} +static inline bool +mb_equal (mbc1, mbc2) + const mbchar_t mbc1; + const mbchar_t mbc2; +{ +#if HAVE_ICONV + if (mbc1->uc_valid && mbc2->uc_valid) + return mbc1->uc == mbc2->uc; + else +#endif + return (mbc1->bytes == mbc2->bytes + && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0); +} + +/* <ctype.h>, <wctype.h> classification. */ + +static inline bool +mb_isascii (mbc) + const mbchar_t mbc; +{ +#if HAVE_ICONV + if (mbc->uc_valid) + return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F); + else +#endif + return (mbc->bytes == 1 +#if CHAR_MIN < 0x00 /* to avoid gcc warning */ + && mbc->buf[0] >= 0x00 +#endif +#if CHAR_MAX > 0x7F /* to avoid gcc warning */ + && mbc->buf[0] <= 0x7F +#endif + ); +} + +/* Extra <wchar.h> function. */ + +/* Unprintable characters appear as a small box of width 1. */ +#define MB_UNPRINTABLE_WIDTH 1 -/* Read a single character, dealing with backslash-newline. */ static int -lex_getc () +mb_width (mbc) + const mbchar_t mbc; +{ +#if HAVE_ICONV + if (mbc->uc_valid) + { + unsigned int uc = mbc->uc; + const char *encoding = + (po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : ""); + int w = uc_width (uc, encoding); + /* For unprintable characters, arbitrarily return 0 for control + characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise. */ + if (w >= 0) + return w; + if (uc >= 0x0000 && uc <= 0x001F) + { + if (uc == 0x0009) + return 8 - (gram_pos_column & 7); + return 0; + } + if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029)) + return 0; + return MB_UNPRINTABLE_WIDTH; + } + else +#endif + { + if (mbc->bytes == 1) + { + if (mbc->buf[0] >= 0x00 && mbc->buf[0] <= 0x1F) + { + if (mbc->buf[0] == 0x09) + return 8 - (gram_pos_column & 7); + return 0; + } + if (mbc->buf[0] == 0x7F) + return 0; + } + return MB_UNPRINTABLE_WIDTH; + } +} + +/* Output. */ +static inline void +mb_putc (mbc, stream) + const mbchar_t mbc; + FILE *stream; +{ + fwrite (mbc->buf, 1, mbc->bytes, stream); +} + +/* Assignment. */ +static inline void +mb_setascii (mbc, sc) + mbchar_t mbc; + char sc; +{ + mbc->bytes = 1; +#if HAVE_ICONV + mbc->uc_valid = 1; + mbc->uc = sc; +#endif + mbc->buf[0] = sc; +} + +/* Copying a character. */ +static inline void +mb_copy (new, old) + mbchar_t new; + const mbchar_t old; +{ + memcpy_small (&new->buf[0], &old->buf[0], old->bytes); + new->bytes = old->bytes; +#if HAVE_ICONV + if ((new->uc_valid = old->uc_valid)) + new->uc = old->uc; +#endif +} + + +/* Multibyte character input. */ + +/* Number of characters that can be pushed back. + We need 1 for lex_getc, plus 1 for lex_ungetc. */ +#define NPUSHBACK 2 + +/* Data type of a multibyte character input stream. */ +struct mbfile +{ + FILE *fp; + bool eof_seen; + int have_pushback; + unsigned int bufcount; + char buf[MBCHAR_BUF_SIZE]; + struct mbchar pushback[NPUSHBACK]; +}; + +/* We want to pass multibyte streams by reference automatically, + therefore we use an array type. */ +typedef struct mbfile mbfile_t[1]; + +/* Whether invalid multibyte sequences in the input shall be signalled + or silently tolerated. */ +static bool signal_eilseq; + +/* Prototypes for local functions. Needed to ensure compiler checking of + function argument counts despite of K&R C function definition syntax. */ +static inline void mbfile_init PARAMS ((mbfile_t mbf, FILE *stream)); +static void mbfile_getc PARAMS ((mbchar_t mbc, mbfile_t mbf)); +static void mbfile_ungetc PARAMS ((const mbchar_t mbc, mbfile_t mbf)); + +static inline void +mbfile_init (mbf, stream) + mbfile_t mbf; + FILE *stream; { - int c; + mbf->fp = stream; + mbf->eof_seen = false; + mbf->have_pushback = 0; + mbf->bufcount = 0; +} + +static void +mbfile_getc (mbc, mbf) + mbchar_t mbc; + mbfile_t mbf; +{ + size_t bytes; + + /* If EOF has already been seen, don't use getc. This matters if + mbf->fp is connected to an interactive tty. */ + if (mbf->eof_seen) + goto eof; + /* Return character pushed back, if there is one. */ + if (mbf->have_pushback > 0) + { + mbf->have_pushback--; + mb_copy (mbc, &mbf->pushback[mbf->have_pushback]); + return; + } + + /* Before using iconv, we need at least one byte. */ + if (mbf->bufcount == 0) + { + int c = getc (mbf->fp); + if (c == EOF) + { + mbf->eof_seen = true; + goto eof; + } + mbf->buf[0] = (unsigned char) c; + mbf->bufcount++; + } + +#if HAVE_ICONV + if (po_lex_iconv != (iconv_t)(-1)) + { + /* Use iconv on an increasing number of bytes. Read only as many + bytes from mbf->fp as needed. This is needed to give reasonable + interactive behaviour when mbf->fp is connected to an interactive + tty. */ + for (;;) + { + char scratchbuf[64]; + const char *inptr = &mbf->buf[0]; + size_t insize = mbf->bufcount; + char *outptr = &scratchbuf[0]; + size_t outsize = sizeof (scratchbuf); + + if (iconv (po_lex_iconv, + (ICONV_CONST char **) &inptr, &insize, + &outptr, &outsize) + == (size_t)(-1)) + { + /* We expect that no character has been produced. */ + if (insize < mbf->bufcount) + abort (); + if (outsize < sizeof (scratchbuf)) + abort (); + + if (errno == EILSEQ) + { + /* An invalid multibyte sequence was encountered. */ + /* Return a single byte. */ + if (signal_eilseq) + po_gram_error (_("invalid multibyte sequence")); + bytes = 1; + mbc->uc_valid = false; + break; + } + else if (errno == EINVAL) + { + /* An incomplete multibyte character. */ + int c; + + if (mbf->bufcount == MBCHAR_BUF_SIZE) + { + /* An overlong incomplete multibyte sequence was + encountered. */ + /* Return a single byte. */ + bytes = 1; + mbc->uc_valid = false; + break; + } + + /* Read one more byte and retry iconv. */ + c = getc (mbf->fp); + if (c == EOF) + { + mbf->eof_seen = true; + if (signal_eilseq) + po_gram_error (_("\ +incomplete multibyte sequence at end of file")); + bytes = mbf->bufcount; + mbc->uc_valid = false; + break; + } + mbf->buf[mbf->bufcount++] = (unsigned char) c; + if (c == '\n') + { + if (signal_eilseq) + po_gram_error (_("\ +incomplete multibyte sequence at end of line")); + bytes = mbf->bufcount - 1; + mbc->uc_valid = false; + break; + } + } + else + error (EXIT_FAILURE, errno, _("iconv failure")); + } + else + { + size_t outbytes = sizeof (scratchbuf) - outsize; + bytes = mbf->bufcount - insize; + + /* We expect that one character has been produced. */ + if (bytes == 0) + abort (); + if (outbytes == 0) + abort (); + /* Convert it from UTF-8 to UCS-4. */ + mbc->uc_valid = true; + if (u8_mbtouc (&mbc->uc, scratchbuf, outbytes) != outbytes) + abort (); + break; + } + } + } + else +#endif + { + /* Return a single byte. */ + bytes = 1; +#if HAVE_ICONV + mbc->uc_valid = false; +#endif + } + + /* Return the multibyte sequence mbf->buf[0..bytes-1]. */ + memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes); + mbc->bytes = bytes; + + mbf->bufcount -= bytes; + if (mbf->bufcount > 0) + { + /* It's not worth calling memmove() for so few bytes. */ + unsigned int count = mbf->bufcount; + char *p = &mbf->buf[0]; + + do + { + *p = *(p + bytes); + p++; + } + while (--count > 0); + } + return; + +eof: + /* An mbchar_t with bytes == 0 is used to indicate EOF. */ + mbc->bytes = 0; +#if HAVE_ICONV + mbc->uc_valid = false; +#endif + return; +} + +static void +mbfile_ungetc (mbc, mbf) + const mbchar_t mbc; + mbfile_t mbf; +{ + if (mbf->have_pushback >= NPUSHBACK) + abort (); + mb_copy (&mbf->pushback[mbf->have_pushback], mbc); + mbf->have_pushback++; +} + + +/* Lexer variables. */ + +static mbfile_t mbf; +unsigned int gram_max_allowed_errors = 20; +static bool po_lex_obsolete; +static bool pass_comments = false; +bool pass_obsolete_entries = false; + + +/* Prototypes for local functions. Needed to ensure compiler checking of + function argument counts despite of K&R C function definition syntax. */ +static void lex_getc PARAMS ((mbchar_t mbc)); +static void lex_ungetc PARAMS ((const mbchar_t mbc)); +static int keyword_p PARAMS ((const char *s)); +static int control_sequence PARAMS ((void)); + + +/* Open the PO file FNAME and prepare its lexical analysis. */ +void +lex_open (fname) + const char *fname; +{ + FILE *fp = open_po_file (fname, &gram_pos.file_name); + if (!fp) + error (EXIT_FAILURE, errno, + _("error while opening \"%s\" for reading"), fname); + mbfile_init (mbf, fp); + + gram_pos.line_number = 1; + gram_pos_column = 0; + signal_eilseq = true; + po_lex_obsolete = false; + po_lex_charset_init (); +} + + +/* Terminate lexical analysis and close the current PO file. */ +void +lex_close () +{ + if (error_message_count > 0) + error (EXIT_FAILURE, 0, + ngettext ("found %d fatal error", "found %d fatal errors", + error_message_count), + error_message_count); + + if (mbf->fp != stdin) + fclose (mbf->fp); + mbf->fp = NULL; + gram_pos.file_name = NULL; + gram_pos.line_number = 0; + gram_pos_column = 0; + signal_eilseq = false; + error_message_count = 0; + po_lex_obsolete = false; + po_lex_charset_close (); +} + + +/* Read a single character, dealing with backslash-newline. + Also keep track of the current line number and column number. */ +static void +lex_getc (mbc) + mbchar_t mbc; +{ for (;;) { - c = getc (fp); - switch (c) + mbfile_getc (mbc, mbf); + + if (mb_iseof (mbc)) { - case EOF: - if (ferror (fp)) + if (ferror (mbf->fp)) error (EXIT_FAILURE, errno, _("error while reading \"%s\""), gram_pos.file_name); - return EOF; + break; + } - case '\n': - ++gram_pos.line_number; - return '\n'; + if (mb_iseq (mbc, '\n')) + { + gram_pos.line_number++; + gram_pos_column = 0; + break; + } + + gram_pos_column += mb_width (mbc); + + if (mb_iseq (mbc, '\\')) + { + mbchar_t mbc2; - case '\\': - c = getc (fp); - if (c != '\n') + mbfile_getc (mbc2, mbf); + + if (!mb_iseq (mbc2, '\n')) { - if (c != EOF) - ungetc (c, fp); - return '\\'; + if (!mb_iseof (mbc2)) + mbfile_ungetc (mbc2, mbf); + break; } - ++gram_pos.line_number; - break; - default: - return c; + gram_pos.line_number++; + gram_pos_column = 0; } + else + break; } } static void -lex_ungetc (c) - int c; +lex_ungetc (mbc) + const mbchar_t mbc; { - switch (c) + if (!mb_iseof (mbc)) { - case EOF: - break; - - case '\n': - --gram_pos.line_number; - /* FALLTHROUGH */ - - default: - ungetc (c, fp); - break; + if (mb_iseq (mbc, '\n')) + /* Decrement the line number, but don't care about the column. */ + gram_pos.line_number--; + else + /* Decrement the column number. Also works well enough for tabs. */ + gram_pos_column -= mb_width (mbc); + + mbfile_ungetc (mbc, mbf); } } @@ -268,7 +782,7 @@ keyword_p (s) return MSGID_PLURAL; if (!strcmp (s, "msgstr")) return MSGSTR; - po_gram_error (_("keyword \"%s\" unknown"), s); + po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s); return NAME; } @@ -276,101 +790,107 @@ keyword_p (s) static int control_sequence () { - int c; + mbchar_t mbc; int val; int max; - c = lex_getc (); - switch (c) - { - case 'n': - return '\n'; + lex_getc (mbc); + if (mb_len (mbc) == 1) + switch (mb_ptr (mbc) [0]) + { + case 'n': + return '\n'; - case 't': - return '\t'; + case 't': + return '\t'; - case 'b': - return '\b'; + case 'b': + return '\b'; - case 'r': - return '\r'; + case 'r': + return '\r'; - case 'f': - return '\f'; + case 'f': + return '\f'; - case 'v': - return '\v'; + case 'v': + return '\v'; - case 'a': - return ALERT_CHAR; + case 'a': + return ALERT_CHAR; - case '\\': - case '"': - return c; + case '\\': + case '"': + return mb_ptr (mbc) [0]; - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - val = 0; - max = 0; - for (;;) - { - /* Warning: not portable, can't depend on '0'..'7' ordering. */ - val = val * 8 + (c - '0'); - if (++max == 3) - break; - c = lex_getc (); - switch (c) - { - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - continue; - - default: + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + val = 0; + max = 0; + for (;;) + { + char c = mb_ptr (mbc) [0]; + /* Warning: not portable, can't depend on '0'..'7' ordering. */ + val = val * 8 + (c - '0'); + if (++max == 3) break; - } - lex_ungetc (c); - break; - } - return val; + lex_getc (mbc); + if (mb_len (mbc) == 1) + switch (mb_ptr (mbc) [0]) + { + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + continue; - case 'x': - c = lex_getc (); - if (c == EOF || !isxdigit (c)) - break; + default: + break; + } + lex_ungetc (mbc); + break; + } + return val; - val = 0; - for (;;) - { - val *= 16; - if (isdigit (c)) - /* Warning: not portable, can't depend on '0'..'9' ordering */ - val += c - '0'; - else if (isupper (c)) - /* Warning: not portable, can't depend on 'A'..'F' ordering */ - val += c - 'A' + 10; - else - /* Warning: not portable, can't depend on 'a'..'f' ordering */ - val += c - 'a' + 10; + case 'x': + lex_getc (mbc); + if (mb_iseof (mbc) || mb_len (mbc) != 1 || !isxdigit (mb_ptr (mbc) [0])) + break; - c = lex_getc (); - switch (c) - { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - continue; + val = 0; + for (;;) + { + char c = mb_ptr (mbc) [0]; + val *= 16; + if (isdigit (c)) + /* Warning: not portable, can't depend on '0'..'9' ordering */ + val += c - '0'; + else if (isupper (c)) + /* Warning: not portable, can't depend on 'A'..'F' ordering */ + val += c - 'A' + 10; + else + /* Warning: not portable, can't depend on 'a'..'f' ordering */ + val += c - 'a' + 10; + + lex_getc (mbc); + if (mb_len (mbc) == 1) + switch (mb_ptr (mbc) [0]) + { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + continue; - default: - break; - } - lex_ungetc (c); - break; - } - return val; + default: + break; + } + lex_ungetc (mbc); + break; + } + return val; - /* FIXME: \u and \U are not handled. */ - } + /* FIXME: \u and \U are not handled. */ + } + lex_ungetc (mbc); po_gram_error (_("invalid control sequence")); return ' '; } @@ -383,274 +903,248 @@ po_gram_lex () { static char *buf; static size_t bufmax; - int c; + mbchar_t mbc; size_t bufpos; for (;;) { - c = lex_getc (); - switch (c) - { - case EOF: - /* Yacc want this for end of file. */ - return 0; + lex_getc (mbc); - case '\n': - po_lex_obsolete = false; - break; + if (mb_iseof (mbc)) + /* Yacc want this for end of file. */ + return 0; - case ' ': - case '\t': - case '\r': - case '\f': - case '\v': - break; + if (mb_len (mbc) == 1) + switch (mb_ptr (mbc) [0]) + { + case '\n': + po_lex_obsolete = false; + /* Ignore whitespace, not relevant for the grammar. */ + break; - case '#': - c = lex_getc (); - if (c == '~') - /* A pseudo-comment beginning with #~ is found. This is - not a comment. It is the format for obsolete entries. - We simply discard the "#~" prefix. The following - characters are expected to be well formed. */ - { - po_lex_obsolete = true; - break; - } + case ' ': + case '\t': + case '\r': + case '\f': + case '\v': + /* Ignore whitespace, not relevant for the grammar. */ + break; - /* Accumulate comments into a buffer. If we have been asked - to pass comments, generate a COMMENT token, otherwise - discard it. */ - if (pass_comments) - { - bufpos = 0; - while (1) - { - if (bufpos >= bufmax) - { - bufmax += 100; - buf = xrealloc (buf, bufmax); - } - if (c == EOF || c == '\n') - break; + case '#': + lex_getc (mbc); + if (mb_iseq (mbc, '~')) + /* A pseudo-comment beginning with #~ is found. This is + not a comment. It is the format for obsolete entries. + We simply discard the "#~" prefix. The following + characters are expected to be well formed. */ + { + po_lex_obsolete = true; + break; + } - buf[bufpos++] = c; - c = lex_getc (); - } - buf[bufpos] = 0; + /* Accumulate comments into a buffer. If we have been asked + to pass comments, generate a COMMENT token, otherwise + discard it. */ + signal_eilseq = false; + if (pass_comments) + { + bufpos = 0; + while (1) + { + while (bufpos + mb_len (mbc) >= bufmax) + { + bufmax += 100; + buf = xrealloc (buf, bufmax); + } + if (mb_iseof (mbc) || mb_iseq (mbc, '\n')) + break; - po_gram_lval.string.string = buf; - po_gram_lval.string.pos = gram_pos; - po_gram_lval.string.obsolete = po_lex_obsolete; - po_lex_obsolete = false; - return COMMENT; - } - else - { - /* We do this in separate loop because collecting large - comments while they get not passed to the upper layers - is not very effective. */ - while (c != EOF && c != '\n') - c = lex_getc (); - po_lex_obsolete = false; - } - break; + memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc)); + bufpos += mb_len (mbc); - case '"': - /* Accumulate a string. */ - { -#if HAVE_ICONV - size_t bufmbpos = 0; -#endif + lex_getc (mbc); + } + buf[bufpos] = '\0'; + + po_gram_lval.string.string = buf; + po_gram_lval.string.pos = gram_pos; + po_gram_lval.string.obsolete = po_lex_obsolete; + po_lex_obsolete = false; + signal_eilseq = true; + return COMMENT; + } + else + { + /* We do this in separate loop because collecting large + comments while they get not passed to the upper layers + is not very effective. */ + while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n')) + lex_getc (mbc); + po_lex_obsolete = false; + signal_eilseq = true; + } + break; + case '"': + /* Accumulate a string. */ bufpos = 0; while (1) { - if (bufpos >= bufmax) + lex_getc (mbc); + while (bufpos + mb_len (mbc) >= bufmax) { bufmax += 100; buf = xrealloc (buf, bufmax); } - c = lex_getc (); - if (c == EOF) + if (mb_iseof (mbc)) { - po_gram_error (_("end-of-file within string")); + po_gram_error_at_line (&gram_pos, + _("end-of-file within string")); break; } - if (c == '\n') + if (mb_iseq (mbc, '\n')) { - po_gram_error (_("end-of-line within string")); + po_gram_error_at_line (&gram_pos, + _("end-of-line within string")); break; } -#if HAVE_ICONV - /* Interpret c only if it is the first byte of a multi-byte - character. Don't interpret it as ASCII when it is the - second byte. This is needed for the BIG5, BIG5HKSCS, GBK, - GB18030, SJIS, JOHAB encodings. */ - if (po_lex_iconv == (iconv_t)(-1) || bufmbpos == bufpos) -#endif + if (mb_iseq (mbc, '"')) + break; + if (mb_iseq (mbc, '\\')) { - if (c == '"') - break; - - if (c == '\\') - { - buf[bufpos++] = control_sequence (); -#if HAVE_ICONV - bufmbpos++; -#endif - continue; - } + buf[bufpos++] = control_sequence (); + continue; } - /* Add c to the accumulator. */ - buf[bufpos++] = c; -#if HAVE_ICONV - if (po_lex_iconv != (iconv_t)(-1)) - { - /* If c terminates a multibyte character, set - bufmbpos = bufpos. Otherwise keep bufmbpos - pointing at the start of the multibyte character. */ - char scratchbuf[64]; - const char *inptr = &buf[bufmbpos]; - size_t insize = bufpos - bufmbpos; - char *outptr = &scratchbuf[0]; - size_t outsize = sizeof (scratchbuf); - if (iconv (po_lex_iconv, - (ICONV_CONST char **) &inptr, &insize, - &outptr, &outsize) - == (size_t)(-1) - && errno == EILSEQ) - { - po_gram_error (_("invalid multibyte sequence")); - bufmbpos = bufpos; - } - else - bufmbpos = inptr - buf; - } -#endif + /* Add mbc to the accumulator. */ + memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc)); + bufpos += mb_len (mbc); } - buf[bufpos] = 0; + buf[bufpos] = '\0'; /* FIXME: Treatment of embedded \000 chars is incorrect. */ po_gram_lval.string.string = xstrdup (buf); po_gram_lval.string.pos = gram_pos; po_gram_lval.string.obsolete = po_lex_obsolete; return STRING; - } - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': - case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': - case 's': case 't': case 'u': case 'v': case 'w': case 'x': - case 'y': case 'z': - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': - case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': - case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': - case 'Y': case 'Z': - case '_': case '$': - bufpos = 0; - for (;;) - { - if (bufpos + 1 >= bufmax) - { - bufmax += 100; - buf = xrealloc (buf, bufmax); - } - buf[bufpos++] = c; - c = lex_getc (); - switch (c) - { - default: - break; - case 'a': case 'b': case 'c': case 'd': - case 'e': case 'f': case 'g': case 'h': - case 'i': case 'j': case 'k': case 'l': - case 'm': case 'n': case 'o': case 'p': - case 'q': case 'r': case 's': case 't': - case 'u': case 'v': case 'w': case 'x': - case 'y': case 'z': - case 'A': case 'B': case 'C': case 'D': - case 'E': case 'F': case 'G': case 'H': - case 'I': case 'J': case 'K': case 'L': - case 'M': case 'N': case 'O': case 'P': - case 'Q': case 'R': case 'S': case 'T': - case 'U': case 'V': case 'W': case 'X': - case 'Y': case 'Z': - case '_': case '$': - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - case '8': case '9': - continue; - } - break; - } - lex_ungetc (c); + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '_': case '$': + bufpos = 0; + for (;;) + { + char c = mb_ptr (mbc) [0]; + if (bufpos + 1 >= bufmax) + { + bufmax += 100; + buf = xrealloc (buf, bufmax); + } + buf[bufpos++] = c; + lex_getc (mbc); + if (mb_len (mbc) == 1) + switch (mb_ptr (mbc) [0]) + { + default: + break; + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case '_': case '$': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + continue; + } + break; + } + lex_ungetc (mbc); - buf[bufpos] = 0; + buf[bufpos] = '\0'; - c = keyword_p (buf); - if (c == NAME) { - po_gram_lval.string.string = xstrdup (buf); - po_gram_lval.string.pos = gram_pos; - po_gram_lval.string.obsolete = po_lex_obsolete; - } - else - { - po_gram_lval.pos.pos = gram_pos; - po_gram_lval.pos.obsolete = po_lex_obsolete; - } - return c; - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - bufpos = 0; - for (;;) - { - if (bufpos + 1 >= bufmax) + int k = keyword_p (buf); + if (k == NAME) { - bufmax += 100; - buf = xrealloc (buf, bufmax + 1); + po_gram_lval.string.string = xstrdup (buf); + po_gram_lval.string.pos = gram_pos; + po_gram_lval.string.obsolete = po_lex_obsolete; } - buf[bufpos++] = c; - c = lex_getc (); - switch (c) + else { - default: - break; - - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - case '8': case '9': - continue; + po_gram_lval.pos.pos = gram_pos; + po_gram_lval.pos.obsolete = po_lex_obsolete; } - break; + return k; } - lex_ungetc (c); - buf[bufpos] = 0; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + bufpos = 0; + for (;;) + { + char c = mb_ptr (mbc) [0]; + if (bufpos + 1 >= bufmax) + { + bufmax += 100; + buf = xrealloc (buf, bufmax + 1); + } + buf[bufpos++] = c; + lex_getc (mbc); + if (mb_len (mbc) == 1) + switch (mb_ptr (mbc) [0]) + { + default: + break; - po_gram_lval.number.number = atol (buf); - po_gram_lval.number.pos = gram_pos; - po_gram_lval.number.obsolete = po_lex_obsolete; - return NUMBER; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + continue; + } + break; + } + lex_ungetc (mbc); - case '[': - po_gram_lval.pos.pos = gram_pos; - po_gram_lval.pos.obsolete = po_lex_obsolete; - return '['; + buf[bufpos] = '\0'; - case ']': - po_gram_lval.pos.pos = gram_pos; - po_gram_lval.pos.obsolete = po_lex_obsolete; - return ']'; + po_gram_lval.number.number = atol (buf); + po_gram_lval.number.pos = gram_pos; + po_gram_lval.number.obsolete = po_lex_obsolete; + return NUMBER; - default: - /* This will cause a syntax error. */ - return JUNK; - } + case '[': + po_gram_lval.pos.pos = gram_pos; + po_gram_lval.pos.obsolete = po_lex_obsolete; + return '['; + + case ']': + po_gram_lval.pos.pos = gram_pos; + po_gram_lval.pos.obsolete = po_lex_obsolete; + return ']'; + + default: + /* This will cause a syntax error. */ + return JUNK; + } + else + /* This will cause a syntax error. */ + return JUNK; } } diff --git a/src/po-lex.h b/src/po-lex.h index bd1eb48..da2f5f2 100644 --- a/src/po-lex.h +++ b/src/po-lex.h @@ -25,6 +25,7 @@ #include "error.h" #include "progname.h" #include "pos.h" +#include "xerror.h" /* Lexical analyzer for reading PO files. */ @@ -33,6 +34,7 @@ /* Current position within the PO file. */ extern lex_pos_ty gram_pos; +extern int gram_pos_column; /* Number of parse errors within a PO file that cause the program to terminate. Cf. error_message_count, declared in <error.h>. */ @@ -68,17 +70,18 @@ extern void po_lex_pass_obsolete_entries PARAMS ((bool flag)); # define po_gram_error(fmt, ...) \ do { \ + char *totalfmt = xasprintf ("%s%s", "%s:%d:%d: ", fmt); \ error_with_progname = false; \ - error_at_line (0, 0, gram_pos.file_name, gram_pos.line_number, \ - fmt, __VA_ARGS__); \ + error (0, 0, totalfmt, gram_pos.file_name, gram_pos.line_number, \ + gram_pos_column + 1, __VA_ARGS__); \ error_with_progname = true; \ + free (totalfmt); \ if (*fmt == '.') \ --error_message_count; \ else if (error_message_count >= gram_max_allowed_errors) \ error (1, 0, _("too many errors, aborting")); \ } while (0) - /* CAUTION: If you change this macro, you must also make identical changes to the function of the same name in src/po-lex.c */ @@ -102,17 +105,18 @@ extern void po_lex_pass_obsolete_entries PARAMS ((bool flag)); # define po_gram_error(fmt, args...) \ do { \ + char *totalfmt = xasprintf ("%s%s", "%s:%d:%d: ", fmt); \ error_with_progname = false; \ - error_at_line (0, 0, gram_pos.file_name, gram_pos.line_number, \ - fmt, ## args); \ + error (0, 0, totalfmt, gram_pos.file_name, gram_pos.line_number, \ + gram_pos_column + 1 , ## args); \ error_with_progname = true; \ + free (totalfmt); \ if (*fmt == '.') \ --error_message_count; \ else if (error_message_count >= gram_max_allowed_errors) \ error (1, 0, _("too many errors, aborting")); \ } while (0) - /* CAUTION: If you change this macro, you must also make identical changes to the function of the same name in src/po-lex.c */ @@ -120,7 +124,7 @@ extern void po_lex_pass_obsolete_entries PARAMS ((bool flag)); do { \ error_with_progname = false; \ error_at_line (0, 0, (pos)->file_name, (pos)->line_number, \ - fmt, ## args); \ + fmt , ## args); \ error_with_progname = true; \ if (*fmt == '.') \ --error_message_count; \ diff --git a/src/xgettext.c b/src/xgettext.c index 712570a..9d3a599 100644 --- a/src/xgettext.c +++ b/src/xgettext.c @@ -673,7 +673,8 @@ exclude_directive_domain (pop, name) po_ty *pop; char *name; { - po_gram_error (_("this file may not contain domain directives")); + po_gram_error_at_line (&gram_pos, + _("this file may not contain domain directives")); } @@ -1090,7 +1091,8 @@ extract_directive_domain (that, name) po_ty *that; char *name; { - po_gram_error (_("this file may not contain domain directives")); + po_gram_error_at_line (&gram_pos, + _("this file may not contain domain directives")); } |