diff options
author | Bruno Haible <bruno@clisp.org> | 2003-02-14 14:23:42 +0000 |
---|---|---|
committer | Bruno Haible <bruno@clisp.org> | 2009-06-23 12:09:39 +0200 |
commit | 44b2e0ed471d80fc04a0d99f456954a5ebc18a08 (patch) | |
tree | c6cb5592bd0c505f713168d3620b8a5d84bf8c5b /gettext-tools/src/x-c.c | |
parent | 16e610c636d1fc8dac47d685e9bafb138c01dd1e (diff) | |
download | external_gettext-44b2e0ed471d80fc04a0d99f456954a5ebc18a08.zip external_gettext-44b2e0ed471d80fc04a0d99f456954a5ebc18a08.tar.gz external_gettext-44b2e0ed471d80fc04a0d99f456954a5ebc18a08.tar.bz2 |
Move src/x-c.c to gettext-tools/src/x-c.c.
Diffstat (limited to 'gettext-tools/src/x-c.c')
-rw-r--r-- | gettext-tools/src/x-c.c | 1487 |
1 files changed, 1487 insertions, 0 deletions
diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c new file mode 100644 index 0000000..16c86c0 --- /dev/null +++ b/gettext-tools/src/x-c.c @@ -0,0 +1,1487 @@ +/* xgettext C/C++/ObjectiveC backend. + Copyright (C) 1995-1998, 2000-2002 Free Software Foundation, Inc. + + This file was written by Peter Miller <millerp@canb.auug.org.au> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "message.h" +#include "x-c.h" +#include "xgettext.h" +#include "error.h" +#include "progname.h" +#include "xmalloc.h" +#include "exit.h" +#include "hash.h" +#include "gettext.h" + +#define _(s) gettext(s) + + +/* The ANSI C standard defines several phases of translation: + + 1. Terminate line by \n, regardless of the external representation + of a text line. Stdio does this for us. + + 2. Convert trigraphs to their single character equivalents. + + 3. Concatenate each line ending in backslash (\) with the following + line. + + 4. Replace each comment with a space character. + + 5. Parse each resulting logical line as preprocessing tokens a + white space. + + 6. Recognize and carry out directives (it also expands macros on + non-directive lines, which we do not do here). + + 7. Replaces escape sequences within character strings with their + single character equivalents (we do this in step 5, because we + don't have to worry about the #include argument). + + 8. Concatenates adjacent string literals to form single string + literals (because we don't expand macros, there are a few things + we will miss). + + 9. Converts the remaining preprocessing tokens to C tokens and + discards any white space from the translation unit. + + This lexer implements the above, and presents the scanner (in + xgettext.c) with a stream of C tokens. The comments are + accumulated in a buffer, and given to xgettext when asked for. */ + +enum xgettext_token_type_ty +{ + xgettext_token_type_eof, + xgettext_token_type_keyword, + xgettext_token_type_lparen, + xgettext_token_type_rparen, + xgettext_token_type_comma, + xgettext_token_type_string_literal, + xgettext_token_type_symbol +}; +typedef enum xgettext_token_type_ty xgettext_token_type_ty; + +typedef struct xgettext_token_ty xgettext_token_ty; +struct xgettext_token_ty +{ + xgettext_token_type_ty type; + + /* These fields are used only for xgettext_token_type_keyword. */ + int argnum1; + int argnum2; + + /* This field is used only for xgettext_token_type_string_literal. */ + char *string; + + /* These fields are only for + xgettext_token_type_keyword, + xgettext_token_type_string_literal. */ + lex_pos_ty pos; +}; + + +enum token_type_ty +{ + token_type_character_constant, + token_type_eof, + token_type_eoln, + token_type_hash, + token_type_lparen, + token_type_rparen, + token_type_comma, + token_type_name, + token_type_number, + token_type_string_literal, + token_type_symbol, + token_type_white_space +}; +typedef enum token_type_ty token_type_ty; + +typedef struct token_ty token_ty; +struct token_ty +{ + token_type_ty type; + char *string; /* for token_type_name, token_type_string_literal */ + long number; + int line_number; +}; + + +/* ========================= Lexer customization. ========================= */ + +static bool trigraphs = false; + +void +x_c_trigraphs () +{ + trigraphs = true; +} + + +/* ====================== Keyword set customization. ====================== */ + +/* If true extract all strings. */ +static bool extract_all = false; + +static hash_table keywords; +static bool default_keywords = true; + + +void +x_c_extract_all () +{ + extract_all = true; +} + + +void +x_c_keyword (const char *name) +{ + if (name == NULL) + default_keywords = false; + else + { + const char *end; + int argnum1; + int argnum2; + const char *colon; + + if (keywords.table == NULL) + init_hash (&keywords, 100); + + split_keywordspec (name, &end, &argnum1, &argnum2); + + /* The characters between name and end should form a valid C identifier. + A colon means an invalid parse in split_keywordspec(). */ + colon = strchr (name, ':'); + if (colon == NULL || colon >= end) + { + if (argnum1 == 0) + argnum1 = 1; + insert_entry (&keywords, name, end - name, + (void *) (long) (argnum1 + (argnum2 << 10))); + } + } +} + +bool +x_c_any_keywords () +{ + return (keywords.filled > 0) || default_keywords; +} + +/* Finish initializing the keywords hash table. + Called after argument processing, before each file is processed. */ +static void +init_keywords () +{ + if (default_keywords) + { + x_c_keyword ("gettext"); + x_c_keyword ("dgettext:2"); + x_c_keyword ("dcgettext:2"); + x_c_keyword ("ngettext:1,2"); + x_c_keyword ("dngettext:2,3"); + x_c_keyword ("dcngettext:2,3"); + x_c_keyword ("gettext_noop"); + default_keywords = false; + } +} + + +/* ================== Reading of characters and tokens. =================== */ + +/* Real filename, used in error messages about the input file. */ +static const char *real_file_name; + +/* Logical filename and line number, used to label the extracted messages. */ +static char *logical_file_name; +static int line_number; + +/* The input file stream. */ +static FILE *fp; + + +/* 1. Terminate line by \n, regardless of the external representation of + a text line. Stdio does this for us, we just need to check that + there are no I/O errors, and cope with potentially 2 characters of + pushback, not just the one that ungetc can cope with. */ + +/* Maximum used guaranteed to be < 4. */ +static unsigned char phase1_pushback[4]; +static int phase1_pushback_length; + + +static int +phase1_getc () +{ + int c; + + if (phase1_pushback_length) + { + c = phase1_pushback[--phase1_pushback_length]; + if (c == '\n') + ++line_number; + return c; + } + for (;;) + { + c = getc (fp); + switch (c) + { + case EOF: + if (ferror (fp)) + { + bomb: + error (EXIT_FAILURE, errno, _("\ +error while reading \"%s\""), real_file_name); + } + return EOF; + + case '\n': + ++line_number; + return '\n'; + + case '\\': + c = getc (fp); + if (c == EOF) + { + if (ferror (fp)) + goto bomb; + return '\\'; + } + if (c != '\n') + { + ungetc (c, fp); + return '\\'; + } + ++line_number; + break; + + default: + return c; + } + } +} + + +static void +phase1_ungetc (int c) +{ + switch (c) + { + case EOF: + break; + + case '\n': + --line_number; + /* FALLTHROUGH */ + + default: + phase1_pushback[phase1_pushback_length++] = c; + break; + } +} + + +/* 2. Convert trigraphs to their single character equivalents. Most + sane human beings vomit copiously at the mention of trigraphs, which + is why they are an option. */ + +/* Maximum used guaranteed to be < 4. */ +static unsigned char phase2_pushback[4]; +static int phase2_pushback_length; + + +static int +phase2_getc () +{ + int c; + + if (phase2_pushback_length) + return phase2_pushback[--phase2_pushback_length]; + if (!trigraphs) + return phase1_getc (); + + c = phase1_getc (); + if (c != '?') + return c; + c = phase1_getc (); + if (c != '?') + { + phase1_ungetc (c); + return '?'; + } + c = phase1_getc (); + switch (c) + { + case '(': + return '['; + case '/': + return '\\'; + case ')': + return ']'; + case '\'': + return '^'; + case '<': + return '{'; + case '!': + return '|'; + case '>': + return '}'; + case '-': + return '~'; + case '#': + return '='; + } + phase1_ungetc (c); + phase1_ungetc ('?'); + return '?'; +} + + +static void +phase2_ungetc (int c) +{ + if (c != EOF) + phase2_pushback[phase2_pushback_length++] = c; +} + + +/* 3. Concatenate each line ending in backslash (\) with the following + line. Basically, all you need to do is elide "\\\n" sequences from + the input. */ + +/* Maximum used guaranteed to be < 4. */ +static unsigned char phase3_pushback[4]; +static int phase3_pushback_length; + + +static int +phase3_getc () +{ + if (phase3_pushback_length) + return phase3_pushback[--phase3_pushback_length]; + for (;;) + { + int c = phase2_getc (); + if (c != '\\') + return c; + c = phase2_getc (); + if (c != '\n') + { + phase2_ungetc (c); + return '\\'; + } + } +} + + +static void +phase3_ungetc (int c) +{ + if (c != EOF) + phase3_pushback[phase3_pushback_length++] = c; +} + + +/* Accumulating comments. */ + +static char *buffer; +static size_t bufmax; +static size_t buflen; + +static inline void +comment_start () +{ + buflen = 0; +} + +static inline void +comment_add (int c) +{ + if (buflen >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen++] = c; +} + +static inline void +comment_line_end (size_t chars_to_remove) +{ + buflen -= chars_to_remove; + while (buflen >= 1 + && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) + --buflen; + if (chars_to_remove == 0 && buflen >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen] = '\0'; + xgettext_comment_add (buffer); +} + + +/* These are for tracking whether comments count as immediately before + keyword. */ +static int last_comment_line; +static int last_non_comment_line; +static int newline_count; + + +/* 4. Replace each comment that is not inside a character constant or + string literal with a space character. We need to remember the + comment for later, because it may be attached to a keyword string. + We also optionally understand C++ comments. */ + +static int +phase4_getc () +{ + int c; + bool last_was_star; + + c = phase3_getc (); + if (c != '/') + return c; + c = phase3_getc (); + switch (c) + { + default: + phase3_ungetc (c); + return '/'; + + case '*': + /* C comment. */ + comment_start (); + last_was_star = false; + for (;;) + { + c = phase3_getc (); + if (c == EOF) + break; + /* We skip all leading white space, but not EOLs. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + switch (c) + { + case '\n': + comment_line_end (1); + comment_start (); + last_was_star = false; + continue; + + case '*': + last_was_star = true; + continue; + + case '/': + if (last_was_star) + { + comment_line_end (2); + break; + } + /* FALLTHROUGH */ + + default: + last_was_star = false; + continue; + } + break; + } + last_comment_line = newline_count; + return ' '; + + case '/': + /* C++ or ISO C 99 comment. */ + comment_start (); + for (;;) + { + c = phase3_getc (); + if (c == '\n' || c == EOF) + break; + comment_add (c); + } + comment_line_end (0); + last_comment_line = newline_count; + return '\n'; + } +} + + +static void +phase4_ungetc (int c) +{ + phase3_ungetc (c); +} + + +/* 7. Replace escape sequences within character strings with their + single character equivalents. This is called from phase 5, because + we don't have to worry about the #include argument. There are + pathological cases which could bite us (like the DOS directory + separator), but just pretend it can't happen. */ + +#define P7_QUOTES (1000 + '"') +#define P7_QUOTE (1000 + '\'') +#define P7_NEWLINE (1000 + '\n') + +static int +phase7_getc () +{ + int c, n, j; + + /* Use phase 3, because phase 4 elides comments. */ + c = phase3_getc (); + + /* Return a magic newline indicator, so that we can distinguish + between the user requesting a newline in the string (e.g. using + "\n" or "\012") from the user failing to terminate the string or + character constant. The ANSI C standard says: 3.1.3.4 Character + Constants contain ``any character except single quote, backslash or + newline; or an escape sequence'' and 3.1.4 String Literals contain + ``any character except double quote, backslash or newline; or an + escape sequence''. + + Most compilers give a fatal error in this case, however gcc is + stupidly silent, even though this is a very common typo. OK, so + gcc --pedantic will tell me, but that gripes about too much other + stuff. Could I have a ``gcc -Wnewline-in-string'' option, or + better yet a ``gcc -fno-newline-in-string'' option, please? Gcc is + also inconsistent between string literals and character constants: + you may not embed newlines in character constants; try it, you get + a useful diagnostic. --PMiller */ + if (c == '\n') + return P7_NEWLINE; + + if (c == '"') + return P7_QUOTES; + if (c == '\'') + return P7_QUOTE; + if (c != '\\') + return c; + c = phase3_getc (); + switch (c) + { + default: + /* Unknown escape sequences really should be an error, but just + ignore them, and let the real compiler complain. */ + phase3_ungetc (c); + return '\\'; + + case '"': + case '\'': + case '?': + case '\\': + return c; + + case 'a': + return '\a'; + case 'b': + return '\b'; + + /* The \e escape is preculiar to gcc, and assumes an ASCII + character set (or superset). We don't provide support for it + here. */ + + case 'f': + return '\f'; + case 'n': + return '\n'; + case 'r': + return '\r'; + case 't': + return '\t'; + case 'v': + return '\v'; + + case 'x': + c = phase3_getc (); + switch (c) + { + default: + phase3_ungetc (c); + phase3_ungetc ('x'); + return '\\'; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + break; + } + n = 0; + for (;;) + { + switch (c) + { + default: + phase3_ungetc (c); + return n; + break; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + n = n * 16 + c - '0'; + break; + + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + n = n * 16 + 10 + c - 'A'; + break; + + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + n = n * 16 + 10 + c - 'a'; + break; + } + c = phase3_getc (); + } + return n; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + n = 0; + for (j = 0; j < 3; ++j) + { + n = n * 8 + c - '0'; + c = phase3_getc (); + switch (c) + { + default: + break; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + continue; + } + break; + } + phase3_ungetc (c); + return n; + } +} + + +static void +phase7_ungetc (int c) +{ + phase3_ungetc (c); +} + + +/* Free the memory pointed to by a 'struct token_ty'. */ +static inline void +free_token (token_ty *tp) +{ + if (tp->type == token_type_name || tp->type == token_type_string_literal) + free (tp->string); +} + + +/* 5. Parse each resulting logical line as preprocessing tokens and + white space. Preprocessing tokens and C tokens don't always match. */ + +/* Maximum used guaranteed to be < 4. */ +static token_ty phase5_pushback[4]; +static int phase5_pushback_length; + + +static void +phase5_get (token_ty *tp) +{ + static char *buffer; + static int bufmax; + int bufpos; + int c; + + if (phase5_pushback_length) + { + *tp = phase5_pushback[--phase5_pushback_length]; + return; + } + tp->string = NULL; + tp->number = 0; + tp->line_number = line_number; + c = phase4_getc (); + switch (c) + { + case EOF: + tp->type = token_type_eof; + return; + + case '\n': + tp->type = token_type_eoln; + return; + + case ' ': + case '\f': + case '\t': + for (;;) + { + c = phase4_getc (); + switch (c) + { + case ' ': + case '\f': + case '\t': + continue; + + default: + phase4_ungetc (c); + break; + } + break; + } + tp->type = token_type_white_space; + return; + + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': + case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': + case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': + case 'V': case 'W': case 'X': case 'Y': case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': + case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': + case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': + case 'v': case 'w': case 'x': case 'y': case 'z': + bufpos = 0; + for (;;) + { + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + c = phase4_getc (); + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + continue; + + default: + phase4_ungetc (c); + break; + } + break; + } + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = 0; + tp->string = xstrdup (buffer); + tp->type = token_type_name; + return; + + case '.': + c = phase4_getc (); + phase4_ungetc (c); + switch (c) + { + default: + tp->type = token_type_symbol; + return; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + c = '.'; + break; + } + /* FALLTHROUGH */ + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + /* The preprocessing number token is more "generous" than the C + number tokens. This is mostly due to token pasting (another + thing we can ignore here). */ + bufpos = 0; + for (;;) + { + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + c = phase4_getc (); + switch (c) + { + case 'e': + case 'E': + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + c = phase4_getc (); + if (c != '+' || c != '-') + { + phase4_ungetc (c); + break; + } + continue; + + case 'A': case 'B': case 'C': case 'D': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case '.': + continue; + + default: + phase4_ungetc (c); + break; + } + break; + } + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = 0; + tp->type = token_type_number; + tp->number = atol (buffer); + return; + + case '\'': + /* We could worry about the 'L' before wide character constants, + but ignoring it has no effect unless one of the keywords is + "L". Just pretend it won't happen. Also, we don't need to + remember the character constant. */ + for (;;) + { + c = phase7_getc (); + if (c == P7_NEWLINE) + { + error_with_progname = false; + error (0, 0, _("%s:%d: warning: unterminated character constant"), + logical_file_name, line_number - 1); + error_with_progname = true; + phase7_ungetc ('\n'); + break; + } + if (c == EOF || c == P7_QUOTE) + break; + } + tp->type = token_type_character_constant; + return; + + case '"': + /* We could worry about the 'L' before wide string constants, + but since gettext's argument is not a wide character string, + let the compiler complain about the argument not matching the + prototype. Just pretend it won't happen. */ + bufpos = 0; + for (;;) + { + c = phase7_getc (); + if (c == P7_NEWLINE) + { + error_with_progname = false; + error (0, 0, _("%s:%d: warning: unterminated string literal"), + logical_file_name, line_number - 1); + error_with_progname = true; + phase7_ungetc ('\n'); + break; + } + if (c == EOF || c == P7_QUOTES) + break; + if (c == P7_QUOTE) + c = '\''; + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + } + if (bufpos >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = 0; + tp->type = token_type_string_literal; + tp->string = xstrdup (buffer); + return; + + case '(': + tp->type = token_type_lparen; + return; + + case ')': + tp->type = token_type_rparen; + return; + + case ',': + tp->type = token_type_comma; + return; + + case '#': + tp->type = token_type_hash; + return; + + default: + /* We could carefully recognize each of the 2 and 3 character + operators, but it is not necessary, as we only need to recognize + gettext invocations. Don't bother. */ + tp->type = token_type_symbol; + return; + } +} + + +static void +phase5_unget (token_ty *tp) +{ + if (tp->type != token_type_eof) + phase5_pushback[phase5_pushback_length++] = *tp; +} + + +/* X. Recognize a leading # symbol. Leave leading hash as a hash, but + turn hash in the middle of a line into a plain symbol token. This + makes the phase 6 easier. */ + +static void +phaseX_get (token_ty *tp) +{ + static bool middle; /* false at the beginning of a line, true otherwise. */ + + phase5_get (tp); + + if (tp->type == token_type_eoln || tp->type == token_type_eof) + middle = false; + else + { + if (middle) + { + /* Turn hash in the middle of a line into a plain symbol token. */ + if (tp->type == token_type_hash) + tp->type = token_type_symbol; + } + else + { + /* When we see leading whitespace followed by a hash sign, + discard the leading white space token. The hash is all + phase 6 is interested in. */ + if (tp->type == token_type_white_space) + { + token_ty next; + + phase5_get (&next); + if (next.type == token_type_hash) + *tp = next; + else + phase5_unget (&next); + } + middle = true; + } + } +} + + +/* 6. Recognize and carry out directives (it also expands macros on + non-directive lines, which we do not do here). The only directive + we care about are the #line and #define directive. We throw all the + others away. */ + +/* Maximum used guaranteed to be < 4. */ +static token_ty phase6_pushback[4]; +static int phase6_pushback_length; + + +static void +phase6_get (token_ty *tp) +{ + static token_ty *buf; + static int bufmax; + int bufpos; + int j; + + if (phase6_pushback_length) + { + *tp = phase6_pushback[--phase6_pushback_length]; + return; + } + for (;;) + { + /* Get the next token. If it is not a '#' at the beginning of a + line (ignoring whitespace), return immediately. */ + phaseX_get (tp); + if (tp->type != token_type_hash) + return; + + /* Accumulate the rest of the directive in a buffer, until the + "define" keyword is seen or until end of line. */ + bufpos = 0; + for (;;) + { + phaseX_get (tp); + if (tp->type == token_type_eoln || tp->type == token_type_eof) + break; + + /* Before the "define" keyword and inside other directives + white space is irrelevant. So just throw it away. */ + if (tp->type != token_type_white_space) + { + /* If it is a #define directive, return immediately, + thus treating the body of the #define directive like + normal input. */ + if (bufpos == 0 + && tp->type == token_type_name + && strcmp (tp->string, "define") == 0) + return; + + /* Accumulate. */ + if (bufpos >= bufmax) + { + bufmax += 100; + buf = xrealloc (buf, bufmax * sizeof (buf[0])); + } + buf[bufpos++] = *tp; + } + } + + /* If it is a #line directive, with no macros to expand, act on + it. Ignore all other directives. */ + if (bufpos >= 3 && buf[0].type == token_type_name + && strcmp (buf[0].string, "line") == 0 + && buf[1].type == token_type_number + && buf[2].type == token_type_string_literal) + { + logical_file_name = xstrdup (buf[2].string); + line_number = buf[1].number; + } + if (bufpos >= 2 && buf[0].type == token_type_number + && buf[1].type == token_type_string_literal) + { + logical_file_name = xstrdup (buf[1].string); + line_number = buf[0].number; + } + + /* Release the storage held by the directive. */ + for (j = 0; j < bufpos; ++j) + free_token (&buf[j]); + + /* We must reset the selected comments. */ + xgettext_comment_reset (); + } +} + + +static void +phase6_unget (token_ty *tp) +{ + if (tp->type != token_type_eof) + phase6_pushback[phase6_pushback_length++] = *tp; +} + + +/* 8a. Convert ISO C 99 section 7.8.1 format string directives to string + literal placeholders. */ + +/* Test for an ISO C 99 section 7.8.1 format string directive. */ +static bool +is_inttypes_macro (const char *name) +{ + /* Syntax: + P R I { d | i | o | u | x | X } + { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR } */ + if (name[0] == 'P' && name[1] == 'R' && name[2] == 'I') + { + name += 3; + if (name[0] == 'd' || name[0] == 'i' || name[0] == 'o' || name[0] == 'u' + || name[0] == 'x' || name[0] == 'X') + { + name += 1; + if (name[0] == 'M' && name[1] == 'A' && name[2] == 'X' + && name[3] == '\0') + return true; + if (name[0] == 'P' && name[1] == 'T' && name[2] == 'R' + && name[3] == '\0') + return true; + if (name[0] == 'L' && name[1] == 'E' && name[2] == 'A' + && name[3] == 'S' && name[4] == 'T') + name += 5; + else if (name[0] == 'F' && name[1] == 'A' && name[2] == 'S' + && name[3] == 'T') + name += 4; + if (name[0] == '8' && name[1] == '\0') + return true; + if (name[0] == '1' && name[1] == '6' && name[2] == '\0') + return true; + if (name[0] == '3' && name[1] == '2' && name[2] == '\0') + return true; + if (name[0] == '6' && name[1] == '4' && name[2] == '\0') + return true; + } + } + return false; +} + +static void +phase8a_get (token_ty *tp) +{ + phase6_get (tp); + if (tp->type == token_type_name && is_inttypes_macro (tp->string)) + { + /* Turn PRIdXXX into "<PRIdXXX>". */ + size_t len = strlen (tp->string); + char *new_string = (char *) xmalloc (len + 3); + new_string[0] = '<'; + memcpy (new_string + 1, tp->string, len); + new_string[len + 1] = '>'; + new_string[len + 2] = '\0'; + free (tp->string); + tp->string = new_string; + tp->type = token_type_string_literal; + } +} + +static void +phase8a_unget (token_ty *tp) +{ + phase6_unget (tp); +} + + +/* 8. Concatenate adjacent string literals to form single string + literals (because we don't expand macros, there are a few things we + will miss). */ + +static void +phase8_get (token_ty *tp) +{ + phase8a_get (tp); + if (tp->type != token_type_string_literal) + return; + for (;;) + { + token_ty tmp; + size_t len; + + phase8a_get (&tmp); + if (tmp.type == token_type_white_space) + continue; + if (tmp.type == token_type_eoln) + continue; + if (tmp.type != token_type_string_literal) + { + phase8a_unget (&tmp); + return; + } + len = strlen (tp->string); + tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1); + strcpy (tp->string + len, tmp.string); + free (tmp.string); + } +} + + +/* 9. Convert the remaining preprocessing tokens to C tokens and + discards any white space from the translation unit. */ + +static void +x_c_lex (xgettext_token_ty *tp) +{ + for (;;) + { + token_ty token; + void *keyword_value; + + phase8_get (&token); + switch (token.type) + { + case token_type_eof: + tp->type = xgettext_token_type_eof; + return; + + case token_type_white_space: + break; + + case token_type_eoln: + /* We have to track the last occurrence of a string. One + mode of xgettext allows to group an extracted message + with a comment for documentation. The rule which states + which comment is assumed to be grouped with the message + says it should immediately precede it. Our + interpretation: between the last line of the comment and + the line in which the keyword is found must be no line + with non-white space tokens. */ + ++newline_count; + if (last_non_comment_line > last_comment_line) + xgettext_comment_reset (); + break; + + case token_type_name: + last_non_comment_line = newline_count; + + if (find_entry (&keywords, token.string, strlen (token.string), + &keyword_value) + == 0) + { + tp->type = xgettext_token_type_keyword; + tp->argnum1 = (int) (long) keyword_value & ((1 << 10) - 1); + tp->argnum2 = (int) (long) keyword_value >> 10; + tp->pos.file_name = logical_file_name; + tp->pos.line_number = token.line_number; + } + else + tp->type = xgettext_token_type_symbol; + free (token.string); + return; + + case token_type_lparen: + last_non_comment_line = newline_count; + + tp->type = xgettext_token_type_lparen; + return; + + case token_type_rparen: + last_non_comment_line = newline_count; + + tp->type = xgettext_token_type_rparen; + return; + + case token_type_comma: + last_non_comment_line = newline_count; + + tp->type = xgettext_token_type_comma; + return; + + case token_type_string_literal: + last_non_comment_line = newline_count; + + tp->type = xgettext_token_type_string_literal; + tp->string = token.string; + tp->pos.file_name = logical_file_name; + tp->pos.line_number = token.line_number; + return; + + default: + last_non_comment_line = newline_count; + + tp->type = xgettext_token_type_symbol; + return; + } + } +} + + +/* ========================= Extracting strings. ========================== */ + +/* The file is broken into tokens. Scan the token stream, looking for + a keyword, followed by a left paren, followed by a string. When we + see this sequence, we have something to remember. We assume we are + looking at a valid C or C++ program, and leave the complaints about + the grammar to the compiler. + + Normal handling: Look for + keyword ( ... msgid ... ) + Plural handling: Look for + keyword ( ... msgid ... msgid_plural ... ) + + We use recursion because the arguments before msgid or between msgid + and msgid_plural can contain subexpressions of the same form. */ + + +/* Extract messages until the next balanced closing parenthesis. + Extracted messages are added to MLP. + When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and, + if also a plural argument shall be extracted, PLURAL_COMMAS > 0, + otherwise PLURAL_COMMAS = 0. + When no specific argument shall be extracted, COMMAS_TO_SKIP < 0. + Return true upon eof, false upon closing parenthesis. */ +static bool +extract_parenthesized (message_list_ty *mlp, + int commas_to_skip, int plural_commas) +{ + /* Remember the message containing the msgid, for msgid_plural. */ + message_ty *plural_mp = NULL; + + /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ + int state; + /* Parameters of the keyword just seen. Defined only in state 1. */ + int next_commas_to_skip = -1; + int next_plural_commas = 0; + + /* Start state is 0. */ + state = 0; + + for (;;) + { + xgettext_token_ty token; + + x_c_lex (&token); + switch (token.type) + { + case xgettext_token_type_keyword: + /* No need to bother if we extract all strings anyway. */ + if (extract_all) + continue; + next_commas_to_skip = token.argnum1 - 1; + next_plural_commas = (token.argnum2 > token.argnum1 + ? token.argnum2 - token.argnum1 : 0); + state = 1; + continue; + + case xgettext_token_type_lparen: + /* No need to recurse if we extract all strings anyway. */ + if (extract_all) + continue; + if (state + ? extract_parenthesized (mlp, next_commas_to_skip, + next_plural_commas) + : extract_parenthesized (mlp, -1, 0)) + return true; + state = 0; + continue; + + case xgettext_token_type_rparen: + /* No need to return if we extract all strings anyway. */ + if (extract_all) + continue; + return false; + + case xgettext_token_type_comma: + /* No need to bother if we extract all strings anyway. */ + if (extract_all) + continue; + if (commas_to_skip >= 0) + { + if (commas_to_skip > 0) + commas_to_skip--; + else + if (plural_mp != NULL && plural_commas > 0) + { + commas_to_skip = plural_commas - 1; + plural_commas = 0; + } + else + commas_to_skip = -1; + } + state = 0; + continue; + + case xgettext_token_type_string_literal: + if (extract_all) + remember_a_message (mlp, token.string, &token.pos); + else + { + if (commas_to_skip == 0) + { + if (plural_mp == NULL) + { + /* Seen an msgid. */ + message_ty *mp = remember_a_message (mlp, token.string, + &token.pos); + if (plural_commas > 0) + plural_mp = mp; + } + else + { + /* Seen an msgid_plural. */ + remember_a_message_plural (plural_mp, token.string, + &token.pos); + plural_mp = NULL; + } + } + else + free (token.string); + state = 0; + } + continue; + + case xgettext_token_type_symbol: + state = 0; + continue; + + case xgettext_token_type_eof: + return true; + + default: + abort (); + } + } +} + + +void +extract_c (FILE *f, + const char *real_filename, const char *logical_filename, + msgdomain_list_ty *mdlp) +{ + message_list_ty *mlp = mdlp->item[0]->messages; + + fp = f; + real_file_name = real_filename; + logical_file_name = xstrdup (logical_filename); + line_number = 1; + + newline_count = 0; + last_comment_line = -1; + last_non_comment_line = -1; + + init_keywords (); + + /* Eat tokens until eof is seen. When extract_parenthesized returns + due to an unbalanced closing parenthesis, just restart it. */ + while (!extract_parenthesized (mlp, -1, 0)) + ; + + /* Close scanner. */ + fp = NULL; + real_file_name = NULL; + logical_file_name = NULL; + line_number = 0; +} |