/* xgettext Tcl backend. Copyright (C) 2002 Free Software Foundation, Inc. This file was written by Bruno Haible , 2002. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifdef HAVE_CONFIG_H # include "config.h" #endif #include #include #include #include #include #include #include #include "message.h" #include "x-tcl.h" #include "xgettext.h" #include "error.h" #include "xmalloc.h" #include "exit.h" #include "hash.h" #include "c-ctype.h" #include "po-charset.h" #include "msgl-ascii.h" #include "msgl-iconv.h" #include "ucs4-utf8.h" #include "gettext.h" #define _(s) gettext(s) /* The Tcl syntax is defined in the Tcl.n manual page. Summary of Tcl syntax: Like sh syntax, except that `...` is replaced with [...]. In detail: - In a preprocessing pass, backslash-newline-anywhitespace is replaced with single space. - Input is broken into words, which are then subject to command substitution [...] , variable substitution $var, backslash substitution \escape. - Strings are enclosed in "..."; command substitution, variable substitution and backslash substitutions are performed here as well. - {...} is a string without substitutions. - The list of resulting words is split into commands by semicolon and newline. - '#' at the beginning of a command introduces a comment until end of line. The parser is implemented in tcl8.3.3/generic/tclParse.c. */ /* ====================== Keyword set customization. ====================== */ /* If true extract all strings. */ static bool extract_all = false; static hash_table keywords; static bool default_keywords = true; void x_tcl_extract_all () { extract_all = true; } void x_tcl_keyword (const char *name) { if (name == NULL) default_keywords = false; else { const char *end; int argnum1; int argnum2; if (keywords.table == NULL) init_hash (&keywords, 100); split_keywordspec (name, &end, &argnum1, &argnum2); /* The characters between name and end should form a valid Tcl function name. A leading "::" is redundant. */ if (end - name >= 2 && name[0] == ':' && name[1] == ':') name += 2; if (argnum1 == 0) argnum1 = 1; insert_entry (&keywords, name, end - name, (void *) (long) (argnum1 + (argnum2 << 10))); } } /* Finish initializing the keywords hash table. Called after argument processing, before each file is processed. */ static void init_keywords () { if (default_keywords) { x_tcl_keyword ("::msgcat::mc"); default_keywords = false; } } /* ======================== Reading of characters. ======================== */ /* Real filename, used in error messages about the input file. */ static const char *real_file_name; /* Logical filename and line number, used to label the extracted messages. */ static char *logical_file_name; static int line_number; /* The input file stream. */ static FILE *fp; /* Fetch the next character from the input file. */ static int do_getc () { int c = getc (fp); if (c == EOF) { if (ferror (fp)) error (EXIT_FAILURE, errno, _("\ error while reading \"%s\""), real_file_name); } else if (c == '\n') line_number++; return c; } /* Put back the last fetched character, not EOF. */ static void do_ungetc (int c) { if (c == '\n') line_number--; ungetc (c, fp); } /* Combine backslash followed by newline and additional whitespace to a single space. Cope with potentially 2 characters of pushback. */ /* An int that becomes a space when casted to 'unsigned char'. */ #define BS_NL (UCHAR_MAX + 1 + ' ') /* Maximum used guaranteed to be < 4. */ static int phase1_pushback[4]; static int phase1_pushback_length; static int phase1_getc () { int c; if (phase1_pushback_length) { c = phase1_pushback[--phase1_pushback_length]; if (c == '\n' || c == BS_NL) ++line_number; return c; } c = do_getc (); if (c != '\\') return c; c = do_getc (); if (c != '\n') { if (c != EOF) do_ungetc (c); return '\\'; } for (;;) { c = do_getc (); if (!(c == ' ' || c == '\t')) break; } if (c != EOF) do_ungetc (c); return BS_NL; } static void phase1_ungetc (int c) { switch (c) { case EOF: break; case '\n': case BS_NL: --line_number; /* FALLTHROUGH */ default: phase1_pushback[phase1_pushback_length++] = c; break; } } /* Keep track of brace nesting depth. When a word starts with an opening brace, a character group begins that ends with the corresponding closing brace. In theory these character groups are string literals, but they are used by so many Tcl primitives (proc, if, ...) as representing command lists, that we treat them as command lists. */ /* An int that becomes a closing brace when casted to 'unsigned char'. */ #define CL_BRACE (UCHAR_MAX + 1 + '}') /* Maximum used guaranteed to be < 4. */ static int phase2_pushback[4]; static int phase2_pushback_length; /* Brace nesting depth inside the current character group. */ static int brace_depth; static int phase2_push () { int previous_depth = brace_depth; brace_depth = 1; return previous_depth; } static void phase2_pop (int previous_depth) { brace_depth = previous_depth; } static int phase2_getc () { int c; if (phase2_pushback_length) { c = phase2_pushback[--phase2_pushback_length]; if (c == '\n' || c == BS_NL) ++line_number; else if (c == '{') ++brace_depth; else if (c == '}') --brace_depth; return c; } c = phase1_getc (); if (c == '{') ++brace_depth; else if (c == '}') { if (--brace_depth == 0) c = CL_BRACE; } return c; } static void phase2_ungetc (int c) { if (c != EOF) { switch (c) { case '\n': case BS_NL: --line_number; break; case '{': --brace_depth; break; case '}': ++brace_depth; break; } phase2_pushback[phase2_pushback_length++] = c; } } /* ========================== Reading of tokens. ========================== */ /* A token consists of a sequence of characters. */ struct token { int allocated; /* number of allocated 'token_char's */ int charcount; /* number of used 'token_char's */ char *chars; /* the token's constituents */ }; /* Initialize a 'struct token'. */ static inline void init_token (struct token *tp) { tp->allocated = 10; tp->chars = (char *) xmalloc (tp->allocated * sizeof (char)); tp->charcount = 0; } /* Free the memory pointed to by a 'struct token'. */ static inline void free_token (struct token *tp) { free (tp->chars); } /* Ensure there is enough room in the token for one more character. */ static inline void grow_token (struct token *tp) { if (tp->charcount == tp->allocated) { tp->allocated *= 2; tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char)); } } /* ========================= Accumulating comments ========================= */ static char *buffer; static size_t bufmax; static size_t buflen; static inline void comment_start () { buflen = 0; } static inline void comment_add (int c) { if (buflen >= bufmax) { bufmax += 100; buffer = xrealloc (buffer, bufmax); } buffer[buflen++] = c; } static inline void comment_line_end () { while (buflen >= 1 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) --buflen; if (buflen >= bufmax) { bufmax += 100; buffer = xrealloc (buffer, bufmax); } buffer[buflen] = '\0'; xgettext_comment_add (buffer); } /* These are for tracking whether comments count as immediately before keyword. */ static int last_comment_line; static int last_non_comment_line; /* ========================= Accumulating messages ========================= */ static message_list_ty *mlp; /* ========================== Reading of commands ========================== */ /* We are only interested in constant strings (e.g. "msgcat::mc" or other string literals). Other words need not to be represented precisely. */ enum word_type { t_string, /* constant string */ t_other, /* other string */ t_separator, /* command separator: semicolon or newline */ t_bracket, /* ']' pseudo word */ t_brace, /* '}' pseudo word */ t_eof /* EOF marker */ }; struct word { enum word_type type; struct token *token; /* for t_string */ int line_number_at_start; /* for t_string */ }; /* Free the memory pointed to by a 'struct word'. */ static inline void free_word (struct word *wp) { if (wp->type == t_string) { free_token (wp->token); free (wp->token); } } /* Convert a t_string token to a char*. */ static char * string_of_word (const struct word *wp) { char *str; int n; if (!(wp->type == t_string)) abort (); n = wp->token->charcount; str = (char *) xmalloc (n + 1); memcpy (str, wp->token->chars, n); str[n] = '\0'; return str; } /* Read an escape sequence. The value is an ISO-8859-1 character (in the range 0x00..0xff) or a Unicode character (in the range 0x0000..0xffff). */ static int do_getc_escaped () { int c; c = phase1_getc (); switch (c) { case EOF: return '\\'; case 'a': return '\a'; case 'b': return '\b'; case 'f': return '\f'; case 'n': return '\n'; case 'r': return '\r'; case 't': return '\t'; case 'v': return '\v'; case 'x': { int n = 0; unsigned int i; for (i = 0;; i++) { c = phase1_getc (); if (c == EOF || !c_isxdigit ((unsigned char) c)) break; if (c >= '0' && c <= '9') n = (n << 4) + (c - '0'); else if (c >= 'A' && c <= 'F') n = (n << 4) + (c - 'A' + 10); else if (c >= 'a' && c <= 'f') n = (n << 4) + (c - 'a' + 10); } phase1_ungetc (c); return (i > 0 ? (unsigned char) n : 'x'); } case 'u': { int n = 0; unsigned int i; for (i = 0; i < 4; i++) { c = phase1_getc (); if (c == EOF || !c_isxdigit ((unsigned char) c)) break; if (c >= '0' && c <= '9') n = (n << 4) + (c - '0'); else if (c >= 'A' && c <= 'F') n = (n << 4) + (c - 'A' + 10); else if (c >= 'a' && c <= 'f') n = (n << 4) + (c - 'a' + 10); } phase1_ungetc (c); return (i > 0 ? n : 'u'); } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { int n = c - '0'; c = phase1_getc (); if (c != EOF) { if (c >= '0' && c <= '7') { n = (n << 3) + (c - '0'); c = phase1_getc (); if (c != EOF) { if (c >= '0' && c <= '7') n = (n << 3) + (c - '0'); else phase1_ungetc (c); } } else phase1_ungetc (c); } return (unsigned char) n; } default: /* Note: If c is non-ASCII, Tcl's behaviour is undefined here. */ return (unsigned char) c; } } enum terminator { te_space_separator, /* looking for space semicolon newline */ te_space_separator_bracket, /* looking for space semicolon newline ']' */ te_paren, /* looking for ')' */ te_quote /* looking for '"' */ }; /* Forward declaration of local functions. */ static enum word_type read_command_list (int looking_for); /* Accumulate tokens into the given word. 'looking_for' denotes a parse terminator combination. */ static int accumulate_word (struct word *wp, enum terminator looking_for) { int c; for (;;) { c = phase2_getc (); if (c == EOF || c == CL_BRACE) return c; if ((looking_for == te_space_separator || looking_for == te_space_separator_bracket) && (c == ' ' || c == BS_NL || c == '\t' || c == '\v' || c == '\f' || c == '\r' || c == ';' || c == '\n')) return c; if (looking_for == te_space_separator_bracket && c == ']') return c; if (looking_for == te_paren && c == ')') return c; if (looking_for == te_quote && c == '"') return c; if (c == '$') { /* Distinguish $varname, ${varname} and lone $. */ c = phase2_getc (); if (c == '{') { /* ${varname} */ do c = phase2_getc (); while (c != EOF && c != '}'); wp->type = t_other; } else { bool nonempty = false; for (; c != EOF && c != CL_BRACE; c = phase2_getc ()) { if (c_isalnum ((unsigned char) c) || (c == '_')) { nonempty = true; continue; } if (c == ':') { c = phase2_getc (); if (c == ':') { do c = phase2_getc (); while (c == ':'); phase2_ungetc (c); nonempty = true; continue; } phase2_ungetc (c); c = ':'; } break; } if (c == '(') { /* $varname(index) */ struct word index_word; index_word.type = t_other; c = accumulate_word (&index_word, te_paren); if (c != EOF && c != ')') phase2_ungetc (c); wp->type = t_other; } else { phase2_ungetc (c); if (nonempty) { /* $varname */ wp->type = t_other; } else { /* lone $ */ if (wp->type == t_string) { grow_token (wp->token); wp->token->chars[wp->token->charcount++] = '$'; } } } } } else if (c == '[') { read_command_list (']'); wp->type = t_other; } else if (c == '\\') { unsigned int uc; unsigned char utf8buf[6]; int count; int i; uc = do_getc_escaped (); assert (uc < 0x10000); count = u8_uctomb (utf8buf, uc, 6); assert (count > 0); if (wp->type == t_string) for (i = 0; i < count; i++) { grow_token (wp->token); wp->token->chars[wp->token->charcount++] = utf8buf[i]; } } else { if (wp->type == t_string) { grow_token (wp->token); wp->token->chars[wp->token->charcount++] = (unsigned char) c; } } } } /* Read the next word. 'looking_for' denotes a parse terminator, either ']' or '\0'. */ static void read_word (struct word *wp, int looking_for) { int c; do c = phase2_getc (); while (c == ' ' || c == BS_NL); if (c == EOF) { wp->type = t_eof; return; } if (c == CL_BRACE) { wp->type = t_brace; last_non_comment_line = line_number; return; } if (c == '\n') { /* Comments assumed to be grouped with a message must immediately precede it, with no non-whitespace token on a line between both. */ if (last_non_comment_line > last_comment_line) xgettext_comment_reset (); wp->type = t_separator; return; } if (c == ';') { wp->type = t_separator; last_non_comment_line = line_number; return; } if (looking_for == ']' && c == ']') { wp->type = t_bracket; last_non_comment_line = line_number; return; } if (c == '{') { int previous_depth; enum word_type terminator; /* Start a new nested character group, which lasts until the next balanced '}' (ignoring \} things). */ previous_depth = phase2_push () - 1; /* Interpret it as a command list. */ terminator = read_command_list ('\0'); if (terminator == t_brace) phase2_pop (previous_depth); wp->type = t_other; last_non_comment_line = line_number; return; } wp->type = t_string; wp->token = (struct token *) xmalloc (sizeof (struct token)); init_token (wp->token); wp->line_number_at_start = line_number; if (c == '"') { c = accumulate_word (wp, te_quote); if (c != EOF && c != '"') phase2_ungetc (c); } else { phase2_ungetc (c); c = accumulate_word (wp, looking_for == ']' ? te_space_separator_bracket : te_space_separator); if (c != EOF) phase2_ungetc (c); } if (wp->type != t_string) { free_token (wp->token); free (wp->token); } last_non_comment_line = line_number; } /* Read the next command. 'looking_for' denotes a parse terminator, either ']' or '\0'. Returns the type of the word that terminated the command: t_separator or t_bracket (only if looking_for is ']') or t_brace or t_eof. */ static enum word_type read_command (int looking_for) { int c; /* Skip whitespace and comments. */ for (;;) { c = phase2_getc (); if (c == ' ' || c == BS_NL || c == '\t' || c == '\v' || c == '\f' || c == '\r') continue; if (c == '#') { /* Skip a comment up to end of line. */ last_comment_line = line_number; comment_start (); for (;;) { c = phase2_getc (); if (c == EOF || c == CL_BRACE || c == '\n') break; comment_add (c); } comment_line_end (); continue; } break; } phase2_ungetc (c); /* Read the words that make up the command. */ { int arg = 0; /* Current argument number. */ int argnum1 = 0; /* First string position. */ int argnum2 = 0; /* Plural string position. */ message_ty *plural_mp = NULL; /* Remember the msgid. */ for (;; arg++) { struct word inner; read_word (&inner, looking_for); /* Recognize end of command. */ if (inner.type == t_separator || inner.type == t_bracket || inner.type == t_brace || inner.type == t_eof) return inner.type; if (extract_all) { if (inner.type == t_string) { lex_pos_ty pos; pos.file_name = logical_file_name; pos.line_number = inner.line_number_at_start; remember_a_message (mlp, string_of_word (&inner), &pos); } } else { if (arg == 0) { /* This is the function position. */ if (inner.type == t_string) { char *function_name = string_of_word (&inner); char *stripped_name; void *keyword_value; /* A leading "::" is redundant. */ stripped_name = function_name; if (function_name[0] == ':' && function_name[1] == ':') stripped_name += 2; if (find_entry (&keywords, stripped_name, strlen (stripped_name), &keyword_value) == 0) { argnum1 = (int) (long) keyword_value & ((1 << 10) - 1); argnum2 = (int) (long) keyword_value >> 10; } free (function_name); } } else { /* These are the argument positions. Extract a string if we have reached the right argument position. */ if (arg == argnum1) { if (inner.type == t_string) { lex_pos_ty pos; message_ty *mp; pos.file_name = logical_file_name; pos.line_number = inner.line_number_at_start; mp = remember_a_message (mlp, string_of_word (&inner), &pos); if (argnum2 > 0) plural_mp = mp; } } else if (arg == argnum2) { if (inner.type == t_string && plural_mp != NULL) { lex_pos_ty pos; pos.file_name = logical_file_name; pos.line_number = inner.line_number_at_start; remember_a_message_plural (plural_mp, string_of_word (&inner), &pos); } } } } free_word (&inner); } } } /* Read a list of commands. 'looking_for' denotes a parse terminator, either ']' or '\0'. Returns the type of the word that terminated the command list: t_bracket (only if looking_for is ']') or t_brace or t_eof. */ static enum word_type read_command_list (int looking_for) { for (;;) { enum word_type terminator; terminator = read_command (looking_for); if (terminator != t_separator) return terminator; } } void extract_tcl (FILE *f, const char *real_filename, const char *logical_filename, msgdomain_list_ty *mdlp) { mlp = mdlp->item[0]->messages; /* We convert our strings to UTF-8 encoding. */ xgettext_current_source_encoding = po_charset_utf8; fp = f; real_file_name = real_filename; logical_file_name = xstrdup (logical_filename); line_number = 1; /* Initially, no brace is open. */ brace_depth = 1000000; last_comment_line = -1; last_non_comment_line = -1; init_keywords (); /* Eat tokens until eof is seen. */ read_command_list ('\0'); fp = NULL; real_file_name = NULL; logical_file_name = NULL; line_number = 0; }