diff options
Diffstat (limited to 'src/x-librep.c')
-rw-r--r-- | src/x-librep.c | 1151 |
1 files changed, 1151 insertions, 0 deletions
diff --git a/src/x-librep.c b/src/x-librep.c new file mode 100644 index 0000000..0c3773c --- /dev/null +++ b/src/x-librep.c @@ -0,0 +1,1151 @@ +/* xgettext librep backend. + Copyright (C) 2001 Free Software Foundation, Inc. + + This file was written by Bruno Haible <haible@clisp.cons.org>, 2001. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "message.h" +#include "x-librep.h" +#include "xgettext.h" +#include "error.h" +#include "xmalloc.h" +#include "system.h" +#include "libgettext.h" + +#define _(s) gettext(s) + +#if HAVE_C_BACKSLASH_A +# define ALERT_CHAR '\a' +#else +# define ALERT_CHAR '\7' +#endif + + +/* Summary of librep syntax: + - ';' starts a comment until end of line. + - Block comments start with '#|' and end with '|#'. + - Numbers are constituted of an optional prefix (#b, #B for binary, + #o, #O for octal, #d, #D for decimal, #x, #X for hexadecimal, + #e, #E for exact, #i, #I for inexact), an optional sign (+ or -), and + the digits. + - Characters are written as '?' followed by the character, possibly + with an escape sequence, for examples '?a', '?\n', '?\177'. + - Strings are delimited by double quotes. Backslash introduces an escape + sequence. 
The following are understood: '\n', '\r', '\f', '\t', '\a',
     '\\', '\^C', '\012' (octal), '\x12' (hexadecimal).
   - Symbols: can contain meta-characters - whitespace or any from ()[]'";|\' -
     if preceded by backslash or enclosed in |...|.
   - Keywords: written as #:SYMBOL.
   - () delimit lists.
   - [] delimit vectors.
   The reader is implemented in librep-0.14/src/lisp.c.  */


/* Prototypes for local functions.  Needed to ensure compiler checking of
   function argument counts in spite of the K&R C function definition
   syntax used throughout this file.  */
struct token;
struct object;
/* Keyword table setup.  */
static void init_keywords PARAMS ((void));
/* Character-level input with line counting.  */
static int do_getc PARAMS ((void));
static void do_ungetc PARAMS ((int c));
/* Growable character buffers for tokens.  */
static inline void init_token PARAMS ((struct token *tp));
static inline void free_token PARAMS ((struct token *tp));
static inline void grow_token PARAMS ((struct token *tp));
static bool read_token PARAMS ((struct token *tp, const int *first));
/* Accumulation of source comments.  */
static inline void comment_start PARAMS ((void));
static inline void comment_add PARAMS ((int c));
static inline void comment_line_end PARAMS ((size_t chars_to_remove));
/* Object-level reader.  */
static inline void free_object PARAMS ((struct object *op));
static char * string_of_object PARAMS ((const struct object *op));
static int do_getc_escaped PARAMS ((int c));
static void read_object PARAMS ((struct object *op));


/* ====================== Keyword set customization.  ====================== */

/* If true extract all strings.
*/ +static bool extract_all = false; + +static hash_table keywords; +static bool default_keywords = true; + + +void +x_librep_extract_all () +{ + extract_all = true; +} + + +void +x_librep_keyword (name) + const char *name; +{ + if (name == NULL) + default_keywords = false; + else + { + const char *end; + int argnum1; + int argnum2; + const char *colon; + + if (keywords.table == NULL) + init_hash (&keywords, 100); + + split_keywordspec (name, &end, &argnum1, &argnum2); + + /* The characters between name and end should form a valid Lisp + symbol. */ + colon = strchr (name, ':'); + if (colon == NULL || colon >= end) + { + if (argnum1 == 0) + argnum1 = 1; + insert_entry (&keywords, name, end - name, + (void *) (long) (argnum1 + (argnum2 << 10))); + } + } +} + +/* Finish initializing the keywords hash table. + Called after argument processing, before each file is processed. */ +static void +init_keywords () +{ + if (default_keywords) + { + x_librep_keyword ("_"); + default_keywords = false; + } +} + + +/* ======================== Reading of characters. ======================== */ + +/* Real filename, used in error messages about the input file. */ +static const char *real_file_name; + +/* Logical filename and line number, used to label the extracted messages. */ +static char *logical_file_name; +static int line_number; + +/* The input file stream. */ +static FILE *fp; + + +/* Fetch the next character from the input file. */ +static int +do_getc () +{ + int c = getc (fp); + + if (c == EOF) + { + if (ferror (fp)) + error (EXIT_FAILURE, errno, _("\ +error while reading \"%s\""), real_file_name); + } + else if (c == '\n') + line_number++; + + return c; +} + +/* Put back the last fetched character, not EOF. */ +static void +do_ungetc (c) + int c; +{ + if (c == '\n') + line_number--; + ungetc (c, fp); +} + + +/* ========================== Reading of tokens. ========================== */ + + +/* A token consists of a sequence of characters. 
*/ +struct token +{ + int allocated; /* number of allocated 'token_char's */ + int charcount; /* number of used 'token_char's */ + char *chars; /* the token's constituents */ +}; + +/* Initialize a 'struct token'. */ +static inline void +init_token (tp) + struct token *tp; +{ + tp->allocated = 10; + tp->chars = (char *) xmalloc (tp->allocated * sizeof (char)); + tp->charcount = 0; +} + +/* Free the memory pointed to by a 'struct token'. */ +static inline void +free_token (tp) + struct token *tp; +{ + free (tp->chars); +} + +/* Ensure there is enough room in the token for one more character. */ +static inline void +grow_token (tp) + struct token *tp; +{ + if (tp->charcount == tp->allocated) + { + tp->allocated *= 2; + tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char)); + } +} + +/* Read the next token. If 'first' is given, it points to the first + character, which has already been read. Returns true for a symbol, + false for a number. */ +static bool +read_token (tp, first) + struct token *tp; + const int *first; +{ + int c; + /* Variables for speculative number parsing: */ + int radix = -1; + int nfirst = 0; + bool exact = true; + bool rational = false; + bool exponent = false; + bool had_sign = false; + bool expecting_prefix = false; + + init_token (tp); + + if (first) + c = *first; + else + c = do_getc (); + + for (;; c = do_getc ()) + { + switch (c) + { + case EOF: + goto done; + + case ' ': case '\t': case '\n': case '\f': case '\r': + case '(': case ')': case '[': case ']': + case '\'': case '"': case ';': case ',': case '`': + goto done; + + case '\\': + radix = 0; + c = do_getc (); + if (c == EOF) + /* Invalid, but be tolerant. 
*/ + break; + grow_token (tp); + tp->chars[tp->charcount++] = c; + break; + + case '|': + radix = 0; + for (;;) + { + c = do_getc (); + if (c == EOF || c == '|') + break; + grow_token (tp); + tp->chars[tp->charcount++] = c; + } + break; + + default: + if (radix != 0) + { + if (expecting_prefix) + { + switch (c) + { + case 'B': case 'b': + radix = 2; + break; + case 'O': case 'o': + radix = 8; + break; + case 'D': case 'd': + radix = 10; + break; + case 'X': case 'x': + radix = 16; + break; + case 'E': case 'e': + case 'I': case 'i': + break; + default: + radix = 0; + break; + } + expecting_prefix = false; + nfirst = tp->charcount + 1; + } + else if (tp->charcount == nfirst + && (c == '+' || c == '-' || c == '#')) + { + if (c == '#') + { + if (had_sign) + radix = 0; + else + expecting_prefix = true; + } + else + had_sign = true; + nfirst = tp->charcount + 1; + } + else + { + switch (radix) + { + case -1: + if (c == '.') + { + radix = 10; + exact = false; + } + else if (!(c >= '0' && c <= '9')) + radix = 0; + else if (c == '0') + radix = 1; + else + radix = 10; + break; + + case 1: + switch (c) + { + case 'X': case 'x': + radix = 16; + nfirst = tp->charcount + 1; + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + radix = 8; + nfirst = tp->charcount; + break; + case '.': case 'E': case 'e': + radix = 10; + exact = false; + break; + case '/': + radix = 10; + rational = true; + break; + default: + radix = 0; + break; + } + break; + + default: + switch (c) + { + case '.': + if (exact && radix == 10 && !rational) + exact = false; + else + radix = 0; + break; + case '/': + if (exact && !rational) + rational = true; + else + radix = 0; + break; + case 'E': case 'e': + if (radix == 10) + { + if (!rational && !exponent) + { + exponent = true; + exact = false; + } + else + radix = 0; + break; + } + /*FALLTHROUGH*/ + default: + if (exponent && (c == '+' || c == '-')) + break; + if ((radix <= 10 + && !(c >= '0' && c <= '0' + radix - 
1)) + || (radix == 16 && !isxdigit (c))) + radix = 0; + break; + } + break; + } + } + } + else + { + if (c == '#') + goto done; + } + grow_token (tp); + tp->chars[tp->charcount++] = c; + } + } + done: + if (c != EOF) + do_ungetc (c); + if (radix > 0 && nfirst < tp->charcount) + return false; /* number */ + else + return true; /* symbol */ +} + + +/* ========================= Accumulating comments ========================= */ + + +static char *buffer; +static size_t bufmax; +size_t buflen; + +static inline void +comment_start () +{ + buflen = 0; +} + +static inline void +comment_add (c) + int c; +{ + if (buflen >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen++] = c; +} + +static inline void +comment_line_end (chars_to_remove) + size_t chars_to_remove; +{ + buflen -= chars_to_remove; + while (buflen >= 1 + && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) + --buflen; + if (chars_to_remove == 0 && buflen >= bufmax) + { + bufmax += 100; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen] = '\0'; + xgettext_comment_add (buffer); +} + + +/* These are for tracking whether comments count as immediately before + keyword. */ +static int last_comment_line; +static int last_non_comment_line; + + +/* ========================= Accumulating messages ========================= */ + + +static message_list_ty *mlp; + + +/* ============== Reading of objects. See CLHS 2 "Syntax". ============== */ + + +/* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings. + Other objects need not to be represented precisely. */ +enum object_type +{ + t_symbol, /* symbol */ + t_string, /* string */ + t_other, /* other kind of real object */ + t_dot, /* '.' 
pseudo object */ + t_close, /* ')' or ']' pseudo object */ + t_eof /* EOF marker */ +}; + +struct object +{ + enum object_type type; + struct token *token; /* for t_symbol and t_string */ + int line_number_at_start; /* for t_string */ +}; + +/* Free the memory pointed to by a 'struct object'. */ +static inline void +free_object (op) + struct object *op; +{ + if (op->type == t_symbol || op->type == t_string) + { + free_token (op->token); + free (op->token); + } +} + +/* Convert a t_string token to a char*. */ +static char * +string_of_object (op) + const struct object *op; +{ + char *str; + const char *p; + char *q; + int n; + + if (!(op->type == t_symbol || op->type == t_string)) + abort (); + n = op->token->charcount; + str = (char *) xmalloc (n + 1); + q = str; + for (p = op->token->chars; n > 0; n--) + *q++ = *p++; + *q = '\0'; + return str; +} + +/* Returns the character represented by an escape sequence. */ +static int +do_getc_escaped (c) + int c; +{ + switch (c) + { + case 'n': + return '\n'; + case 'r': + return '\r'; + case 'f': + return '\f'; + case 't': + return '\t'; + case 'v': + return '\v'; + case 'a': + return ALERT_CHAR; + case '^': + c = do_getc (); + if (c == EOF) + return EOF; + return c & 0x1f; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + { + int n = c - '0'; + + c = do_getc (); + if (c != EOF) + { + if (c >= '0' && c <= '7') + { + n = (n << 3) + (c - '0'); + c = do_getc (); + if (c != EOF) + { + if (c >= '0' && c <= '7') + n = (n << 3) + (c - '0'); + else + do_ungetc (c); + } + } + else + do_ungetc (c); + } + return (unsigned char) n; + } + case 'x': + { + int n = 0; + + for (;;) + { + c = do_getc (); + if (c == EOF) + break; + else if (c >= '0' && c <= '9') + n = (n << 4) + (c - '0'); + else if (c >= 'A' && c <= 'F') + n = (n << 4) + (c - 'A' + 10); + else if (c >= 'a' && c <= 'f') + n = (n << 4) + (c - 'a' + 10); + else + { + do_ungetc (c); + break; + } + } + return (unsigned char) n; + } + default: + 
return c; + } +} + +/* Read the next object. */ +static void +read_object (op) + struct object *op; +{ + for (;;) + { + int c; + + c = do_getc (); + + switch (c) + { + case EOF: + op->type = t_eof; + return; + + case '\n': + /* Comments assumed to be grouped with a message must immediately + precede it, with no non-whitespace token on a line between + both. */ + if (last_non_comment_line > last_comment_line) + xgettext_comment_reset (); + continue; + + case ' ': case '\t': case '\f': case '\r': + continue; + + case '(': + { + int arg = 0; /* Current argument number. */ + int argnum1 = 0; /* First string position. */ + int argnum2 = 0; /* Plural string position. */ + message_ty *plural_mp = NULL; /* Remember the msgid. */ + + for (;; arg++) + { + struct object inner; + + read_object (&inner); + + /* Recognize end of list. */ + if (inner.type == t_close) + { + op->type = t_other; + /* Don't bother converting "()" to "NIL". */ + last_non_comment_line = line_number; + return; + } + + /* Dots are not allowed in every position. + But be tolerant. */ + + /* EOF inside list is illegal. But be tolerant. */ + if (inner.type == t_eof) + break; + + /* No need to bother if we extract all strings anyway. */ + if (!extract_all) + { + if (arg == 0) + { + /* This is the function position. */ + if (inner.type == t_symbol) + { + char *symbol_name = string_of_object (&inner); + void *keyword_value; + + if (find_entry (&keywords, + symbol_name, strlen (symbol_name), + &keyword_value) + == 0) + { + argnum1 = (int) (long) keyword_value & ((1 << 10) - 1); + argnum2 = (int) (long) keyword_value >> 10; + } + + free (symbol_name); + } + } + else + { + /* These are the argument positions. + Extract a string if we have reached the right + argument position. 
*/ + if (arg == argnum1) + { + if (inner.type == t_string) + { + lex_pos_ty pos; + message_ty *mp; + + pos.file_name = logical_file_name; + pos.line_number = inner.line_number_at_start; + mp = remember_a_message (mlp, string_of_object (&inner), &pos); + if (argnum2 > 0) + plural_mp = mp; + } + } + else if (arg == argnum2) + { + if (inner.type == t_string && plural_mp != NULL) + { + lex_pos_ty pos; + + pos.file_name = logical_file_name; + pos.line_number = inner.line_number_at_start; + remember_a_message_plural (plural_mp, string_of_object (&inner), &pos); + } + } + } + } + + free_object (&inner); + } + } + op->type = t_other; + last_non_comment_line = line_number; + return; + + case '[': + { + for (;;) + { + struct object inner; + + read_object (&inner); + + /* Recognize end of vector. */ + if (inner.type == t_close) + { + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + /* Dots are not allowed. But be tolerant. */ + + /* EOF inside vector is illegal. But be tolerant. */ + if (inner.type == t_eof) + break; + + free_object (&inner); + } + } + op->type = t_other; + last_non_comment_line = line_number; + return; + + case ')': case ']': + /* Tell the caller about the end of list or vector. + Unmatched closing parenthesis is illegal. But be tolerant. */ + op->type = t_close; + last_non_comment_line = line_number; + return; + + case ',': + { + int c = do_getc (); + /* The ,@ handling inside lists is wrong anyway, because + ,@form expands to an unknown number of elements. */ + if (c != EOF && c != '@') + do_ungetc (c); + } + /*FALLTHROUGH*/ + case '\'': + case '`': + { + struct object inner; + + read_object (&inner); + + /* Dots and EOF are not allowed here. But be tolerant. 
*/ + + free_object (&inner); + + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case ';': + { + bool all_semicolons = true; + + last_comment_line = line_number; + comment_start (); + for (;;) + { + int c = do_getc (); + if (c == EOF || c == '\n' || c == '\f' || c == '\r') + break; + if (c != ';') + all_semicolons = false; + if (!all_semicolons) + comment_add (c); + } + comment_line_end (0); + continue; + } + + case '"': + { + op->token = (struct token *) xmalloc (sizeof (struct token)); + init_token (op->token); + op->line_number_at_start = line_number; + for (;;) + { + int c = do_getc (); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + break; + if (c == '"') + break; + if (c == '\\') + { + c = do_getc (); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + break; + if (c == '\n') + /* Ignore escaped newline. */ + ; + else + { + c = do_getc_escaped (c); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + break; + grow_token (op->token); + op->token->chars[op->token->charcount++] = c; + } + } + else + { + grow_token (op->token); + op->token->chars[op->token->charcount++] = c; + } + } + op->type = t_string; + + if (extract_all) + { + lex_pos_ty pos; + + pos.file_name = logical_file_name; + pos.line_number = op->line_number_at_start; + remember_a_message (mlp, string_of_object (op), &pos); + } + last_non_comment_line = line_number; + return; + } + + case '?': + c = do_getc (); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + ; + else if (c == '\\') + { + c = do_getc (); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + ; + else + { + c = do_getc_escaped (c); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + ; + } + } + op->type = t_other; + last_non_comment_line = line_number; + return; + + case '#': + /* Dispatch macro handling. */ + c = do_getc (); + if (c == EOF) + /* Invalid input. 
Be tolerant, no error message. */ + { + op->type = t_other; + return; + } + + switch (c) + { + case '!': + if (ftell (fp) == 2) + /* Skip comment until !# */ + { + c = do_getc (); + for (;;) + { + if (c == EOF) + break; + if (c == '!') + { + c = do_getc (); + if (c == EOF || c == '#') + break; + } + else + c = do_getc (); + } + if (c == EOF) + { + /* EOF not allowed here. But be tolerant. */ + op->type = t_eof; + return; + } + continue; + } + /*FALLTHROUGH*/ + case '\'': + case '[': + case '(': + case ':': + { + struct object inner; + read_object (&inner); + /* Dots and EOF are not allowed here. + But be tolerant. */ + free_object (&inner); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case '|': + { + int depth = 0; + + comment_start (); + c = do_getc (); + for (;;) + { + if (c == EOF) + break; + if (c == '|') + { + c = do_getc (); + if (c == EOF) + break; + if (c == '#') + { + if (depth == 0) + { + comment_line_end (0); + break; + } + depth--; + comment_add ('|'); + comment_add ('#'); + c = do_getc (); + } + else + comment_add ('|'); + } + else if (c == '#') + { + c = do_getc (); + if (c == EOF) + break; + comment_add ('#'); + if (c == '|') + { + depth++; + comment_add ('|'); + c = do_getc (); + } + } + else + { + /* We skip all leading white space. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + if (c == '\n') + { + comment_line_end (1); + comment_start (); + } + c = do_getc (); + } + } + if (c == EOF) + { + /* EOF not allowed here. But be tolerant. 
*/ + op->type = t_eof; + return; + } + last_comment_line = line_number; + continue; + } + + case '\\': + { + struct token token; + int first = '\\'; + read_token (&token, &first); + free_token (&token); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case 'T': case 't': + case 'F': case 'f': + op->type = t_other; + last_non_comment_line = line_number; + return; + + case 'B': case 'b': + case 'O': case 'o': + case 'D': case 'd': + case 'X': case 'x': + case 'E': case 'e': + case 'I': case 'i': + { + struct token token; + do_ungetc (c); + c = '#'; + read_token (&token, &c); + free_token (&token); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + default: + /* Invalid input. Be tolerant, no error message. */ + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + /*NOTREACHED*/ + abort (); + + default: + /* Read a token. */ + { + bool symbol; + + op->token = (struct token *) xmalloc (sizeof (struct token)); + symbol = read_token (op->token, &c); + if (op->token->charcount == 1 && op->token->chars[0] == '.') + { + free_token (op->token); + free (op->token); + op->type = t_dot; + last_non_comment_line = line_number; + return; + } + if (!symbol) + { + free_token (op->token); + free (op->token); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + /* Distinguish between "foo" and "foo#bar". 
*/ + c = do_getc (); + if (c == '#') + { + struct token second_token; + + free_token (op->token); + free (op->token); + read_token (&second_token, NULL); + free_token (&second_token); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + else + { + if (c != EOF) + do_ungetc (c); + op->type = t_symbol; + last_non_comment_line = line_number; + return; + } + } + } + } +} + + +void +extract_librep (f, real_filename, logical_filename, mdlp) + FILE *f; + const char *real_filename; + const char *logical_filename; + msgdomain_list_ty *mdlp; +{ + mlp = mdlp->item[0]->messages; + + fp = f; + real_file_name = real_filename; + logical_file_name = xstrdup (logical_filename); + line_number = 1; + + last_comment_line = -1; + last_non_comment_line = -1; + + init_keywords (); + + /* Eat tokens until eof is seen. When read_object returns + due to an unbalanced closing parenthesis, just restart it. */ + do + { + struct object toplevel_object; + + read_object (&toplevel_object); + + if (toplevel_object.type == t_eof) + break; + } + while (!feof (fp)); + + /* Close scanner. */ + fp = NULL; + real_file_name = NULL; + logical_file_name = NULL; + line_number = 0; +} |