/* xgettext Smalltalk backend. Copyright (C) 2002 Free Software Foundation, Inc. This file was written by Bruno Haible , 2002. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifdef HAVE_CONFIG_H # include "config.h" #endif #include #include #include #include "message.h" #include "x-smalltalk.h" #include "xgettext.h" #include "error.h" #include "xmalloc.h" #include "exit.h" #include "gettext.h" #define _(s) gettext(s) /* The relevant parts of the Smalltalk syntax are: stringliteral ::= string | stringconst | symconst stringconst ::= "#"string string ::= "'"[char]*"'" symconst ::= "#"symbol symbol ::= id | binsel | keysel[keysel]* keysel ::= id":" id ::= letter[letter|digit]* letter ::= "A".."Z" | "a".."z" digit ::= "0".."9" binsel ::= selchar[selchar] selchar ::= "+" | "-" | "*" | "/" | "~" | "|" | "," | "<" | ">" | "=" | "&" | "@" | "?" | "%" | "\" Strings can contain any characters; to include the string delimiter itself, it must be duplicated. Character constants are written "$"char Comments are enclosed within double quotes. In well-formed expressions, {} and [] and () are balanced. */ enum token_type_ty { token_type_eof, token_type_uniq, /* # */ token_type_symbol, /* symbol */ token_type_string_literal, /* string, stringconst, symbolconst */ token_type_other /* misc. operator */ }; typedef enum token_type_ty token_type_ty; typedef struct token_ty token_ty; struct token_ty { token_type_ty type; char *string; /* for token_type_string_literal, token_type_symbol */ int line_number; }; /* ======================== Reading of characters. ======================== */ /* Real filename, used in error messages about the input file. */ static const char *real_file_name; /* Logical filename and line number, used to label the extracted messages. */ static char *logical_file_name; static int line_number; /* The input file stream. */ static FILE *fp; /* 1. line_number handling. */ static int phase1_getc () { int c = getc (fp); if (c == EOF) { if (ferror (fp)) error (EXIT_FAILURE, errno, _("error while reading \"%s\""), real_file_name); return EOF; } if (c == '\n') line_number++; return c; } static void phase1_ungetc (int c) { if (c != EOF) { if (c == '\n') --line_number; ungetc (c, fp); } } /* Accumulating comments. */ static char *buffer; static size_t bufmax; static size_t buflen; static inline void comment_start () { buflen = 0; } static inline void comment_add (int c) { if (buflen >= bufmax) { bufmax += 100; buffer = xrealloc (buffer, bufmax); } buffer[buflen++] = c; } static inline void comment_line_end () { while (buflen >= 1 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) --buflen; if (buflen >= bufmax) { bufmax += 100; buffer = xrealloc (buffer, bufmax); } buffer[buflen] = '\0'; xgettext_comment_add (buffer); } /* These are for tracking whether comments count as immediately before keyword. */ static int last_comment_line; static int last_non_comment_line; /* 2. Combine characters into tokens. Discard comments and whitespace. */ /* Maximum used guaranteed to be < 2. */ static token_ty phase2_pushback[2]; static int phase2_pushback_length; static void phase2_get (token_ty *tp) { static char *buffer; static int bufmax; int bufpos; int c; if (phase2_pushback_length) { *tp = phase2_pushback[--phase2_pushback_length]; return; } tp->string = NULL; for (;;) { tp->line_number = line_number; c = phase1_getc (); switch (c) { case EOF: tp->type = token_type_eof; return; case '"': { /* Comment. */ int lineno; comment_start (); lineno = line_number; for (;;) { c = phase1_getc (); if (c == '"' || c == EOF) break; if (c == '\n') { comment_line_end (); comment_start (); } else { /* We skip all leading white space, but not EOLs. */ if (!(buflen == 0 && (c == ' ' || c == '\t'))) comment_add (c); } } comment_line_end (); last_comment_line = lineno; continue; } case '\n': if (last_non_comment_line > last_comment_line) xgettext_comment_reset (); /* FALLTHROUGH */ case ' ': case '\t': case '\r': /* Ignore whitespace. */ continue; } last_non_comment_line = tp->line_number; switch (c) { case '\'': /* String literal. */ bufpos = 0; for (;;) { c = phase1_getc (); if (c == EOF) break; if (c == '\'') { c = phase1_getc (); if (c != '\'') { phase1_ungetc (c); break; } } if (bufpos >= bufmax) { bufmax += 100; buffer = xrealloc (buffer, bufmax); } buffer[bufpos++] = c; } if (bufpos >= bufmax) { bufmax += 100; buffer = xrealloc (buffer, bufmax); } buffer[bufpos] = 0; tp->type = token_type_string_literal; tp->string = xstrdup (buffer); return; case '+': case '-': case '*': case '/': case '~': case '|': case ',': case '<': case '>': case '=': case '&': case '@': case '?': case '%': case '\\': { char *name; int c2 = phase1_getc (); switch (c2) { case '+': case '-': case '*': case '/': case '~': case '|': case ',': case '<': case '>': case '=': case '&': case '@': case '?': case '%': name = xmalloc (3); name[0] = c; name[1] = c2; name[2] = '\0'; tp->type = token_type_symbol; tp->string = name; return; default: phase1_ungetc (c2); break; } name = xmalloc (2); name[0] = c; name[1] = '\0'; tp->type = token_type_symbol; tp->string = name; return; } case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': /* Recognize id or id":"[id":"]* or id":"[id":"]*id. */ bufpos = 0; for (;;) { if (bufpos >= bufmax) { bufmax += 100; buffer = xrealloc (buffer, bufmax); } buffer[bufpos++] = c; c = phase1_getc (); switch (c) { case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': continue; case ':': if (bufpos >= bufmax) { bufmax += 100; buffer = xrealloc (buffer, bufmax); } buffer[bufpos++] = c; c = phase1_getc (); switch (c) { case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': continue; default: phase1_ungetc (c); break; } break; default: phase1_ungetc (c); break; } break; } if (bufpos >= bufmax) { bufmax += 100; buffer = xrealloc (buffer, bufmax); } buffer[bufpos] = '\0'; tp->string = xstrdup (buffer); tp->type = token_type_symbol; return; case '#': /* Uniquification operator. */ tp->type = token_type_uniq; return; case '$': c = phase1_getc (); tp->type = token_type_other; return; default: tp->type = token_type_other; return; } } } static void phase2_unget (token_ty *tp) { if (tp->type != token_type_eof) phase2_pushback[phase2_pushback_length++] = *tp; } /* 3. Combine "# string_literal" and "# symbol" to a single token. */ static void x_smalltalk_lex (token_ty *tp) { phase2_get (tp); if (tp->type == token_type_uniq) { token_ty token2; phase2_get (&token2); if (token2.type == token_type_symbol || token2.type == token_type_string_literal) { tp->type = token_type_string_literal; tp->string = token2.string; } else phase2_unget (&token2); } } /* ========================= Extracting strings. ========================== */ /* The file is broken into tokens. Scan the token stream, looking for the following patterns NLS ? NLS at: NLS at: plural: where is one of string_literal # string_literal # symbol */ void extract_smalltalk (FILE *f, const char *real_filename, const char *logical_filename, msgdomain_list_ty *mdlp) { message_list_ty *mlp = mdlp->item[0]->messages; fp = f; real_file_name = real_filename; logical_file_name = xstrdup (logical_filename); line_number = 1; last_comment_line = -1; last_non_comment_line = -1; /* Eat tokens until eof is seen. */ { /* 0 when no "NLS" has been seen. 1 after "NLS". 2 after "NLS ?". 3 after "NLS at:". 4 after "NLS at: ". 5 after "NLS at: plural:". */ int state; /* Remember the message containing the msgid, for msgid_plural. Non-NULL in states 4, 5. */ message_ty *plural_mp = NULL; /* Start state is 0. */ state = 0; for (;;) { token_ty token; x_smalltalk_lex (&token); switch (token.type) { case token_type_symbol: state = (strcmp (token.string, "NLS") == 0 ? 1 : strcmp (token.string, "?") == 0 && state == 1 ? 2 : strcmp (token.string, "at:") == 0 && state == 1 ? 3 : strcmp (token.string, "plural:") == 0 && state == 4 ? 5 : 0); free (token.string); break; case token_type_string_literal: if (state == 2) { lex_pos_ty pos; pos.file_name = logical_file_name; pos.line_number = token.line_number; remember_a_message (mlp, token.string, &pos); state = 0; break; } if (state == 3) { lex_pos_ty pos; pos.file_name = logical_file_name; pos.line_number = token.line_number; plural_mp = remember_a_message (mlp, token.string, &pos); state = 4; break; } if (state == 5) { lex_pos_ty pos; pos.file_name = logical_file_name; pos.line_number = token.line_number; remember_a_message_plural (plural_mp, token.string, &pos); state = 0; break; } state = 0; free (token.string); break; case token_type_uniq: case token_type_other: state = 0; break; case token_type_eof: break; default: abort (); } if (token.type == token_type_eof) break; } } /* Close scanner. */ fp = NULL; real_file_name = NULL; logical_file_name = NULL; line_number = 0; }