/* Reading NeXTstep/GNUstep .strings files. Copyright (C) 2003 Free Software Foundation, Inc. Written by Bruno Haible , 2003. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifdef HAVE_CONFIG_H # include #endif /* Specification. */ #include "read-stringtable.h" #include #include #include #include #include #include #include "error.h" #include "error-progname.h" #include "read-po-abstract.h" #include "po-hash.h" #include "xalloc.h" #include "exit.h" #include "utf8-ucs4.h" #include "ucs4-utf8.h" #include "gettext.h" #define _(str) gettext (str) /* The format of NeXTstep/GNUstep .strings files is documented in gnustep-base-1.8.0/Tools/make_strings/Using.txt and in the comments of method propertyListFromStringsFileFormat in gnustep-base-1.8.0/Source/NSString.m In summary, it's a Objective-C like file with pseudo-assignments of the form "key" = "value"; where the key is the msgid and the value is the msgstr. The implementation of the parser of .strings files is in gnustep-base-1.8.0/Source/NSString.m function GSPropertyListFromStringsFormat (indirectly called from NSBundle's method localizedStringForKey). A test case is in gnustep-base-1.8.0/Testing/English.lproj/NXStringTable.example */ /* Handling of comments: We copy all comments from the .strings file to the PO file. 
This is not really needed; it's a service for translators who don't like PO files and prefer to maintain the .strings file. */ /* Real filename, used in error messages about the input file. */ static const char *real_file_name; /* File name and line number. */ extern lex_pos_ty gram_pos; /* The input file stream. */ static FILE *fp; /* Phase 1: Read a byte. Max. 4 pushback characters. */ static unsigned char phase1_pushback[4]; static int phase1_pushback_length; static int phase1_getc () { int c; if (phase1_pushback_length) return phase1_pushback[--phase1_pushback_length]; c = getc (fp); if (c == EOF) { if (ferror (fp)) error (EXIT_FAILURE, errno, _("error while reading \"%s\""), real_file_name); return EOF; } return c; } static void phase1_ungetc (int c) { if (c != EOF) phase1_pushback[phase1_pushback_length++] = c; } /* Phase 2: Read an UCS-4 character. Max. 2 pushback characters. */ /* End-of-file indicator for functions returning an UCS-4 character. */ #define UEOF -1 static int phase2_pushback[4]; static int phase2_pushback_length; /* The input file can be in Unicode encoding (UCS-2BE, UCS-2LE, UTF-8, each with a BOM!), or otherwise the locale-dependent default encoding is used. Since we don't want to depend on the locale here, we use ISO-8859-1 instead. */ enum enc { enc_undetermined, enc_ucs2be, enc_ucs2le, enc_utf8, enc_iso8859_1 }; static enum enc encoding; static int phase2_getc () { if (phase2_pushback_length) return phase2_pushback[--phase2_pushback_length]; if (encoding == enc_undetermined) { /* Determine the input file's encoding. 
*/ int c0, c1; c0 = phase1_getc (); if (c0 == EOF) return UEOF; c1 = phase1_getc (); if (c1 == EOF) { phase1_ungetc (c0); encoding = enc_iso8859_1; } else if (c0 == 0xfe && c1 == 0xff) encoding = enc_ucs2be; else if (c0 == 0xff && c1 == 0xfe) encoding = enc_ucs2le; else { int c2; c2 = phase1_getc (); if (c2 == EOF) { phase1_ungetc (c1); phase1_ungetc (c0); encoding = enc_iso8859_1; } else if (c0 == 0xef && c1 == 0xbb && c2 == 0xbf) encoding = enc_utf8; else { phase1_ungetc (c2); phase1_ungetc (c1); phase1_ungetc (c0); encoding = enc_iso8859_1; } } } switch (encoding) { case enc_ucs2be: /* Read an UCS-2BE encoded character. */ { int c0, c1; c0 = phase1_getc (); if (c0 == EOF) return UEOF; c1 = phase1_getc (); if (c1 == EOF) return UEOF; return (c0 << 8) + c1; } case enc_ucs2le: /* Read an UCS-2LE encoded character. */ { int c0, c1; c0 = phase1_getc (); if (c0 == EOF) return UEOF; c1 = phase1_getc (); if (c1 == EOF) return UEOF; return c0 + (c1 << 8); } case enc_utf8: /* Read an UTF-8 encoded character. 
*/ { unsigned char buf[6]; unsigned int count; int c; unsigned int uc; c = phase1_getc (); if (c == EOF) return UEOF; buf[0] = c; count = 1; if (buf[0] >= 0xc0) { c = phase1_getc (); if (c == EOF) return UEOF; buf[1] = c; count = 2; } if (buf[0] >= 0xe0 && ((buf[1] ^ 0x80) < 0x40)) { c = phase1_getc (); if (c == EOF) return UEOF; buf[2] = c; count = 3; } if (buf[0] >= 0xf0 && ((buf[1] ^ 0x80) < 0x40) && ((buf[2] ^ 0x80) < 0x40)) { c = phase1_getc (); if (c == EOF) return UEOF; buf[3] = c; count = 4; } if (buf[0] >= 0xf8 && ((buf[1] ^ 0x80) < 0x40) && ((buf[2] ^ 0x80) < 0x40) && ((buf[3] ^ 0x80) < 0x40)) { c = phase1_getc (); if (c == EOF) return UEOF; buf[4] = c; count = 5; } if (buf[0] >= 0xfc && ((buf[1] ^ 0x80) < 0x40) && ((buf[2] ^ 0x80) < 0x40) && ((buf[3] ^ 0x80) < 0x40) && ((buf[4] ^ 0x80) < 0x40)) { c = phase1_getc (); if (c == EOF) return UEOF; buf[5] = c; count = 6; } u8_mbtouc (&uc, buf, count); return uc; } case enc_iso8859_1: /* Read an ISO-8859-1 encoded character. */ { int c = phase1_getc (); if (c == EOF) return UEOF; return c; } default: abort (); } } static void phase2_ungetc (int c) { if (c != UEOF) phase2_pushback[phase2_pushback_length++] = c; } /* Phase 3: Read an UCS-4 character, with line number handling. */ static int phase3_getc () { int c = phase2_getc (); if (c == '\n') gram_pos.line_number++; return c; } static void phase3_ungetc (int c) { if (c == '\n') --gram_pos.line_number; phase2_ungetc (c); } /* Convert from UCS-4 to UTF-8. */ static char * conv_from_ucs4 (const int *buffer, size_t buflen) { unsigned char *utf8_string; size_t pos; unsigned char *q; /* Each UCS-4 word needs 6 bytes at worst. */ utf8_string = (unsigned char *) xmalloc (6 * buflen + 1); for (pos = 0, q = utf8_string; pos < buflen; ) { unsigned int uc; int n; uc = buffer[pos++]; n = u8_uctomb (q, uc, 6); assert (n > 0); q += n; } *q = '\0'; assert (q - utf8_string <= 6 * buflen); return (char *) utf8_string; } /* Parse a string enclosed in double-quotes. 
Input is UCS-4 encoded. Return the string in UTF-8 encoding, or NULL if the input doesn't represent a valid string enclosed in double-quotes. */ static char * parse_escaped_string (const int *string, size_t length) { static int *buffer; static size_t bufmax; static size_t buflen; const int *string_limit = string + length; int c; if (string == string_limit) return NULL; c = *string++; if (c != '"') return NULL; buflen = 0; for (;;) { if (string == string_limit) return NULL; c = *string++; if (c == '"') break; if (c == '\\') { if (string == string_limit) return NULL; c = *string++; if (c >= '0' && c <= '7') { unsigned int n = 0; int j = 0; for (;;) { n = n * 8 + (c - '0'); if (++j == 3) break; if (string == string_limit) break; c = *string; if (!(c >= '0' && c <= '7')) break; string++; } c = n; } else if (c == 'u' || c == 'U') { unsigned int n = 0; int j; for (j = 0; j < 4; j++) { if (string == string_limit) break; c = *string; if (c >= '0' && c <= '9') n = n * 16 + (c - '0'); else if (c >= 'A' && c <= 'F') n = n * 16 + (c - 'A' + 10); else if (c >= 'a' && c <= 'f') n = n * 16 + (c - 'a' + 10); else break; string++; } c = n; } else switch (c) { case 'a': c = '\a'; break; case 'b': c = '\b'; break; case 't': c = '\t'; break; case 'r': c = '\r'; break; case 'n': c = '\n'; break; case 'v': c = '\v'; break; case 'f': c = '\f'; break; } } if (buflen >= bufmax) { bufmax = 2 * bufmax + 10; buffer = xrealloc (buffer, bufmax * sizeof (int)); } buffer[buflen++] = c; } return conv_from_ucs4 (buffer, buflen); } /* Accumulating flag comments. 
*/ static char *special_comment; static inline void special_comment_reset () { if (special_comment != NULL) free (special_comment); special_comment = NULL; } static void special_comment_add (const char *flag) { if (special_comment == NULL) special_comment = xstrdup (flag); else { size_t total_len = strlen (special_comment) + 2 + strlen (flag) + 1; special_comment = xrealloc (special_comment, total_len); strcat (special_comment, ", "); strcat (special_comment, flag); } } static inline void special_comment_finish () { if (special_comment != NULL) { po_callback_comment_special (special_comment); free (special_comment); special_comment = NULL; } } /* Accumulating comments. */ static int *buffer; static size_t bufmax; static size_t buflen; static bool next_is_obsolete; static bool next_is_fuzzy; static char *fuzzy_msgstr; static bool expect_fuzzy_msgstr_as_c_comment; static bool expect_fuzzy_msgstr_as_cxx_comment; static inline void comment_start () { buflen = 0; } static inline void comment_add (int c) { if (buflen >= bufmax) { bufmax = 2 * bufmax + 10; buffer = xrealloc (buffer, bufmax * sizeof (int)); } buffer[buflen++] = c; } static inline void comment_line_end (size_t chars_to_remove, bool test_for_fuzzy_msgstr) { char *line; buflen -= chars_to_remove; /* Drop trailing white space, but not EOLs. */ while (buflen >= 1 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) --buflen; /* At special positions we interpret a comment of the form = "escaped string" with an optional trailing semicolon as being the fuzzy msgstr, not a regular comment. 
*/ if (test_for_fuzzy_msgstr && buflen > 2 && buffer[0] == '=' && buffer[1] == ' ' && (fuzzy_msgstr = parse_escaped_string (buffer + 2, buflen - (buffer[buflen - 1] == ';') - 2))) return; line = conv_from_ucs4 (buffer, buflen); if (strcmp (line, "Flag: untranslated") == 0) { special_comment_add ("fuzzy"); next_is_fuzzy = true; } else if (strcmp (line, "Flag: unmatched") == 0) next_is_obsolete = true; else if (strlen (line) >= 6 && memcmp (line, "Flag: ", 6) == 0) special_comment_add (line + 6); else if (strlen (line) >= 9 && memcmp (line, "Comment: ", 9) == 0) /* A comment extracted from the source. */ po_callback_comment_dot (line + 9); else { char *last_colon; unsigned long number; char *endp; if (strlen (line) >= 6 && memcmp (line, "File: ", 6) == 0 && (last_colon = strrchr (line + 6, ':')) != NULL && *(last_colon + 1) != '\0' && (number = strtoul (last_colon + 1, &endp, 10), *endp == '\0')) { /* A "File: :" type comment. */ *last_colon = '\0'; po_callback_comment_filepos (line + 6, number); } else po_callback_comment (line); } } /* Phase 4: Replace each comment that is not inside a string with a space character. */ static int phase4_getc () { int c; c = phase3_getc (); if (c != '/') return c; c = phase3_getc (); switch (c) { default: phase3_ungetc (c); return '/'; case '*': /* C style comment. */ { bool last_was_star; size_t trailing_stars; bool seen_newline; comment_start (); last_was_star = false; trailing_stars = 0; seen_newline = false; /* Drop additional stars at the beginning of the comment. */ for (;;) { c = phase3_getc (); if (c != '*') break; last_was_star = true; } phase3_ungetc (c); for (;;) { c = phase3_getc (); if (c == UEOF) break; /* We skip all leading white space, but not EOLs. 
*/ if (!(buflen == 0 && (c == ' ' || c == '\t'))) comment_add (c); switch (c) { case '\n': seen_newline = true; comment_line_end (1, false); comment_start (); last_was_star = false; trailing_stars = 0; continue; case '*': last_was_star = true; trailing_stars++; continue; case '/': if (last_was_star) { /* Drop additional stars at the end of the comment. */ comment_line_end (trailing_stars + 1, expect_fuzzy_msgstr_as_c_comment && !seen_newline); break; } /* FALLTHROUGH */ default: last_was_star = false; trailing_stars = 0; continue; } break; } return ' '; } case '/': /* C++ style comment. */ comment_start (); for (;;) { c = phase3_getc (); if (c == '\n' || c == UEOF) break; /* We skip all leading white space, but not EOLs. */ if (!(buflen == 0 && (c == ' ' || c == '\t'))) comment_add (c); } comment_line_end (0, expect_fuzzy_msgstr_as_cxx_comment); return '\n'; } } static inline void phase4_ungetc (int c) { phase3_ungetc (c); } /* Return true if a character is considered as whitespace. */ static bool is_whitespace (int c) { return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f' || c == '\b'); } /* Return true if a character needs quoting, i.e. cannot be used in unquoted tokens. */ static bool is_quotable (int c) { if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) return false; switch (c) { case '!': case '#': case '$': case '%': case '&': case '*': case '+': case '-': case '.': case '/': case ':': case '?': case '@': case '|': case '~': case '_': case '^': return false; default: return true; } } /* Read a key or value string. Return the string in UTF-8 encoding, or NULL if no string is seen. Return the start position of the string in *pos. */ static char * read_string (lex_pos_ty *pos) { static int *buffer; static size_t bufmax; static size_t buflen; int c; /* Skip whitespace before the string. */ do c = phase4_getc (); while (is_whitespace (c)); if (c == UEOF) /* No more string. 
*/ return NULL; *pos = gram_pos; buflen = 0; if (c == '"') { /* Read a string enclosed in double-quotes. */ for (;;) { c = phase3_getc (); if (c == UEOF || c == '"') break; if (c == '\\') { c = phase3_getc (); if (c == UEOF) break; if (c >= '0' && c <= '7') { unsigned int n = 0; int j = 0; for (;;) { n = n * 8 + (c - '0'); if (++j == 3) break; c = phase3_getc (); if (!(c >= '0' && c <= '7')) { phase3_ungetc (c); break; } } c = n; } else if (c == 'u' || c == 'U') { unsigned int n = 0; int j; for (j = 0; j < 4; j++) { c = phase3_getc (); if (c >= '0' && c <= '9') n = n * 16 + (c - '0'); else if (c >= 'A' && c <= 'F') n = n * 16 + (c - 'A' + 10); else if (c >= 'a' && c <= 'f') n = n * 16 + (c - 'a' + 10); else { phase3_ungetc (c); break; } } c = n; } else switch (c) { case 'a': c = '\a'; break; case 'b': c = '\b'; break; case 't': c = '\t'; break; case 'r': c = '\r'; break; case 'n': c = '\n'; break; case 'v': c = '\v'; break; case 'f': c = '\f'; break; } } if (buflen >= bufmax) { bufmax = 2 * bufmax + 10; buffer = xrealloc (buffer, bufmax * sizeof (int)); } buffer[buflen++] = c; } if (c == UEOF) { error_with_progname = false; error (0, 0, _("%s:%lu: warning: unterminated string"), real_file_name, (unsigned long) gram_pos.line_number); error_with_progname = true; } } else { /* Read a token outside quotes. */ if (is_quotable (c)) { error_with_progname = false; error (0, 0, _("%s:%lu: warning: syntax error"), real_file_name, (unsigned long) gram_pos.line_number); error_with_progname = true; } for (; c != UEOF && !is_quotable (c); c = phase4_getc ()) { if (buflen >= bufmax) { bufmax = 2 * bufmax + 10; buffer = xrealloc (buffer, bufmax * sizeof (int)); } buffer[buflen++] = c; } } return conv_from_ucs4 (buffer, buflen); } /* Read a .strings file from a stream, and dispatch to the various abstract_po_reader_class_ty methods. 
*/
/* FILE is the stream to read.  REAL_FILENAME is used in diagnostics and is
   the file name recorded in gram_pos.  POP and LOGICAL_FILENAME are not
   used by this implementation; all results are delivered through the
   po_callback_* functions.
   Parsing stops at end of input, or after a warning at the first syntax
   error.  */
void
stringtable_parse (abstract_po_reader_ty *pop, FILE *file,
                   const char *real_filename, const char *logical_filename)
{
  fp = file;
  real_file_name = real_filename;
  /* This copy is not freed here: every lex_pos_ty filled in from gram_pos
     shares the pointer.  NOTE(review): presumably the callbacks retain
     those positions -- confirm before adding a free.  */
  gram_pos.file_name = xstrdup (real_file_name);
  gram_pos.line_number = 1;
  encoding = enc_undetermined;
  expect_fuzzy_msgstr_as_c_comment = false;
  expect_fuzzy_msgstr_as_cxx_comment = false;

  for (;;)
    {
      char *msgid;
      lex_pos_ty msgid_pos;
      char *msgstr;
      lex_pos_ty msgstr_pos;
      int c;

      /* Prepare for next msgid/msgstr pair.  */
      special_comment_reset ();
      next_is_obsolete = false;
      next_is_fuzzy = false;
      fuzzy_msgstr = NULL;

      /* Read the key and all the comments preceding it.  */
      msgid = read_string (&msgid_pos);
      if (msgid == NULL)
        break;

      special_comment_finish ();

      /* Skip whitespace.  */
      do
        c = phase4_getc ();
      while (is_whitespace (c));

      /* Expect a '=' or ';'.  */
      if (c == UEOF)
        {
          error_with_progname = false;
          error (0, 0, _("%s:%lu: warning: unterminated key/value pair"),
                 real_file_name, (unsigned long) gram_pos.line_number);
          error_with_progname = true;
          /* The key was not handed to any callback.  */
          free (msgid);
          break;
        }
      if (c == ';')
        {
          /* "key"; is an abbreviation for "key"=""; and does not
             necessarily designate an untranslated entry.
             The empty value is heap-allocated like every other string
             passed to po_callback_message, rather than a string literal.
             NOTE(review): presumably the callee takes ownership of msgid
             and msgstr -- confirm.  */
          msgstr = xstrdup ("");
          msgstr_pos = msgid_pos;
          po_callback_message (msgid, &msgid_pos, NULL,
                               msgstr, strlen (msgstr) + 1, &msgstr_pos,
                               false, next_is_obsolete);
        }
      else if (c == '=')
        {
          /* Read the value.  */
          msgstr = read_string (&msgstr_pos);
          if (msgstr == NULL)
            {
              error_with_progname = false;
              error (0, 0, _("%s:%lu: warning: unterminated key/value pair"),
                     real_file_name, (unsigned long) gram_pos.line_number);
              error_with_progname = true;
              free (msgid);
              break;
            }

          /* Skip whitespace.  But for fuzzy key/value pairs, look for the
             tentative msgstr in the form of a C style comment.  */
          expect_fuzzy_msgstr_as_c_comment = next_is_fuzzy;
          do
            {
              c = phase4_getc ();
              if (fuzzy_msgstr != NULL)
                expect_fuzzy_msgstr_as_c_comment = false;
            }
          while (is_whitespace (c));
          expect_fuzzy_msgstr_as_c_comment = false;

          /* Expect a ';'.  */
          if (c == ';')
            {
              /* But for fuzzy key/value pairs, look for the tentative msgstr
                 in the form of a C++ style comment.  */
              if (fuzzy_msgstr == NULL && next_is_fuzzy)
                {
                  do
                    c = phase3_getc ();
                  while (c == ' ');
                  phase3_ungetc (c);

                  /* Reading one character through phase 4 processes a
                     comment that starts here; the character itself is
                     given back.  */
                  expect_fuzzy_msgstr_as_cxx_comment = true;
                  c = phase4_getc ();
                  phase4_ungetc (c);
                  expect_fuzzy_msgstr_as_cxx_comment = false;
                }
              if (fuzzy_msgstr != NULL)
                {
                  if (strcmp (msgstr, msgid) == 0)
                    {
                      /* The tentative msgstr replaces the value; release
                         the value that was read.  */
                      free (msgstr);
                      msgstr = fuzzy_msgstr;
                    }
                  else
                    /* The tentative msgstr is not used.  */
                    free (fuzzy_msgstr);
                  fuzzy_msgstr = NULL;
                }

              /* A key/value pair.  */
              po_callback_message (msgid, &msgid_pos, NULL,
                                   msgstr, strlen (msgstr) + 1, &msgstr_pos,
                                   false, next_is_obsolete);
            }
          else
            {
              error_with_progname = false;
              error (0, 0, _("\
%s:%lu: warning: syntax error, expected ';' after string"),
                     real_file_name, (unsigned long) gram_pos.line_number);
              error_with_progname = true;
              /* Nothing was handed to a callback.  */
              free (msgid);
              free (msgstr);
              free (fuzzy_msgstr);
              fuzzy_msgstr = NULL;
              break;
            }
        }
      else
        {
          error_with_progname = false;
          error (0, 0, _("\
%s:%lu: warning: syntax error, expected '=' or ';' after string"),
                 real_file_name, (unsigned long) gram_pos.line_number);
          error_with_progname = true;
          free (msgid);
          break;
        }
    }

  fp = NULL;
  real_file_name = NULL;
  gram_pos.line_number = 0;
}