From a0dc7d9af85e06f151bb52ad68e6f900cd1adf5c Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Mon, 27 Aug 2001 12:04:15 +0000 Subject: Java format string checking. --- src/format-java.c | 779 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 779 insertions(+) create mode 100644 src/format-java.c (limited to 'src') diff --git a/src/format-java.c b/src/format-java.c new file mode 100644 index 0000000..fbc5549 --- /dev/null +++ b/src/format-java.c @@ -0,0 +1,779 @@ +/* Java format strings. + Copyright (C) 2001 Free Software Foundation, Inc. + Written by Bruno Haible , 2001. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include + +#include "format.h" +#include "c-ctype.h" +#include "system.h" +#include "error.h" +#include "progname.h" +#include "libgettext.h" + +#define _(str) gettext (str) + +/* Java format strings are described in java/text/MessageFormat.html. + See also the ICU documentation class_MessageFormat.html. + + messageFormatPattern := string ( "{" messageFormatElement "}" string )* + + messageFormatElement := argument { "," elementFormat } + + elementFormat := "time" { "," datetimeStyle } + | "date" { "," datetimeStyle } + | "number" { "," numberStyle } + | "choice" { "," choiceStyle } + + datetimeStyle := "short" + | "medium" + | "long" + | "full" + | dateFormatPattern + + numberStyle := "currency" + | "percent" + | "integer" + | numberFormatPattern + + choiceStyle := choiceFormatPattern + + dateFormatPattern see SimpleDateFormat.applyPattern + + numberFormatPattern see DecimalFormat.applyPattern + + choiceFormatPattern see ChoiceFormat constructor + + In strings, literal curly braces can be used if quoted between single + quotes. A real single quote is represented by ''. + + If a pattern is used, then unquoted braces in the pattern, if any, must + match: that is, "ab {0} de" and "ab '}' de" are ok, but "ab {0'}' de" and + "ab } de" are not. + + The argument is a number from 0 to 9, which corresponds to the arguments + presented in an array to be formatted. + + It is ok to have unused arguments in the array. + + Adding a dateFormatPattern / numberFormatPattern / choiceFormatPattern + to an elementFormat is equivalent to creating a SimpleDateFormat / + DecimalFormat / ChoiceFormat and use of setFormat. For example, + + MessageFormat form = + new MessageFormat("The disk \"{1}\" contains {0,choice,0#no files|1#one file|2#{0,number} files}."); + + is equivalent to + + MessageFormat form = new MessageFormat("The disk \"{1}\" contains {0}."); + form.setFormat(1, // Number of {} occurrence in the string! + new ChoiceFormat(new double[] { 0, 1, 2 }, + new String[] { "no files", "one file", + "{0,number} files" })); + + Note: The behaviour of quotes inside a choiceFormatPattern is not clear. + Example 1: + "abc{1,choice,0#{1,number,00';'000}}def" + JDK 1.1.x: exception + JDK 1.3.x: behaves like "abc{1,choice,0#{1,number,00;000}}def" + Example 2: + "abc{1,choice,0#{1,number,00';'}}def" + JDK 1.1.x: interprets the semicolon as number suffix + JDK 1.3.x: behaves like "abc{1,choice,0#{1,number,00;}}def" + */ + +enum format_arg_type +{ + FAT_NONE, + FAT_OBJECT, /* java.lang.Object */ + FAT_NUMBER, /* java.lang.Number */ + FAT_DATE /* java.util.Date */ +}; + +struct numbered_arg +{ + unsigned int number; + enum format_arg_type type; +}; + +struct spec +{ + unsigned int directives; + unsigned int numbered_arg_count; + unsigned int allocated; + struct numbered_arg *numbered; +}; + + +/* Prototypes for local functions. Needed to ensure compiler checking of + function argument counts despite of K&R C function definition syntax. */ +static bool message_format_parse PARAMS ((const char *format, + struct spec *spec)); +static bool date_format_parse PARAMS ((const char *format)); +static bool number_format_parse PARAMS ((const char *format)); +static bool choice_format_parse PARAMS ((const char *format, + struct spec *spec)); +static int numbered_arg_compare PARAMS ((const void *p1, const void *p2)); +static void *format_parse PARAMS ((const char *format)); +static void format_free PARAMS ((void *descr)); +static int format_get_number_of_directives PARAMS ((void *descr)); +static bool format_check PARAMS ((const lex_pos_ty *pos, + void *msgid_descr, void *msgstr_descr)); + + +/* Quote handling: + - When we see a single-quote, ignore it, but toggle the quoting flag. + - When we see a double single-quote, ignore the first of the two. + Assumes local variables format, quoting. */ +#define HANDLE_QUOTE \ + if (*format == '\'' && *++format != '\'') \ + quoting = !quoting; + +/* Note that message_format_parse and choice_format_parse are mutually + recursive. This is because MessageFormat can use some ChoiceFormats, + and a ChoiceFormat is made up from several MessageFormats. */ + +/* Return true if a format is a valid messageFormatPattern. + Extracts argument type information into spec. */ +static bool +message_format_parse (format, spec) + const char *format; + struct spec *spec; +{ + bool quoting = false; + + for (;;) + { + HANDLE_QUOTE; + if (!quoting && *format == '{') + { + unsigned int depth; + const char *element_start; + const char *element_end; + size_t n; + char *element; + unsigned int number; + enum format_arg_type type; + + spec->directives++; + + element_start = ++format; + depth = 0; + for (; *format != '\0'; format++) + { + if (*format == '{') + depth++; + else if (*format == '}') + { + if (depth == 0) + break; + else + depth--; + } + } + if (*format == '\0') + return false; + element_end = format++; + + n = element_end - element_start; + element = (char *) alloca (n + 1); + memcpy (element, element_start, n); + element[n] = '\0'; + + if (!c_isdigit (*element)) + return false; + number = 0; + do + { + number = 10 * number + (*element - '0'); + element++; + } + while (c_isdigit (*element)); + + type = FAT_OBJECT; + if (*element == '\0') + ; + else if (strncmp (element, ",time", 5) == 0 + || strncmp (element, ",date", 5) == 0) + { + type = FAT_DATE; + element += 5; + if (*element == '\0') + ; + else if (*element++ == ',' + && (strcmp (element, "short") == 0 + || strcmp (element, "medium") == 0 + || strcmp (element, "long") == 0 + || strcmp (element, "full") == 0 + || date_format_parse (element))) + ; + else + return false; + } + else if (strncmp (element, ",number", 7) == 0) + { + type = FAT_NUMBER; + element += 7; + if (*element == '\0') + ; + else if (*element++ == ',' + && (strcmp (element, "currency") == 0 + || strcmp (element, "percent") == 0 + || strcmp (element, "integer") == 0 + || number_format_parse (element))) + ; + else + return false; + } + else if (strncmp (element, ",choice", 7) == 0) + { + type = FAT_NUMBER; /* because ChoiceFormat extends NumberFormat */ + element += 7; + if (*element == '\0') + ; + else if (*element++ == ',' + && choice_format_parse (element, spec)) + ; + else + return false; + } + else + return false; + + if (spec->allocated == spec->numbered_arg_count) + { + spec->allocated = 2 * spec->allocated + 1; + spec->numbered = (struct numbered_arg *) xrealloc (spec->numbered, spec->allocated * sizeof (struct numbered_arg)); + } + spec->numbered[spec->numbered_arg_count].number = number; + spec->numbered[spec->numbered_arg_count].type = type; + spec->numbered_arg_count++; + } + /* The doc says "ab}de" is invalid. Even though JDK accepts it. */ + else if (!quoting && *format == '}') + return false; + else if (*format != '\0') + format++; + else + break; + } + + return true; +} + +/* Return true if a format is a valid dateFormatPattern. */ +static bool +date_format_parse (format) + const char *format; +{ + /* Any string is valid. Single-quote starts a quoted section, to be + terminated at the next single-quote or string end. Double single-quote + gives a single single-quote. Non-quoted ASCII letters are first grouped + into blocks of equal letters. Then each block (e.g. 'yyyy') is + interpreted according to some rules. */ + return true; +} + +/* Return true if a format is a valid numberFormatPattern. */ +static bool +number_format_parse (format) + const char *format; +{ + /* Pattern Syntax: + pattern := pos_pattern{';' neg_pattern} + pos_pattern := {prefix}number{suffix} + neg_pattern := {prefix}number{suffix} + number := integer{'.' fraction}{exponent} + prefix := '\u0000'..'\uFFFD' - special_characters + suffix := '\u0000'..'\uFFFD' - special_characters + integer := min_int | '#' | '#' integer | '#' ',' integer + min_int := '0' | '0' min_int | '0' ',' min_int + fraction := '0'* '#'* + exponent := 'E' '0' '0'* + Notation: + X* 0 or more instances of X + { X } 0 or 1 instances of X + X | Y either X or Y + X..Y any character from X up to Y, inclusive + S - T characters in S, except those in T + Single-quote starts a quoted section, to be terminated at the next + single-quote or string end. Double single-quote gives a single + single-quote. + */ + bool quoting = false; + bool seen_semicolon = false; + + HANDLE_QUOTE; + for (;;) + { + /* Parse prefix. */ + while (*format != '\0' + && !(!quoting && (*format == '0' || *format == '#'))) + { + if (format[0] == '\\') + { + if (format[1] == 'u' + && c_isxdigit (format[2]) + && c_isxdigit (format[3]) + && c_isxdigit (format[4]) + && c_isxdigit (format[5])) + format += 6; + else + format += 2; + } + else + format += 1; + HANDLE_QUOTE; + } + + /* Parse integer. */ + if (!(!quoting && (*format == '0' || *format == '#'))) + return false; + while (!quoting && *format == '#') + { + format++; + HANDLE_QUOTE; + if (!quoting && *format == ',') + { + format++; + HANDLE_QUOTE; + } + } + while (!quoting && *format == '0') + { + format++; + HANDLE_QUOTE; + if (!quoting && *format == ',') + { + format++; + HANDLE_QUOTE; + } + } + + /* Parse fraction. */ + if (!quoting && *format == '.') + { + format++; + HANDLE_QUOTE; + while (!quoting && *format == '0') + { + format++; + HANDLE_QUOTE; + } + while (!quoting && *format == '#') + { + format++; + HANDLE_QUOTE; + } + } + + /* Parse exponent. */ + if (!quoting && *format == 'E') + { + const char *format_save = format; + format++; + HANDLE_QUOTE; + if (!quoting && *format == '0') + { + do + { + format++; + HANDLE_QUOTE; + } + while (!quoting && *format == '0'); + } + else + { + /* Back up. */ + format = format_save; + quoting = false; + } + } + + /* Parse suffix. */ + while (*format != '\0' + && (seen_semicolon || !(!quoting && *format == ';'))) + { + if (format[0] == '\\') + { + if (format[1] == 'u' + && c_isxdigit (format[2]) + && c_isxdigit (format[3]) + && c_isxdigit (format[4]) + && c_isxdigit (format[5])) + format += 6; + else + format += 2; + } + else + format += 1; + HANDLE_QUOTE; + } + + if (seen_semicolon || !(!quoting && *format == ';')) + break; + } + + return (*format == '\0'); +} + +/* Return true if a format is a valid choiceFormatPattern. + Extracts argument type information into spec. */ +static bool +choice_format_parse (format, spec) + const char *format; + struct spec *spec; +{ + /* Pattern syntax: + pattern := | choice | choice '|' pattern + choice := number separator messageformat + separator := '<' | '#' | '\u2264' + Single-quote starts a quoted section, to be terminated at the next + single-quote or string end. Double single-quote gives a single + single-quote. + */ + bool quoting = false; + + HANDLE_QUOTE; + if (*format == '\0') + return true; + for (;;) + { + /* Don't bother looking too precisely into the syntax of the number. + It can contain various Unicode characters. */ + char *msgformat; + char *mp; + + /* Parse number. */ + while (*format != '\0' + && !(!quoting && (*format == '<' || *format == '#' + || strncmp (format, "\\u2264", 6) == 0 + || *format == '|'))) + { + if (format[0] == '\\') + { + if (format[1] == 'u' + && c_isxdigit (format[2]) + && c_isxdigit (format[3]) + && c_isxdigit (format[4]) + && c_isxdigit (format[5])) + format += 6; + else + format += 2; + } + else + format += 1; + HANDLE_QUOTE; + } + + /* Short clause at end of pattern is valid and is ignored! */ + if (*format == '\0') + break; + + if (*format == '<' || *format == '#') + format += 1; + else if (strncmp (format, "\\u2264", 6) == 0) + format += 6; + else + return false; + HANDLE_QUOTE; + + msgformat = (char *) alloca (strlen (format) + 1); + mp = msgformat; + + while (*format != '\0' && !(!quoting && *format == '|')) + { + *mp++ = *format++; + HANDLE_QUOTE; + } + *mp = '\0'; + + if (!message_format_parse (msgformat, spec)) + return false; + + if (*format == '\0') + break; + + format++; + HANDLE_QUOTE; + } + + return true; +} + +static int +numbered_arg_compare (p1, p2) + const void *p1; + const void *p2; +{ + unsigned int n1 = ((const struct numbered_arg *) p1)->number; + unsigned int n2 = ((const struct numbered_arg *) p2)->number; + + return (n1 > n2 ? 1 : n1 < n2 ? -1 : 0); +} + +static void * +format_parse (format) + const char *format; +{ + struct spec spec; + struct spec *result; + + spec.directives = 0; + spec.numbered_arg_count = 0; + spec.allocated = 0; + spec.numbered = NULL; + + if (!message_format_parse (format, &spec)) + goto bad_format; + + /* Sort the numbered argument array, and eliminate duplicates. */ + if (spec.numbered_arg_count > 1) + { + unsigned int i, j; + bool err; + + qsort (spec.numbered, spec.numbered_arg_count, + sizeof (struct numbered_arg), numbered_arg_compare); + + /* Remove duplicates: Copy from i to j, keeping 0 <= j <= i. */ + err = false; + for (i = j = 0; i < spec.numbered_arg_count; i++) + if (j > 0 && spec.numbered[i].number == spec.numbered[j-1].number) + { + enum format_arg_type type1 = spec.numbered[i].type; + enum format_arg_type type2 = spec.numbered[j-1].type; + enum format_arg_type type_both; + + if (type1 == type2 || type2 == FAT_OBJECT) + type_both = type1; + else if (type1 == FAT_OBJECT) + type_both = type2; + else + /* Incompatible types. */ + type_both = FAT_NONE, err = true; + + spec.numbered[j-1].type = type_both; + } + else + { + if (j < i) + { + spec.numbered[j].number = spec.numbered[i].number; + spec.numbered[j].type = spec.numbered[i].type; + } + j++; + } + spec.numbered_arg_count = j; + if (err) + goto bad_format; + } + + result = (struct spec *) xmalloc (sizeof (struct spec)); + *result = spec; + return result; + + bad_format: + if (spec.numbered != NULL) + free (spec.numbered); + return NULL; +} + +static void +format_free (descr) + void *descr; +{ + struct spec *spec = (struct spec *) descr; + + if (spec->numbered != NULL) + free (spec->numbered); + free (spec); +} + +static int +format_get_number_of_directives (descr) + void *descr; +{ + struct spec *spec = (struct spec *) descr; + + return spec->directives; +} + +static bool +format_check (pos, msgid_descr, msgstr_descr) + const lex_pos_ty *pos; + void *msgid_descr; + void *msgstr_descr; +{ + struct spec *spec1 = (struct spec *) msgid_descr; + struct spec *spec2 = (struct spec *) msgstr_descr; + bool err = false; + + if (spec1->numbered_arg_count + spec2->numbered_arg_count > 0) + { + unsigned int i; + unsigned int n = MAX (spec1->numbered_arg_count, spec2->numbered_arg_count); + + /* Check the argument names are the same. + Both arrays are sorted. We search for the first difference. */ + for (i = 0; i < n; i++) + { + int cmp = (i >= spec1->numbered_arg_count ? 1 : + i >= spec2->numbered_arg_count ? -1 : + spec1->numbered[i].number > spec2->numbered[i].number ? 1 : + spec1->numbered[i].number < spec2->numbered[i].number ? -1 : + 0); + + if (cmp > 0) + { + error_with_progname = false; + error_at_line (0, 0, pos->file_name, pos->line_number, + _("a format specification for argument {%u} doesn't exist in 'msgid'"), + spec2->numbered[i].number); + error_with_progname = true; + err = true; + break; + } + else if (cmp < 0) + { + error_with_progname = false; + error_at_line (0, 0, pos->file_name, pos->line_number, + _("a format specification for argument {%u} doesn't exist in 'msgstr'"), + spec1->numbered[i].number); + error_with_progname = true; + err = true; + break; + } + } + /* Check the argument types are the same. */ + if (!err) + for (i = 0; i < spec2->numbered_arg_count; i++) + if (spec1->numbered[i].type != spec2->numbered[i].type) + { + error_with_progname = false; + error_at_line (0, 0, pos->file_name, pos->line_number, + _("format specifications in 'msgid' and 'msgstr' for argument {%u} are not the same"), + spec2->numbered[i].number); + error_with_progname = true; + err = true; + break; + } + } + + return err; +} + + +struct formatstring_parser formatstring_java = +{ + format_parse, + format_free, + format_get_number_of_directives, + format_check +}; + + +#ifdef TEST + +/* Test program: Print the argument list specification returned by + format_parse for strings read from standard input. */ + +#include +#include "getline.h" + +static void +format_print (descr) + void *descr; +{ + struct spec *spec = (struct spec *) descr; + unsigned int last; + unsigned int i; + + if (spec == NULL) + { + printf ("INVALID"); + return; + } + + printf ("("); + last = 0; + for (i = 0; i < spec->numbered_arg_count; i++) + { + unsigned int number = spec->numbered[i].number; + + if (i > 0) + printf (" "); + if (number < last) + abort (); + for (; last < number; last++) + printf ("_ "); + switch (spec->numbered[i].type) + { + case FAT_OBJECT: + printf ("*"); + break; + case FAT_NUMBER: + printf ("Number"); + break; + case FAT_DATE: + printf ("Date"); + break; + default: + abort (); + } + last = number + 1; + } + printf (")"); +} + +int +main () +{ + for (;;) + { + char *line = NULL; + size_t line_len = 0; + void *descr; + + if (getline (&line, &line_len, stdin) < 0) + break; + + descr = format_parse (line); + + format_print (descr); + printf ("\n"); + + free (line); + } + + return 0; +} + +/* + * For Emacs M-x compile + * Local Variables: + * compile-command: "gcc -O -g -Wall -I.. -I../lib -I../intl -DHAVE_CONFIG_H -DTEST format-java.c ../lib/libnlsut.a" + * End: + */ + +#endif /* TEST */ -- cgit v1.1