diff options
Diffstat (limited to 'gettext-tools')
24 files changed, 923 insertions, 12 deletions
diff --git a/gettext-tools/doc/ChangeLog b/gettext-tools/doc/ChangeLog index 7226dde..bc0df95 100644 --- a/gettext-tools/doc/ChangeLog +++ b/gettext-tools/doc/ChangeLog @@ -1,3 +1,7 @@ +2015-03-03 Daiki Ueno <ueno@gnu.org> + + * xgettext.texi: Document options --check and --sentence-end. + 2015-02-09 Daiki Ueno <ueno@gnu.org> * gettext.texi (Plural forms): Add Arabic, Bahasa Indonesian, and diff --git a/gettext-tools/doc/xgettext.texi b/gettext-tools/doc/xgettext.texi index 451e25f..f01c9d2 100644 --- a/gettext-tools/doc/xgettext.texi +++ b/gettext-tools/doc/xgettext.texi @@ -144,6 +144,60 @@ gettext ( The second comment line will not be extracted, because there is one blank line between the comment line and the keyword. +@item --check=@var{CHECK} +@opindex --check@r{, @code{xgettext} option} +@cindex supported syntax checks, @code{xgettext} +Perform a syntax check on msgid and msgid_plural. The supported checks +are: + +@table @samp +@item ellipsis-unicode +Prefer Unicode ellipsis character over ASCII @code{...} + +@item space-ellipsis +Prohibit whitespace before an ellipsis character + +@item quote-unicode +Prefer Unicode quotation marks over ASCII @code{"'`} + +@end table + +The option has an effect on all input files. To enable or disable +checks for a certain string, you can mark it with an @code{xgettext:} +special comment in the source file. For example, if you specify the +@code{--check=space-ellipsis} option, but want to suppress the check on +a particular string, add the following comment: + +@example +/* xgettext: no-space-ellipsis-check */ +gettext ("We really want a space before ellipsis here ..."); +@end example + +The @code{xgettext:} comment can be followed by flags separated with a +comma. The possible flags are of the form @samp{[no-]@var{name}-check}, +where @var{name} is the name of a valid syntax check. If a flag is +prefixed by @code{no-}, the meaning is negated. 
+ +Some checks apply to each sentence within the msgid, rather +than the whole string. @code{xgettext} detects the end of sentence by +performing a pattern match, which usually looks for a period followed by +a certain number of spaces. The number is specified with the +@code{--sentence-end} option. + +@item --sentence-end=@var{TYPE} +@opindex --sentence-end@r{, @code{xgettext} option} +@cindex sentence end markers, @code{xgettext} +The supported values are: + +@table @samp +@item single-space +Expect at least one whitespace after a period (this is the default) + +@item double-space +Expect at least two whitespaces after a period + +@end table + @end table @subsection Language specific options diff --git a/gettext-tools/gnulib-lib/.gitignore b/gettext-tools/gnulib-lib/.gitignore index 36427cb..d87a15b 100644 --- a/gettext-tools/gnulib-lib/.gitignore +++ b/gettext-tools/gnulib-lib/.gitignore @@ -429,3 +429,4 @@ /qset-acl.c /secure_getenv.c /set-acl.c +/unictype.in.h diff --git a/gettext-tools/libgettextpo/.gitignore b/gettext-tools/libgettextpo/.gitignore index c4aed34..ef70a69 100644 --- a/gettext-tools/libgettextpo/.gitignore +++ b/gettext-tools/libgettextpo/.gitignore @@ -215,3 +215,4 @@ /charset.alias /exported.sh +/unictype.in.h diff --git a/gettext-tools/libgettextpo/Makefile.am b/gettext-tools/libgettextpo/Makefile.am index b4c07f7..10f5de6 100644 --- a/gettext-tools/libgettextpo/Makefile.am +++ b/gettext-tools/libgettextpo/Makefile.am @@ -90,7 +90,8 @@ libgettextpo_la_AUXSOURCES = \ ../src/format.c \ ../src/plural-exp.c \ ../src/plural-eval.c \ - ../src/msgl-check.c + ../src/msgl-check.c \ + ../src/sentence.c # Libtool's library version information for libgettextpo. # See the libtool documentation, section "Library interface versions". 
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index 93a7dd0..f0e10fe 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,44 @@ +2015-03-02 Daiki Ueno <ueno@gnu.org> + + xgettext: Support message syntax checks + With this change, xgettext could report common syntactic problems + in extracted strings. The current built-in checks are + ellipsis-unicode, space-ellipsis, and quote-unicode. Those checks + can be enabled with --check option of xgettext and disabled with + special "xgettext:" comment in source files. + Feature suggested by Philip Withnall in: + https://savannah.gnu.org/bugs/?44098 + * message.h (enum syntax_check_type): New enum. + (NSYNTAXCHECKS): New constant. + (enum is_syntax_check): New enum. + (struct message_ty): New field 'do_syntax_check'. + (syntax_check_name): New variable declaration. + * message.c (syntax_check_name): New variable. + * msgl-cat.c (catenate_msgdomain_list): Propagate + mp->do_syntax_check. + * msgmerge.c (message_merge): Propagate ref->do_syntax_check. + * msgl-check.h (syntax_check_message_list): New declaration. + * msgl-check.c (syntax_check_ellipsis_unicode): New function. + (syntax_check_space_ellipsis): New function. + (syntax_check_quote_unicode): New function. + (syntax_check_message): New function. + (syntax_check_message_list): New function. + * read-catalog-abstract.h (po_parse_comment_special): Adjust + function declaration. + * read-catalog-abstract.c (po_parse_comment_special): Add new + argument SCP for syntax checking; all callers changed. + * read-catalog.h (DEFAULT_CATALOG_READER_TY): New field + 'do_syntax_check'. + * read-catalog.c (default_constructor): Initialize + this->do_syntax_check. + (default_copy_comment_state): Propagate this->do_syntax_check. + * sentence.h: New file. + * sentence.c: New file. + * xgettext.c (long_options): Add options --check and --sentence-end. + (main): Handle options --check and --sentence-end. 
+ (usage): Document options --check and --sentence-end. + (remember_a_message): Propagate do_syntax_check value. + 2015-02-05 Alex Henrie <alexhenrie24@gmail.com> (tiny change) xgettext: Wrap location comments to 79 characters diff --git a/gettext-tools/src/Makefile.am b/gettext-tools/src/Makefile.am index 3f6ce30..edb376f 100644 --- a/gettext-tools/src/Makefile.am +++ b/gettext-tools/src/Makefile.am @@ -148,7 +148,7 @@ $(COMMON_SOURCE) read-catalog.c \ color.c write-catalog.c write-properties.c write-stringtable.c write-po.c \ msgl-ascii.c msgl-iconv.c msgl-equal.c msgl-cat.c msgl-header.c msgl-english.c \ msgl-check.c file-list.c msgl-charset.c po-time.c plural-exp.c plural-eval.c \ -plural-table.c quote.h \ +plural-table.c quote.h sentence.h sentence.c \ $(FORMAT_SOURCE) \ read-desktop.c diff --git a/gettext-tools/src/message.c b/gettext-tools/src/message.c index 586675f..2596887 100644 --- a/gettext-tools/src/message.c +++ b/gettext-tools/src/message.c @@ -104,6 +104,14 @@ possible_format_p (enum is_format is_format) } +const char *const syntax_check_name[NSYNTAXCHECKS] = +{ + /* sc_ellipsis_unicode */ "ellipsis-unicode", + /* sc_space_ellipsis */ "space-ellipsis", + /* sc_quote_unicode */ "quote-unicode" +}; + + message_ty * message_alloc (const char *msgctxt, const char *msgid, const char *msgid_plural, @@ -130,6 +138,8 @@ message_alloc (const char *msgctxt, mp->range.min = -1; mp->range.max = -1; mp->do_wrap = undecided; + for (i = 0; i < NSYNTAXCHECKS; i++) + mp->do_syntax_check[i] = undecided; mp->prev_msgctxt = NULL; mp->prev_msgid = NULL; mp->prev_msgid_plural = NULL; @@ -235,6 +245,8 @@ message_copy (message_ty *mp) result->is_format[i] = mp->is_format[i]; result->range = mp->range; result->do_wrap = mp->do_wrap; + for (i = 0; i < NSYNTAXCHECKS; i++) + result->do_syntax_check[i] = mp->do_syntax_check[i]; for (j = 0; j < mp->filepos_count; ++j) { lex_pos_ty *pp = &mp->filepos[j]; diff --git a/gettext-tools/src/message.h b/gettext-tools/src/message.h 
index bf2215a..8b9bc3f 100644 --- a/gettext-tools/src/message.h +++ b/gettext-tools/src/message.h @@ -114,6 +114,29 @@ enum is_wrap #endif +/* Kinds of syntax checks which apply to strings. */ +enum syntax_check_type +{ + sc_ellipsis_unicode, + sc_space_ellipsis, + sc_quote_unicode +}; +#define NSYNTAXCHECKS 3 +extern DLL_VARIABLE const char *const syntax_check_name[NSYNTAXCHECKS]; + +/* Is current msgid subject to a syntax check? */ +#if 0 +enum is_syntax_check +{ + undecided, + yes, + no +}; +#else /* HACK - C's enum concept is so stupid */ +#define is_syntax_check is_format +#endif + + struct altstr { const char *msgstr; @@ -175,6 +198,9 @@ struct message_ty /* Do we want the string to be wrapped in the emitted PO file? */ enum is_wrap do_wrap; + /* Do we want to apply extra syntax checks on the string? */ + enum is_syntax_check do_syntax_check[NSYNTAXCHECKS]; + /* The prev_msgctxt, prev_msgid and prev_msgid_plural strings appearing before the message, if present. Generated by msgmerge. */ const char *prev_msgctxt; diff --git a/gettext-tools/src/msgl-cat.c b/gettext-tools/src/msgl-cat.c index 0bd58d4..8502a64 100644 --- a/gettext-tools/src/msgl-cat.c +++ b/gettext-tools/src/msgl-cat.c @@ -308,6 +308,8 @@ domain \"%s\" in input file '%s' doesn't contain a header entry with a charset s tmp->range.min = - INT_MAX; tmp->range.max = - INT_MAX; tmp->do_wrap = yes; /* may be set to no later */ + for (i = 0; i < NSYNTAXCHECKS; i++) + tmp->do_syntax_check[i] = undecided; /* may be set to yes/no later */ tmp->obsolete = true; /* may be set to false later */ tmp->alternative_count = 0; tmp->alternative = NULL; @@ -535,6 +537,8 @@ UTF-8 encoded from the beginning, i.e. 
already in your source code files.\n"), tmp->is_format[i] = mp->is_format[i]; tmp->range = mp->range; tmp->do_wrap = mp->do_wrap; + for (i = 0; i < NSYNTAXCHECKS; i++) + tmp->do_syntax_check[i] = mp->do_syntax_check[i]; tmp->prev_msgctxt = mp->prev_msgctxt; tmp->prev_msgid = mp->prev_msgid; tmp->prev_msgid_plural = mp->prev_msgid_plural; @@ -583,6 +587,9 @@ UTF-8 encoded from the beginning, i.e. already in your source code files.\n"), } if (tmp->do_wrap == undecided) tmp->do_wrap = mp->do_wrap; + for (i = 0; i < NSYNTAXCHECKS; i++) + if (tmp->do_syntax_check[i] == undecided) + tmp->do_syntax_check[i] = mp->do_syntax_check[i]; tmp->obsolete = false; } else @@ -635,6 +642,12 @@ UTF-8 encoded from the beginning, i.e. already in your source code files.\n"), } if (mp->do_wrap == no) tmp->do_wrap = no; + for (i = 0; i < NSYNTAXCHECKS; i++) + if (mp->do_syntax_check[i] == yes) + tmp->do_syntax_check[i] = yes; + else if (mp->do_syntax_check[i] == no + && tmp->do_syntax_check[i] == undecided) + tmp->do_syntax_check[i] = no; /* Don't fill tmp->prev_msgid in this case. */ if (!mp->obsolete) tmp->obsolete = false; diff --git a/gettext-tools/src/msgl-check.c b/gettext-tools/src/msgl-check.c index d6f4a3d..b5f2537 100644 --- a/gettext-tools/src/msgl-check.c +++ b/gettext-tools/src/msgl-check.c @@ -40,6 +40,10 @@ #include "plural-table.h" #include "c-strstr.h" #include "message.h" +#include "quote.h" +#include "sentence.h" +#include "unictype.h" +#include "unistr.h" #include "gettext.h" #define _(str) gettext (str) @@ -912,3 +916,204 @@ check_message_list (message_list_ty *mlp, return seen_errors; } + + +static int +syntax_check_ellipsis_unicode (const message_ty *mp, const char *msgid) +{ + const char *str = msgid; + const char *str_limit = str + strlen (msgid); + int seen_errors = 0; + + while (str < str_limit) + { + const char *end, *cp; + ucs4_t ending_char; + + end = sentence_end (str, &ending_char); + + /* sentence_end doesn't treat '...' specially. 
*/ + cp = end - (ending_char == '.' ? 2 : 3); + if (cp >= str && memcmp (cp, "...", 3) == 0) + { + po_xerror (PO_SEVERITY_ERROR, mp, NULL, 0, 0, false, + _("ASCII ellipsis ('...') instead of Unicode")); + seen_errors++; + } + + str = end + 1; + } + + return seen_errors; +} + + +static int +syntax_check_space_ellipsis (const message_ty *mp, const char *msgid) +{ + const char *str = msgid; + const char *str_limit = str + strlen (msgid); + int seen_errors = 0; + + while (str < str_limit) + { + const char *end, *ellipsis = NULL; + ucs4_t ending_char; + + end = sentence_end (str, &ending_char); + + if (ending_char == 0x2026) + ellipsis = end; + else if (ending_char == '.') + { + /* sentence_end doesn't treat '...' specially. */ + const char *cp = end - 2; + if (cp >= str && memcmp (cp, "...", 3) == 0) + ellipsis = cp; + } + else + { + /* Look for a '...'. */ + const char *cp = end - 3; + if (cp >= str && memcmp (cp, "...", 3) == 0) + ellipsis = cp; + else + { + ucs4_t uc = 0xfffd; + + /* Look for a U+2026 that ends right before END; ELLIPSIS is still NULL here, so the decode length is measured from END. */ + for (cp = end - 1; cp >= str; cp--) + { + u8_mbtouc (&uc, (const unsigned char *) cp, end - cp); + if (uc != 0xfffd) + break; + } + + if (uc == 0x2026) + ellipsis = cp; + } + } + + if (ellipsis) + { + const char *cp; + ucs4_t uc = 0xfffd; + + /* Look at the character before ellipsis. 
*/ + for (cp = ellipsis - 1; cp >= str; cp--) + { + u8_mbtouc (&uc, (const unsigned char *) cp, ellipsis - cp); + if (uc != 0xfffd) + break; + } + + if (uc != 0xfffd && uc_is_space (uc)) + { + po_xerror (PO_SEVERITY_ERROR, mp, NULL, 0, 0, false, + _("\ +space before ellipsis found in user visible strings")); + seen_errors++; + } + } + + str = end + 1; + } + + return seen_errors; +} + + +struct callback_arg +{ + const message_ty *mp; + int seen_errors; +}; + +static void +syntax_check_quote_unicode_callback (char quote, const char *quoted, + size_t quoted_length, void *data) +{ + struct callback_arg *arg = data; + + switch (quote) + { + case '"': + po_xerror (PO_SEVERITY_ERROR, arg->mp, NULL, 0, 0, false, + _("ASCII double quote used instead of Unicode")); + arg->seen_errors++; + break; + + case '\'': + po_xerror (PO_SEVERITY_ERROR, arg->mp, NULL, 0, 0, false, + _("ASCII single quote used instead of Unicode")); + arg->seen_errors++; + break; + + default: + break; + } +} + +static int +syntax_check_quote_unicode (const message_ty *mp, const char *msgid) +{ + struct callback_arg arg; + + arg.mp = mp; + arg.seen_errors = 0; + + scan_quoted (msgid, strlen (msgid), + syntax_check_quote_unicode_callback, &arg); + + return arg.seen_errors; +} + + +typedef int (* syntax_check_function) (const message_ty *mp, const char *msgid); +static const syntax_check_function sc_funcs[NSYNTAXCHECKS] = +{ + syntax_check_ellipsis_unicode, + syntax_check_space_ellipsis, + syntax_check_quote_unicode +}; + +/* Perform all syntax checks on a non-obsolete message. + Return the number of errors that were seen. */ +static int +syntax_check_message (const message_ty *mp) +{ + int seen_errors = 0; + int i; + + for (i = 0; i < NSYNTAXCHECKS; i++) + { + if (mp->do_syntax_check[i] == yes) + { + seen_errors += sc_funcs[i] (mp, mp->msgid); + if (mp->msgid_plural) + seen_errors += sc_funcs[i] (mp, mp->msgid_plural); + } + } + + return seen_errors; +} + + +/* Perform all syntax checks on a message list. 
+ Return the number of errors that were seen. */ +int +syntax_check_message_list (message_list_ty *mlp) +{ + int seen_errors = 0; + size_t j; + + for (j = 0; j < mlp->nitems; j++) + { + message_ty *mp = mlp->item[j]; + + if (!is_header (mp)) + seen_errors += syntax_check_message (mp); + } + + return seen_errors; +} diff --git a/gettext-tools/src/msgl-check.h b/gettext-tools/src/msgl-check.h index f03300c..73fee69 100644 --- a/gettext-tools/src/msgl-check.h +++ b/gettext-tools/src/msgl-check.h @@ -60,6 +60,9 @@ extern int check_message_list (message_list_ty *mlp, int check_compatibility, int check_accelerators, char accelerator_char); +/* Perform all syntax checks on a message list. + Return the number of errors that were seen. */ +extern int syntax_check_message_list (message_list_ty *mlp); #ifdef __cplusplus } diff --git a/gettext-tools/src/msgmerge.c b/gettext-tools/src/msgmerge.c index 0415b2a..71d8962 100644 --- a/gettext-tools/src/msgmerge.c +++ b/gettext-tools/src/msgmerge.c @@ -1330,6 +1330,9 @@ message_merge (message_ty *def, message_ty *ref, bool force_fuzzy, result->do_wrap = ref->do_wrap; + for (i = 0; i < NSYNTAXCHECKS; i++) + result->do_syntax_check[i] = ref->do_syntax_check[i]; + /* Insert previous msgid, commented out with "#|". Do so only when --previous is specified, for backward compatibility. 
Since the "previous msgid" represents the original msgid that led to diff --git a/gettext-tools/src/read-catalog-abstract.c b/gettext-tools/src/read-catalog-abstract.c index d4e98ee..0817cd7 100644 --- a/gettext-tools/src/read-catalog-abstract.c +++ b/gettext-tools/src/read-catalog-abstract.c @@ -262,7 +262,8 @@ po_callback_comment_special (const char *s) void po_parse_comment_special (const char *s, bool *fuzzyp, enum is_format formatp[NFORMATS], - struct argument_range *rangep, enum is_wrap *wrapp) + struct argument_range *rangep, enum is_wrap *wrapp, + enum is_syntax_check scp[NSYNTAXCHECKS]) { size_t i; @@ -272,6 +273,8 @@ po_parse_comment_special (const char *s, rangep->min = -1; rangep->max = -1; *wrapp = undecided; + for (i = 0; i < NSYNTAXCHECKS; i++) + scp[i] = undecided; while (*s != '\0') { @@ -405,6 +408,36 @@ po_parse_comment_special (const char *s, continue; } + /* Accept syntax check description. */ + if (len >= 6 && memcmp (t + len - 6, "-check", 6) == 0) + { + const char *p; + size_t n; + enum is_syntax_check value; + + p = t; + n = len - 6; + + if (n >= 3 && memcmp (p, "no-", 3) == 0) + { + p += 3; + n -= 3; + value = no; + } + else + value = yes; + + for (i = 0; i < NSYNTAXCHECKS; i++) + if (strlen (syntax_check_name[i]) == n + && memcmp (syntax_check_name[i], p, n) == 0) + { + scp[i] = value; + break; + } + if (i < NSYNTAXCHECKS) + continue; + } + /* Unknown special comment marker. It may have been generated from a future xgettext version. Ignore it. 
*/ } diff --git a/gettext-tools/src/read-catalog-abstract.h b/gettext-tools/src/read-catalog-abstract.h index c3fc84f..367584b 100644 --- a/gettext-tools/src/read-catalog-abstract.h +++ b/gettext-tools/src/read-catalog-abstract.h @@ -184,7 +184,8 @@ extern void po_callback_comment_dispatcher (const char *s); extern void po_parse_comment_special (const char *s, bool *fuzzyp, enum is_format formatp[NFORMATS], struct argument_range *rangep, - enum is_wrap *wrapp); + enum is_wrap *wrapp, + enum is_syntax_check scp[NSYNTAXCHECKS]); #ifdef __cplusplus diff --git a/gettext-tools/src/read-catalog.c b/gettext-tools/src/read-catalog.c index 4642249..8c77df1 100644 --- a/gettext-tools/src/read-catalog.c +++ b/gettext-tools/src/read-catalog.c @@ -105,6 +105,8 @@ default_constructor (abstract_catalog_reader_ty *that) this->range.min = -1; this->range.max = -1; this->do_wrap = undecided; + for (i = 0; i < NSYNTAXCHECKS; i++) + this->do_syntax_check[i] = undecided; } @@ -172,6 +174,8 @@ default_copy_comment_state (default_catalog_reader_ty *this, message_ty *mp) mp->is_format[i] = this->is_format[i]; mp->range = this->range; mp->do_wrap = this->do_wrap; + for (i = 0; i < NSYNTAXCHECKS; i++) + mp->do_syntax_check[i] = this->do_syntax_check[i]; } @@ -205,6 +209,8 @@ default_reset_comment_state (default_catalog_reader_ty *this) this->range.min = -1; this->range.max = -1; this->do_wrap = undecided; + for (i = 0; i < NSYNTAXCHECKS; i++) + this->do_syntax_check[i] = undecided; } @@ -299,7 +305,7 @@ default_comment_special (abstract_catalog_reader_ty *that, const char *s) default_catalog_reader_ty *this = (default_catalog_reader_ty *) that; po_parse_comment_special (s, &this->is_fuzzy, this->is_format, &this->range, - &this->do_wrap); + &this->do_wrap, this->do_syntax_check); } diff --git a/gettext-tools/src/read-catalog.h b/gettext-tools/src/read-catalog.h index f567d78..74e0fd7 100644 --- a/gettext-tools/src/read-catalog.h +++ b/gettext-tools/src/read-catalog.h @@ -113,6 +113,7 @@ 
struct default_catalog_reader_class_ty enum is_format is_format[NFORMATS]; \ struct argument_range range; \ enum is_wrap do_wrap; \ + enum is_syntax_check do_syntax_check[NSYNTAXCHECKS]; \ typedef struct default_catalog_reader_ty default_catalog_reader_ty; struct default_catalog_reader_ty diff --git a/gettext-tools/src/sentence.c b/gettext-tools/src/sentence.c new file mode 100644 index 0000000..a5ae35e --- /dev/null +++ b/gettext-tools/src/sentence.c @@ -0,0 +1,194 @@ +/* Sentence handling. + Copyright (C) 2015 Free Software Foundation, Inc. + Written by Daiki Ueno <ueno@gnu.org>, 2015. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +/* Specification. */ +#include "sentence.h" + +#include <stdlib.h> +#include <string.h> +#include "unistr.h" + + +/* The minimal number of white spaces which should follow after the + end of sentence. 
*/ +int sentence_end_required_spaces = 1; + +/* This function works in a similar way to 'forward-sentence' in + Emacs, which basically does a regular expression matching of: + + [.?!\u2026] + []"'\u201d)}]* + \($\|[ \u00a0]$\|\t\|[ \u00a0]\{REQUIRED_SPACES\}\) + + Since we are lacking a regular expression routine capable of + Unicode (though gnulib-lib/lib/regex.c provides locale-dependent + version, we would rather avoid depending on wchar_t), apply a + manually constructed DFA, which consists of 8 states where 4 of + them are a terminal. */ +const char * +sentence_end (const char *string, ucs4_t *ending_charp) +{ + const char *str = string; + const char *str_limit = string + strlen (str); + /* States of the DFA, 0 to 7, where 3, 5, 6, and 7 are a terminal. */ + int state = 0; + /* Previous character before an end marker. */ + ucs4_t ending_char = 0xfffd; + /* Possible starting position of the match, and the next starting + position if the current match fails. */ + const char *match_start, *match_next; + /* Number of spaces. */ + int spaces; + + while (str <= str_limit) + { + ucs4_t uc; + size_t length; + + length = u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); + + if (state == 0) + { + switch (uc) + { + case '.': case '?': case '!': case 0x2026: + state = 1; + match_start = str; + match_next = str + length; + ending_char = uc; + spaces = 0; + break; + + default: + break; + } + + str += length; + continue; + } + + if (state == 1) + { + switch (uc) + { + case ']': case '"': case '\'': case ')': case '}': case 0x201d: + state = 2; + break; + + case '\0': case '\n': + /* State 3. */ + *ending_charp = ending_char; + return match_start; + + case ' ': case 0x00a0: + if (++spaces == sentence_end_required_spaces) + { + /* State 7. */ + *ending_charp = ending_char; + return match_start; + } + state = 4; + break; + + case '\t': + /* State 5. 
*/ + *ending_charp = ending_char; + return match_start; + + default: + str = match_next; + state = 0; + continue; + } + + str += length; + continue; + } + + if (state == 2) + { + switch (uc) + { + case ']': case '"': case '\'': case ')': case '}': case 0x201d: + break; + + case '\0': case '\n': + /* State 3. */ + *ending_charp = ending_char; + return match_start; + + case ' ': case 0x00a0: + if (++spaces == sentence_end_required_spaces) + { + /* State 7. */ + *ending_charp = ending_char; + return match_start; + } + state = 4; + break; + + case '\t': + /* State 5. */ + *ending_charp = ending_char; + return match_start; + + default: + state = 0; + str = match_next; + continue; + } + + str += length; + continue; + } + + if (state == 4) + { + switch (uc) + { + case '\0': case '\n': + /* State 6. */ + *ending_charp = ending_char; + return match_start; + + case ' ': case 0x00a0: + if (++spaces == sentence_end_required_spaces) + { + /* State 7. */ + *ending_charp = ending_char; + return match_start; + } + break; + + default: + state = 0; + str = match_next; + continue; + } + + str += length; + continue; + } + } + + *ending_charp = 0xfffd; + return str_limit; +} diff --git a/gettext-tools/src/sentence.h b/gettext-tools/src/sentence.h new file mode 100644 index 0000000..02fdc16 --- /dev/null +++ b/gettext-tools/src/sentence.h @@ -0,0 +1,42 @@ +/* Sentence handling. + Copyright (C) 2015 Free Software Foundation, Inc. + Written by Daiki Ueno <ueno@gnu.org>, 2015. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef _SENTENCE_H +#define _SENTENCE_H + +#include "unitypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* The minimal number of white spaces which should follow after the + end of sentence. */ +extern DLL_VARIABLE int sentence_end_required_spaces; + +/* Locate the position of a sentence end marker (a period, a question + mark, etc) in the null-terminated string STRING. If no sentence end + marker is found, return a pointer to the null byte at the end of + STRING. ENDING_CHARP receives the end marker character, or U+FFFD + when no marker is found. */ +extern const char *sentence_end (const char *string, ucs4_t *ending_charp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SENTENCE_H */ diff --git a/gettext-tools/src/xgettext.c b/gettext-tools/src/xgettext.c index f9156eb..310b349 100644 --- a/gettext-tools/src/xgettext.c +++ b/gettext-tools/src/xgettext.c @@ -58,6 +58,8 @@ #include "po-charset.h" #include "msgl-iconv.h" #include "msgl-ascii.h" +#include "msgl-check.h" +#include "po-xerror.h" #include "po-time.h" #include "write-catalog.h" #include "write-po.h" @@ -66,6 +68,7 @@ #include "color.h" #include "format.h" #include "propername.h" +#include "sentence.h" #include "unistr.h" #include "gettext.h" @@ -179,6 +182,9 @@ static bool recognize_format_kde; /* If true, recognize Boost format strings. */ static bool recognize_format_boost; +/* Syntax checks enabled by default. */ +static enum is_syntax_check default_syntax_check[NSYNTAXCHECKS]; + /* Canonicalized encoding name for all input files. 
*/ const char *xgettext_global_source_encoding; @@ -204,6 +210,7 @@ static const struct option long_options[] = { "add-location", optional_argument, NULL, 'n' }, { "boost", no_argument, NULL, CHAR_MAX + 11 }, { "c++", no_argument, NULL, 'C' }, + { "check", required_argument, NULL, CHAR_MAX + 17 }, { "color", optional_argument, NULL, CHAR_MAX + 14 }, { "copyright-holder", required_argument, NULL, CHAR_MAX + 1 }, { "debug", no_argument, &do_debug, 1 }, @@ -236,6 +243,7 @@ static const struct option long_options[] = { "package-version", required_argument, NULL, CHAR_MAX + 13 }, { "properties-output", no_argument, NULL, CHAR_MAX + 6 }, { "qt", no_argument, NULL, CHAR_MAX + 9 }, + { "sentence-end", required_argument, NULL, CHAR_MAX + 18 }, { "sort-by-file", no_argument, NULL, 'F' }, { "sort-output", no_argument, NULL, 's' }, { "strict", no_argument, NULL, 'S' }, @@ -346,7 +354,7 @@ main (int argc, char *argv[]) init_flag_table_vala (); while ((optchar = getopt_long (argc, argv, - "ac::Cd:D:eEf:Fhijk::l:L:m::M::no:p:sTVw:x:", + "ac::Cd:D:eEf:Fhijk::l:L:m::M::no:p:sTVw:W:x:", long_options, NULL)) != EOF) switch (optchar) { @@ -602,6 +610,26 @@ main (int argc, char *argv[]) message_print_style_filepos (filepos_comment_none); break; + case CHAR_MAX + 17: /* --check */ + if (strcmp (optarg, "ellipsis-unicode") == 0) + default_syntax_check[sc_ellipsis_unicode] = yes; + else if (strcmp (optarg, "space-ellipsis") == 0) + default_syntax_check[sc_space_ellipsis] = yes; + else if (strcmp (optarg, "quote-unicode") == 0) + default_syntax_check[sc_quote_unicode] = yes; + else + error (EXIT_FAILURE, 0, _("syntax check '%s' unknown"), optarg); + break; + + case CHAR_MAX + 18: /* --sentence-end */ + if (strcmp (optarg, "single-space") == 0) + sentence_end_required_spaces = 1; + else if (strcmp (optarg, "double-space") == 0) + sentence_end_required_spaces = 2; + else + error (EXIT_FAILURE, 0, _("sentence end type '%s' unknown"), optarg); + break; + default: usage (EXIT_FAILURE); /* 
NOTREACHED */ @@ -836,6 +864,24 @@ warning: file '%s' extension '%s' is unknown; will try C"), filename, extension) else if (sort_by_msgid) msgdomain_list_sort_by_msgid (mdlp); + /* Check syntax of messages, summing the errors over all domains. */ + { + int nerrors = 0; + + for (i = 0; i < mdlp->nitems; i++) + { + message_list_ty *mlp = mdlp->item[i]->messages; + nerrors += syntax_check_message_list (mlp); + } + + /* Exit with status 1 on any error. */ + if (nerrors > 0) + error (EXIT_FAILURE, 0, + ngettext ("found %d fatal error", "found %d fatal errors", + nerrors), + nerrors); + } + /* Write the PO file. */ msgdomain_list_print (mdlp, file_name, output_syntax, force_po, do_debug); @@ -921,6 +967,14 @@ Operation mode:\n")); preceding keyword lines in output file\n\ -c, --add-comments place all comment blocks preceding keyword lines\n\ in output file\n")); + printf (_("\ + --check=NAME perform syntax check on messages\n\ + (ellipsis-unicode, space-ellipsis,\n\ + quote-unicode)\n")); + printf (_("\ + --sentence-end=TYPE type describing the end of sentence\n\ + (single-space, which is the default, \n\ + or double-space)\n")); printf ("\n"); printf (_("\ Language specific options:\n")); @@ -1644,8 +1698,8 @@ xgettext_record_flag (const char *optionstring) flag += 5; } - /* Unlike po_parse_comment_special(), we don't accept "fuzzy" or "wrap" - here - it has no sense. */ + /* Unlike po_parse_comment_special(), we don't accept "fuzzy", + "wrap", or "check" here - it has no sense. 
*/ if (strlen (flag) >= 7 && memcmp (flag + strlen (flag) - 7, "-format", 7) == 0) { @@ -2238,6 +2292,7 @@ remember_a_message (message_list_ty *mlp, char *msgctxt, char *msgid, enum is_format is_format[NFORMATS]; struct argument_range range; enum is_wrap do_wrap; + enum is_syntax_check do_syntax_check[NSYNTAXCHECKS]; message_ty *mp; char *msgstr; size_t i; @@ -2264,6 +2319,8 @@ remember_a_message (message_list_ty *mlp, char *msgctxt, char *msgid, range.min = -1; range.max = -1; do_wrap = undecided; + for (i = 0; i < NSYNTAXCHECKS; i++) + do_syntax_check[i] = undecided; if (msgctxt != NULL) CONVERT_STRING (msgctxt, lc_string); @@ -2297,6 +2354,8 @@ meta information, not the empty string.\n"))); for (i = 0; i < NFORMATS; i++) is_format[i] = mp->is_format[i]; do_wrap = mp->do_wrap; + for (i = 0; i < NSYNTAXCHECKS; i++) + do_syntax_check[i] = mp->do_syntax_check[i]; } else { @@ -2376,12 +2435,13 @@ meta information, not the empty string.\n"))); enum is_format tmp_format[NFORMATS]; struct argument_range tmp_range; enum is_wrap tmp_wrap; + enum is_syntax_check tmp_syntax_check[NSYNTAXCHECKS]; bool interesting; t += strlen ("xgettext:"); po_parse_comment_special (t, &tmp_fuzzy, tmp_format, &tmp_range, - &tmp_wrap); + &tmp_wrap, tmp_syntax_check); interesting = false; for (i = 0; i < NFORMATS; i++) @@ -2400,6 +2460,12 @@ meta information, not the empty string.\n"))); do_wrap = tmp_wrap; interesting = true; } + for (i = 0; i < NSYNTAXCHECKS; i++) + if (tmp_syntax_check[i] != undecided) + { + do_syntax_check[i] = tmp_syntax_check[i]; + interesting = true; + } /* If the "xgettext:" marker was followed by an interesting keyword, and we updated our is_format/do_wrap variables, @@ -2525,6 +2591,14 @@ meta information, not the empty string.\n"))); mp->do_wrap = do_wrap == no ? no : yes; /* By default we wrap. */ + for (i = 0; i < NSYNTAXCHECKS; i++) + { + if (do_syntax_check[i] == undecided) + do_syntax_check[i] = default_syntax_check[i] == yes ? 
yes : no; + + mp->do_syntax_check[i] = do_syntax_check[i]; + } + /* Warn about the use of non-reorderable format strings when the programming language also provides reorderable format strings. */ warn_format_string (is_format, mp->msgid, pos, "msgid"); diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog index 380b937..1ba2935 100644 --- a/gettext-tools/tests/ChangeLog +++ b/gettext-tools/tests/ChangeLog @@ -1,3 +1,13 @@ +2015-03-03 Daiki Ueno <ueno@gnu.org> + + * xgettext-14: New file. + * sentence.c: New file + * Makefile.am (TESTS): Add new tests. + (noinst_PROGRAMS): Add 'sentence'. + (sentence_SOURCES): New variable. + (sentence_CPPFLAGS): New variable. + (sentence_LDADD): New variable. + 2015-02-06 Daiki Ueno <ueno@gnu.org> tests: Fix "broken pipe" error in msgfilter-7 diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index 32bc192..ea8bfa9 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -72,7 +72,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \ recode-sr-latin-1 recode-sr-latin-2 \ xgettext-2 xgettext-3 xgettext-4 xgettext-5 xgettext-6 \ xgettext-7 xgettext-8 xgettext-9 xgettext-10 xgettext-11 xgettext-12 \ - xgettext-13 \ + xgettext-13 xgettext-14 \ xgettext-awk-1 xgettext-awk-2 \ xgettext-c-2 xgettext-c-3 xgettext-c-4 xgettext-c-5 \ xgettext-c-6 xgettext-c-7 xgettext-c-8 xgettext-c-9 xgettext-c-10 \ @@ -137,7 +137,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \ format-lua-1 format-lua-2 \ format-javascript-1 format-javascript-2 \ plural-1 plural-2 \ - gettextpo-1 \ + gettextpo-1 sentence \ lang-c lang-c++ lang-objc lang-sh lang-bash lang-python-1 \ lang-python-2 lang-clisp lang-elisp lang-librep lang-guile \ lang-smalltalk lang-java lang-csharp lang-gawk lang-pascal \ @@ -211,7 +211,7 @@ DEFS = -DLOCALEDIR=\"$(localedir)\" @DEFS@ LDADD = $(LDADD_@USE_INCLUDED_LIBINTL@) @INTL_MACOSX_LIBS@ LDADD_yes 
= ../intl/libintl.la @LTLIBTHREAD@ LDADD_no = ../intl/libgnuintl.la @LTLIBTHREAD@ @LTLIBINTL@ -noinst_PROGRAMS = tstgettext tstngettext testlocale gettext-3-prg gettext-4-prg gettext-5-prg gettext-6-prg gettext-7-prg gettext-8-prg cake fc3 fc4 fc5 gettextpo-1-prg +noinst_PROGRAMS = tstgettext tstngettext testlocale gettext-3-prg gettext-4-prg gettext-5-prg gettext-6-prg gettext-7-prg gettext-8-prg cake fc3 fc4 fc5 gettextpo-1-prg sentence tstgettext_SOURCES = tstgettext.c setlocale.c tstgettext_CFLAGS = -DINSTALLDIR=\".\" tstgettext_LDADD = ../gnulib-lib/libgettextlib.la $(LDADD) @@ -255,6 +255,9 @@ gettextpo_1_prg_CPPFLAGS = \ # Don't add more libraries here. This test must check whether libgettextpo is # self contained. gettextpo_1_prg_LDADD = ../libgettextpo/libgettextpo.la $(LDADD) +sentence_SOURCES = sentence.c +sentence_CPPFLAGS = -I../src -I../gnulib-lib +sentence_LDADD = ../src/libgettextsrc.la $(LDADD) # Clean up after Solaris cc. clean-local: diff --git a/gettext-tools/tests/sentence.c b/gettext-tools/tests/sentence.c new file mode 100644 index 0000000..839f6c8 --- /dev/null +++ b/gettext-tools/tests/sentence.c @@ -0,0 +1,85 @@ +/* Test of sentence handling. + Copyright (C) 2015 Free Software Foundation, Inc. + Written by Daiki Ueno <ueno@gnu.org>, 2015. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "sentence.h" + +#include <assert.h> +#include <string.h> + +#define PRIMARY "This is a primary sentence" +#define SECONDARY "This is a secondary sentence" + +#define SIZEOF(x) (sizeof (x) / sizeof ((x)[0])) /* element count of array X; argument parenthesized so any array expression expands safely */ + +struct data +{ + int required_spaces; /* stored into sentence_end_required_spaces for the call */ + const char *input; /* string handed to sentence_end */ + + const char *expected_prefix; /* sentence_end must return input + strlen (expected_prefix) */ + ucs4_t expected_ending_char; /* value sentence_end must store through its second argument */ +}; + +const struct data data[] = + { + { 1, PRIMARY, PRIMARY, 0xfffd }, + { 1, PRIMARY ".", PRIMARY, '.' }, + { 1, PRIMARY ".x", PRIMARY ".x", 0xfffd }, + { 2, PRIMARY ". " SECONDARY, PRIMARY, '.' }, + { 1, PRIMARY ". " SECONDARY, PRIMARY, '.' }, + { 1, PRIMARY ".' " SECONDARY, PRIMARY, '.' }, + { 3, PRIMARY ". " SECONDARY, PRIMARY ". " SECONDARY, 0xfffd }, + { 2, PRIMARY ".' " SECONDARY, PRIMARY, '.' }, + { 2, PRIMARY ".'x " SECONDARY, PRIMARY ".'x " SECONDARY, 0xfffd }, + { 2, PRIMARY ".''x " SECONDARY, PRIMARY ".''x " SECONDARY, 0xfffd }, + { 2, PRIMARY ".\n" SECONDARY, PRIMARY, '.' }, + { 2, PRIMARY ". \n" SECONDARY, PRIMARY, '.' }, + { 2, PRIMARY ".\xc2\xa0\n" SECONDARY, PRIMARY, '.' }, + { 2, PRIMARY ".\t" SECONDARY, PRIMARY, '.' }, + { 2, PRIMARY ".'\t" SECONDARY, PRIMARY, '.' }, + { 2, PRIMARY ".'\n" SECONDARY, PRIMARY, '.'
} + }; + +static void +check_sentence_end (const struct data *d) +{ + int saved_required_spaces = sentence_end_required_spaces; /* put back after the call below */ + const char *result; + ucs4_t ending_char; + + sentence_end_required_spaces = d->required_spaces; + result = sentence_end (d->input, &ending_char); + sentence_end_required_spaces = saved_required_spaces; + + assert (result == d->input + strlen (d->expected_prefix)); /* returned pointer must sit right after the expected prefix */ + assert (ending_char == d->expected_ending_char); +} + +int +main (int argc, char **argv) +{ + size_t i; /* unsigned, like the sizeof-based value of SIZEOF (data) it is compared with */ + + for (i = 0; i < SIZEOF (data); i++) + check_sentence_end (&data[i]); + + return 0; +} diff --git a/gettext-tools/tests/xgettext-14 b/gettext-tools/tests/xgettext-14 new file mode 100755 index 0000000..b769b2f --- /dev/null +++ b/gettext-tools/tests/xgettext-14 @@ -0,0 +1,98 @@ +#!/bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test for --check option. + +# --check=ellipsis-unicode +cat <<\EOF > xg-ellipsis-u.c +gettext ("This is a sentence..."); + +ngettext ("This is a sentence", "These are sentences...", 2); + +/* xgettext: no-ellipsis-unicode-check */ +gettext ("This is another sentence..."); + +gettext ("This is a multi-sentence example. This is the first sentence. " + "This is the second..., no it's not, this is the second sentence...\n" + "This is the third sentence...?
Perhaps.\n"); +EOF + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --omit-header --add-comments --check=ellipsis-unicode -d xg-ellipsis-u.tmp xg-ellipsis-u.c 2>xg-ellipsis-u.err + +test `grep -c 'ASCII ellipsis' xg-ellipsis-u.err` = 4 || exit 1 + +LANGUAGE= LC_ALL=C ${XGETTEXT} --omit-header --add-comments --check=ellipsis-unicode --sentence-end=double-space -d xg-ellipsis-ud.tmp xg-ellipsis-u.c 2>xg-ellipsis-ud.err + +test `grep -c 'ASCII ellipsis' xg-ellipsis-ud.err` = 3 || exit 1 + +# --check=space-ellipsis +cat <<\EOF > xg-space-e.c +gettext ("This is a sentence ..."); + +/* xgettext: no-space-ellipsis-check, no-ellipsis-unicode-check */ +gettext ("This is another sentence ..."); + +gettext ("This is a multi-sentence example. This is the first sentence. " + "This is the second..., no it's not, this is the second sentence ...\n" + "This is the third sentence \u2026? Perhaps.\n"); +EOF + +LANGUAGE= LC_ALL=C ${XGETTEXT} --omit-header --add-comments --check=space-ellipsis -d xg-space-e.tmp xg-space-e.c 2>xg-space-e.err + +test `grep -c 'space before ellipsis' xg-space-e.err` = 3 || exit 1 + +# --check=quote-unicode +cat <<\EOF > xg-quote-u.c +gettext ("\"double quoted\""); + +/* xgettext: no-quote-unicode-check */ +gettext ("\"double quoted but ignored\""); + +gettext ("double quoted but empty \"\""); + +gettext ("\"\" double quoted but empty"); + +gettext ("\"foo\" \"bar\" \"baz\""); + +gettext ("'single quoted'"); + +/* xgettext: no-quote-unicode-check */ +gettext ("'single quoted but ignored'"); + +gettext ("'foo' 'bar' 'baz'"); + +gettext ("prefix'single quoted without surrounding spaces'suffix"); + +gettext ("prefix 'single quoted with surrounding spaces' suffix"); + +gettext ("single quoted with apostrophe, empty '' "); + +gettext ("'single quoted at the beginning of string' "); + +gettext (" 'single quoted at the end of string'"); + +gettext ("line 1\n" +"'single quoted at the beginning of line' \n" +"line 3"); + +gettext ("line 1\n" +" 'single 
quoted at the end of line'\n" +"line 3"); + +gettext ("`single quoted with grave'"); + +/* xgettext: no-quote-unicode-check */ +gettext ("`single quoted with grave but ignored'"); + +gettext ("single quoted with grave, empty `'"); + +gettext ("`' single quoted with grave, empty"); + +gettext ("`double grave`"); +EOF + +LANGUAGE= LC_ALL=C ${XGETTEXT} --omit-header --add-comments --check=quote-unicode -d xg-quote-u.tmp xg-quote-u.c 2>xg-quote-u.err + +test `grep -c 'ASCII double quote' xg-quote-u.err` = 4 || exit 1 +test `grep -c 'ASCII single quote' xg-quote-u.err` = 12 || exit 1 |