diff options
author | Daiki Ueno <ueno@gnu.org> | 2013-11-20 12:41:20 +0900 |
---|---|---|
committer | Daiki Ueno <ueno@gnu.org> | 2013-12-04 19:53:11 +0900 |
commit | 992d6a594fa5ca3e914bc3f3f18431caee657db2 (patch) | |
tree | 4b045e79e21a275b0f7bcc5c6eb4e4669b1f7cc8 | |
parent | 4142e63b88032c25572fd50b6518dadcfeee2c15 (diff) | |
download | external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.zip external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.tar.gz external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.tar.bz2 |
xgettext: Add E4X support to JavaScript scanner
Reported by Piotr Drąg at: <https://savannah.gnu.org/bugs/?40125>.
* src/xgettext.h (enum lexical_context_ty): New enumeration items
lc_xml_open_tag, lc_xml_close_tag, lc_xml_content.
* src/x-javascript.c (phase5_scan_xml_markup): New
function.
(phase5_get): Handle '<', '>', '/', '=', '{', and '}' specially
to support E4X.
(enum token_type_ty): New enumeration item token_type_equal.
(xml_element_depth): New variable.
(inside_embedded_js_in_xml): New variable.
(extract_javascript): Initialize those variables.
* tests/Makefile.am (TESTS): Add xgettext-javascript-6.
* tests/xgettext-javascript-6: New file.
-rw-r--r-- | gettext-tools/src/ChangeLog | 14 | ||||
-rw-r--r-- | gettext-tools/src/x-javascript.c | 206 | ||||
-rw-r--r-- | gettext-tools/src/xgettext.h | 7 | ||||
-rw-r--r-- | gettext-tools/tests/ChangeLog | 6 | ||||
-rw-r--r-- | gettext-tools/tests/Makefile.am | 2 | ||||
-rw-r--r-- | gettext-tools/tests/xgettext-javascript-6 | 75 |
6 files changed, 307 insertions, 3 deletions
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index 97c3c1f..64ea86d 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,17 @@ +2013-11-20 Daiki Ueno <ueno@gnu.org> + + xgettext: Add E4X support to JavaScript scanner + Reported by Piotr Drąg at: <https://savannah.gnu.org/bugs/?40125>. + * xgettext.h (enum lexical_context_ty): New enumeration items + lc_xml_open_tag, lc_xml_close_tag, lc_xml_content. + * x-javascript.c (phase5_scan_xml_markup): New function. + (phase5_get): Handle '<', '>', '/', '=', '{', and '}' specially + to support E4X. + (enum token_type_ty): New enumeration item token_type_equal. + (xml_element_depth): New variable. + (inside_embedded_in_xml): New variable. + (extract_javascript): Initialize those variables. + 2013-11-14 Daiki Ueno <ueno@gnu.org> * x-javascript.c (phase3_getc): Make sure to call comment_line_end diff --git a/gettext-tools/src/x-javascript.c b/gettext-tools/src/x-javascript.c index c1503ce..59bbcbe 100644 --- a/gettext-tools/src/x-javascript.c +++ b/gettext-tools/src/x-javascript.c @@ -926,6 +926,7 @@ enum token_type_ty token_type_plus, /* + */ token_type_regexp, /* /.../ */ token_type_operator, /* - * / % . < > = ~ ! | & ? : ^ */ + token_type_equal, /* = */ token_type_string, /* "abc", 'abc' */ token_type_keyword, /* return, else */ token_type_symbol, /* symbol, number */ @@ -1160,6 +1161,114 @@ phase5_scan_regexp () phase2_ungetc (c); } +static int xml_element_depth = 0; +static bool inside_embedded_js_in_xml = false; + +static bool +phase5_scan_xml_markup (token_ty *tp) +{ + struct + { + const char *start; + const char *end; + } markers[] = + { + { "!--", "--" }, + { "![CDATA[", "]]" }, + { "?", "?" } + }; + int i; + + for (i = 0; i < SIZEOF (markers); i++) + { + const char *start = markers[i].start; + const char *end = markers[i].end; + int j; + + /* Look for a start marker. 
*/ + for (j = 0; start[j] != '\0'; j++) + { + int c; + + assert (phase2_pushback_length + j < SIZEOF (phase2_pushback)); + c = phase2_getc (); + if (c == UEOF) + goto eof; + if (c != start[j]) + { + int k = j; + + phase2_ungetc (c); + k--; + + for (; k >= 0; k--) + phase2_ungetc (start[k]); + break; + } + } + + if (start[j] != '\0') + continue; + + /* Skip until the end marker. */ + for (;;) + { + int c; + + for (j = 0; end[j] != '\0'; j++) + { + assert (phase2_pushback_length + 1 < SIZEOF (phase2_pushback)); + c = phase2_getc (); + if (c == UEOF) + goto eof; + if (c != end[j]) + { + /* Don't push the first character back so the next + iteration start from the second character. */ + if (j > 0) + { + int k = j; + + phase2_ungetc (c); + k--; + + for (; k > 0; k--) + phase2_ungetc (end[k]); + } + break; + } + } + + if (end[j] != '\0') + continue; + + c = phase2_getc (); + if (c == UEOF) + goto eof; + if (c != '>') + { + error_with_progname = false; + error (0, 0, + _("%s:%d: warning: %s is not allowed"), + logical_file_name, line_number, + end); + error_with_progname = true; + return false; + } + return true; + } + } + return false; + + eof: + error_with_progname = false; + error (0, 0, + _("%s:%d: warning: unterminated XML markup"), + logical_file_name, line_number); + error_with_progname = true; + return false; +} + static void phase5_get (token_ty *tp) { @@ -1314,13 +1423,93 @@ phase5_get (token_ty *tp) /* Identify operators. The multiple character ones are simply ignored * as they are recognized here and are otherwise not relevant. */ case '-': case '*': /* '+' and '/' are not listed here! 
*/ - case '%': case '<': case '>': case '=': + case '%': case '~': case '!': case '|': case '&': case '^': case '?': case ':': tp->type = last_token_type = token_type_operator; return; + case '=': + tp->type = last_token_type = token_type_equal; + return; + + case '<': + { + /* We assume: + - XMLMarkup and XMLElement are only allowed after '=' or '(' + - embedded JavaScript expressions in XML do not recurse + */ + if (xml_element_depth > 0 + || (!inside_embedded_js_in_xml + && (last_token_type == token_type_equal + || last_token_type == token_type_lparen))) + { + /* Comments, PI, or CDATA. */ + if (phase5_scan_xml_markup (tp)) + return; + c = phase2_getc (); + + /* Closing tag. */ + if (c == '/') + lexical_context = lc_xml_close_tag; + + /* Opening element. */ + else + { + phase2_ungetc (c); + lexical_context = lc_xml_open_tag; + xml_element_depth++; + } + + tp->type = last_token_type = token_type_other; + } + else + tp->type = last_token_type = token_type_operator; + } + return; + + case '>': + if (xml_element_depth > 0 && !inside_embedded_js_in_xml) + { + switch (lexical_context) + { + case lc_xml_open_tag: + lexical_context = lc_xml_content; + break; + + case lc_xml_close_tag: + if (xml_element_depth-- > 0) + lexical_context = lc_xml_content; + else + lexical_context = lc_outside; + break; + + default: + break; + } + tp->type = last_token_type = token_type_other; + } + else + tp->type = last_token_type = token_type_operator; + return; + case '/': + if (xml_element_depth > 0 && !inside_embedded_js_in_xml) + { + /* If it appears in an opening tag of an XML element, it's + part of '/>'. */ + if (lexical_context == lc_xml_open_tag) + { + c = phase2_getc (); + if (c == '>') + lexical_context = lc_outside; + else + phase2_ungetc (c); + } + tp->type = last_token_type = token_type_other; + return; + } + /* Either a division operator or the start of a regular expression literal. 
If the '/' token is spotted after a symbol it's a division, otherwise it's a regular @@ -1336,6 +1525,18 @@ phase5_get (token_ty *tp) } return; + case '{': + if (xml_element_depth > 0 && !inside_embedded_js_in_xml) + inside_embedded_js_in_xml = true; + tp->type = last_token_type = token_type_other; + return; + + case '}': + if (xml_element_depth > 0 && inside_embedded_js_in_xml) + inside_embedded_js_in_xml = false; + tp->type = last_token_type = token_type_other; + return; + case '(': tp->type = last_token_type = token_type_lparen; return; @@ -1598,6 +1799,7 @@ extract_balanced (message_list_ty *mlp, case token_type_plus: case token_type_regexp: case token_type_operator: + case token_type_equal: case token_type_other: next_context_iter = null_context_list_iterator; state = 0; @@ -1628,6 +1830,8 @@ extract_javascript (FILE *f, last_comment_line = -1; last_non_comment_line = -1; + xml_element_depth = 0; + xgettext_current_file_source_encoding = xgettext_global_source_encoding; #if HAVE_ICONV xgettext_current_file_source_iconv = xgettext_global_source_iconv; diff --git a/gettext-tools/src/xgettext.h b/gettext-tools/src/xgettext.h index 16540fe..2f8a084 100644 --- a/gettext-tools/src/xgettext.h +++ b/gettext-tools/src/xgettext.h @@ -144,7 +144,12 @@ typedef enum { lc_outside, /* Initial context: outside of comments and strings. */ lc_comment, /* Inside a comment. */ - lc_string /* Inside a string literal. */ + lc_string, /* Inside a string literal. */ + + /* For embedded XML in programming code, like E4X in JavaScript. */ + lc_xml_open_tag, /* Inside an opening tag of an XML element. */ + lc_xml_close_tag, /* Inside a closing tag of an XML element. */ + lc_xml_content /* Inside an XML text node. 
*/ } lexical_context_ty; diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog index 33efe8d..42c680b 100644 --- a/gettext-tools/tests/ChangeLog +++ b/gettext-tools/tests/ChangeLog @@ -1,3 +1,9 @@ +2013-11-20 Daiki Ueno <ueno@gnu.org> + + xgettext: Add E4X support to JavaScript scanner + * Makefile.am (TESTS): Add xgettext-javascript-6. + * xgettext-javascript-6: New file. + 2013-11-14 Daiki Ueno <ueno@gnu.org> * xgettext-javascript-1: Add a test to extract translator comments. diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index e1e0df2..d48aa7b 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -102,7 +102,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \ xgettext-ycp-1 xgettext-ycp-2 xgettext-ycp-3 xgettext-ycp-4 \ xgettext-lua-1 xgettext-lua-2 \ xgettext-javascript-1 xgettext-javascript-2 xgettext-javascript-3 \ - xgettext-javascript-4 xgettext-javascript-5 \ + xgettext-javascript-4 xgettext-javascript-5 xgettext-javascript-6 \ xgettext-vala-1 \ xgettext-gsettings-1 \ format-awk-1 format-awk-2 \ diff --git a/gettext-tools/tests/xgettext-javascript-6 b/gettext-tools/tests/xgettext-javascript-6 new file mode 100644 index 0000000..a891ebe --- /dev/null +++ b/gettext-tools/tests/xgettext-javascript-6 @@ -0,0 +1,75 @@ +#!/bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test of JavaScript E4X support. 
+ +tmpfiles="" +trap 'rm -fr $tmpfiles' 1 2 3 15 + +tmpfiles="$tmpfiles xg-js-6.js" +cat <<\EOF > xg-js-6.js +var x1 = <x1></x1>; +var s1 = _("Expected translation string #1"); +var s2 = "foo"; +var x2 = <{s2}>foo {s2} bar</{s2}>; +var x3 = <x3 a1="/"><x4>{_("Expected translation string #2")}</x4></x3>; +var x4 = <x5 a2='/'><x{_("Expected translation string #3")}> +</x{_("Expected translation string #3")}></x5>; +var x4 = <![CDATA[ + _("Unexpected translation string #1") +]]>; +var x5 = <!-- - _("Unexpected translation string #2") - -->; +var s6 = _("Expected translation string #4"); +var x6 = <? _("Unexpected translation string #3") ?>; +var x7 = <!--- this is a comment --> <foo> +</foo>; +EOF + +tmpfiles="$tmpfiles xg-js-6.err xg-js-6.tmp xg-js-6.pot" +: ${XGETTEXT=xgettext} +${XGETTEXT} --add-comments --no-location -o xg-js-6.tmp xg-js-6.js 2>xg-js-6.err +test $? = 0 || { cat xg-js-6.err; rm -fr $tmpfiles; exit 1; } +# Don't simplify this to "grep ... < xg-js-6.tmp", otherwise OpenBSD 4.0 grep +# only outputs "Binary file (standard input) matches". +cat xg-js-6.tmp | grep -v 'POT-Creation-Date' | LC_ALL=C tr -d '\r' > xg-js-6.pot + +tmpfiles="$tmpfiles xg-js-6.ok" +cat <<\EOF > xg-js-6.ok +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"Report-Msgid-Bugs-To: \n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" +"Language-Team: LANGUAGE <LL@li.org>\n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=CHARSET\n" +"Content-Transfer-Encoding: 8bit\n" + +msgid "Expected translation string #1" +msgstr "" + +msgid "Expected translation string #2" +msgstr "" + +msgid "Expected translation string #3" +msgstr "" + +msgid "Expected translation string #4" +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} xg-js-6.ok xg-js-6.pot +result=$? + +rm -fr $tmpfiles + +exit $result |