summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaiki Ueno <ueno@gnu.org>2013-11-20 12:41:20 +0900
committerDaiki Ueno <ueno@gnu.org>2013-12-04 19:53:11 +0900
commit992d6a594fa5ca3e914bc3f3f18431caee657db2 (patch)
tree4b045e79e21a275b0f7bcc5c6eb4e4669b1f7cc8
parent4142e63b88032c25572fd50b6518dadcfeee2c15 (diff)
downloadexternal_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.zip
external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.tar.gz
external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.tar.bz2
xgettext: Add E4X support to JavaScript scanner
Reported by Piotr Drąg at: <https://savannah.gnu.org/bugs/?40125>. * src/xgettext.h (enum lexical_context_ty): New enumeration items lc_xml_open_tag, lc_xml_close_tag, lc_xml_content. * src/x-javascript.c (phase5_scan_xml_markup): New function. (phase5_get): Handle '<', '>', '/', '=', '{', and '}' specially to support E4X. (enum token_type_ty): New enumeration item token_type_equal. (xml_element_depth): New variable. (inside_embedded_js_in_xml): New variable. (extract_javascript): Initialize those variables. * tests/Makefile.am (TESTS): Add xgettext-javascript-6. * tests/xgettext-javascript-6: New file.
-rw-r--r--gettext-tools/src/ChangeLog14
-rw-r--r--gettext-tools/src/x-javascript.c206
-rw-r--r--gettext-tools/src/xgettext.h7
-rw-r--r--gettext-tools/tests/ChangeLog6
-rw-r--r--gettext-tools/tests/Makefile.am2
-rw-r--r--gettext-tools/tests/xgettext-javascript-675
6 files changed, 307 insertions, 3 deletions
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index 97c3c1f..64ea86d 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,17 @@
+2013-11-20 Daiki Ueno <ueno@gnu.org>
+
+ xgettext: Add E4X support to JavaScript scanner
+ Reported by Piotr Drąg at: <https://savannah.gnu.org/bugs/?40125>.
+ * xgettext.h (enum lexical_context_ty): New enumeration items
+ lc_xml_open_tag, lc_xml_close_tag, lc_xml_content.
+ * x-javascript.c (phase5_scan_xml_markup): New function.
+ (phase5_get): Handle '<', '>', '/', '=', '{', and '}' specially
+ to support E4X.
+ (enum token_type_ty): New enumeration item token_type_equal.
+ (xml_element_depth): New variable.
+ (inside_embedded_js_in_xml): New variable.
+ (extract_javascript): Initialize those variables.
+
2013-11-14 Daiki Ueno <ueno@gnu.org>
* x-javascript.c (phase3_getc): Make sure to call comment_line_end
diff --git a/gettext-tools/src/x-javascript.c b/gettext-tools/src/x-javascript.c
index c1503ce..59bbcbe 100644
--- a/gettext-tools/src/x-javascript.c
+++ b/gettext-tools/src/x-javascript.c
@@ -926,6 +926,7 @@ enum token_type_ty
token_type_plus, /* + */
token_type_regexp, /* /.../ */
token_type_operator, /* - * / % . < > = ~ ! | & ? : ^ */
+ token_type_equal, /* = */
token_type_string, /* "abc", 'abc' */
token_type_keyword, /* return, else */
token_type_symbol, /* symbol, number */
@@ -1160,6 +1161,114 @@ phase5_scan_regexp ()
phase2_ungetc (c);
}
+static int xml_element_depth = 0; /* Nesting depth of the XML elements
+ being scanned. phase5_get increments it at an opening tag and decrements
+ it at the '>' of a closing tag; extract_javascript resets it to 0. */
+static bool inside_embedded_js_in_xml = false; /* Set by phase5_get at a '{'
+ seen while xml_element_depth > 0, and cleared again at the next '}'. */
+/* Try to skip an XML comment, a CDATA section or a processing instruction.
+ phase5_get calls this right after it has read a '<' in an XML context,
+ which is why the start markers below do not contain that '<'.
+ The start markers are tried in turn. When one does not match, the
+ characters read for it are pushed back. When one matches, input is
+ consumed up to and including its end marker and the '>' after it.
+ Returns true if such markup was skipped completely. Returns false if no
+ start marker matched, if the end marker was not directly followed by '>'
+ (a warning is printed), or if the end of input was reached first (a
+ warning is printed). In the last two cases the consumed input is not
+ restored.
+ TP is not used. */
+static bool
+phase5_scan_xml_markup (token_ty *tp)
+{
+ struct
+ {
+ const char *start; /* Text expected right after the '<'. */
+ const char *end; /* Text expected right before the closing '>'. */
+ } markers[] =
+ {
+ { "!--", "--" }, /* Comment. */
+ { "![CDATA[", "]]" }, /* CDATA section. */
+ { "?", "?" } /* Processing instruction. */
+ };
+ int i;
+
+ for (i = 0; i < SIZEOF (markers); i++)
+ {
+ const char *start = markers[i].start;
+ const char *end = markers[i].end;
+ int j;
+
+ /* Look for a start marker. The assertion presumably guards the
+ pushback buffer against overflow when a partial match of up to J
+ characters is undone below. */
+ for (j = 0; start[j] != '\0'; j++)
+ {
+ int c;
+
+ assert (phase2_pushback_length + j < SIZEOF (phase2_pushback));
+ c = phase2_getc ();
+ if (c == UEOF)
+ goto eof;
+ if (c != start[j])
+ {
+ int k = j;
+ /* Mismatch: give back C and then the J characters matched so
+ far, the last matched one first. */
+ phase2_ungetc (c);
+ k--;
+
+ for (; k >= 0; k--)
+ phase2_ungetc (start[k]);
+ break;
+ }
+ }
+ /* The loop above ended early: this marker did not match, so try the
+ next one. */
+ if (start[j] != '\0')
+ continue;
+
+ /* Skip until the end marker. */
+ for (;;)
+ {
+ int c;
+ /* Try to match the end marker at the current position. With the
+ end markers above (at most two characters) a mismatch can only
+ happen at j <= 1, so at most one character, C, is pushed back;
+ the assertion below checks for room for it. */
+ for (j = 0; end[j] != '\0'; j++)
+ {
+ assert (phase2_pushback_length + 1 < SIZEOF (phase2_pushback));
+ c = phase2_getc ();
+ if (c == UEOF)
+ goto eof;
+ if (c != end[j])
+ {
+ /* Don't push the first matched character, end[0], back, so
+ that the next iteration starts from the second character.
+ When the mismatch is at j == 0, C itself is simply
+ dropped: it is part of the markup being skipped. */
+ if (j > 0)
+ {
+ int k = j;
+
+ phase2_ungetc (c);
+ k--;
+
+ for (; k > 0; k--)
+ phase2_ungetc (end[k]);
+ }
+ break;
+ }
+ }
+ /* No complete end marker at this position; keep scanning. */
+ if (end[j] != '\0')
+ continue;
+ /* The end marker must be directly followed by '>'.
+ NOTE(review): for CDATA sections and processing instructions this
+ also rejects input such as "]]]>" or a '?' in the middle of the
+ instruction; confirm that this is intended. */
+ c = phase2_getc ();
+ if (c == UEOF)
+ goto eof;
+ if (c != '>')
+ {
+ error_with_progname = false;
+ error (0, 0,
+ _("%s:%d: warning: %s is not allowed"),
+ logical_file_name, line_number,
+ end);
+ error_with_progname = true;
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+/* End of input was reached inside a start marker or inside the markup. */
+ eof:
+ error_with_progname = false;
+ error (0, 0,
+ _("%s:%d: warning: unterminated XML markup"),
+ logical_file_name, line_number);
+ error_with_progname = true;
+ return false;
+}
+
static void
phase5_get (token_ty *tp)
{
@@ -1314,13 +1423,93 @@ phase5_get (token_ty *tp)
/* Identify operators. The multiple character ones are simply ignored
* as they are recognized here and are otherwise not relevant. */
case '-': case '*': /* '+' and '/' are not listed here! */
- case '%': case '<': case '>': case '=':
+ case '%':
case '~': case '!': case '|': case '&': case '^':
case '?': case ':':
tp->type = last_token_type = token_type_operator;
return;
+ case '=':
+ tp->type = last_token_type = token_type_equal;
+ return;
+
+ case '<':
+ {
+ /* We assume:
+ - XMLMarkup and XMLElement are only allowed after '=' or '('
+ - embedded JavaScript expressions in XML do not recurse
+ */
+ if (xml_element_depth > 0
+ || (!inside_embedded_js_in_xml
+ && (last_token_type == token_type_equal
+ || last_token_type == token_type_lparen)))
+ {
+ /* Comments, PI, or CDATA. */
+ if (phase5_scan_xml_markup (tp))
+ return;
+ c = phase2_getc ();
+
+ /* Closing tag. */
+ if (c == '/')
+ lexical_context = lc_xml_close_tag;
+
+ /* Opening element. */
+ else
+ {
+ phase2_ungetc (c);
+ lexical_context = lc_xml_open_tag;
+ xml_element_depth++;
+ }
+
+ tp->type = last_token_type = token_type_other;
+ }
+ else
+ tp->type = last_token_type = token_type_operator;
+ }
+ return;
+
+ case '>':
+ if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
+ {
+ switch (lexical_context)
+ {
+ case lc_xml_open_tag:
+ lexical_context = lc_xml_content;
+ break;
+
+ case lc_xml_close_tag:
+ if (xml_element_depth-- > 0)
+ lexical_context = lc_xml_content;
+ else
+ lexical_context = lc_outside;
+ break;
+
+ default:
+ break;
+ }
+ tp->type = last_token_type = token_type_other;
+ }
+ else
+ tp->type = last_token_type = token_type_operator;
+ return;
+
case '/':
+ if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
+ {
+ /* If it appears in an opening tag of an XML element, it's
+ part of '/>'. */
+ if (lexical_context == lc_xml_open_tag)
+ {
+ c = phase2_getc ();
+ if (c == '>')
+ lexical_context = lc_outside;
+ else
+ phase2_ungetc (c);
+ }
+ tp->type = last_token_type = token_type_other;
+ return;
+ }
+
/* Either a division operator or the start of a regular
expression literal. If the '/' token is spotted after a
symbol it's a division, otherwise it's a regular
@@ -1336,6 +1525,18 @@ phase5_get (token_ty *tp)
}
return;
+ case '{':
+ if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
+ inside_embedded_js_in_xml = true;
+ tp->type = last_token_type = token_type_other;
+ return;
+
+ case '}':
+ if (xml_element_depth > 0 && inside_embedded_js_in_xml)
+ inside_embedded_js_in_xml = false;
+ tp->type = last_token_type = token_type_other;
+ return;
+
case '(':
tp->type = last_token_type = token_type_lparen;
return;
@@ -1598,6 +1799,7 @@ extract_balanced (message_list_ty *mlp,
case token_type_plus:
case token_type_regexp:
case token_type_operator:
+ case token_type_equal:
case token_type_other:
next_context_iter = null_context_list_iterator;
state = 0;
@@ -1628,6 +1830,8 @@ extract_javascript (FILE *f,
last_comment_line = -1;
last_non_comment_line = -1;
+ xml_element_depth = 0;
+
xgettext_current_file_source_encoding = xgettext_global_source_encoding;
#if HAVE_ICONV
xgettext_current_file_source_iconv = xgettext_global_source_iconv;
diff --git a/gettext-tools/src/xgettext.h b/gettext-tools/src/xgettext.h
index 16540fe..2f8a084 100644
--- a/gettext-tools/src/xgettext.h
+++ b/gettext-tools/src/xgettext.h
@@ -144,7 +144,12 @@ typedef enum
{
lc_outside, /* Initial context: outside of comments and strings. */
lc_comment, /* Inside a comment. */
- lc_string /* Inside a string literal. */
+ lc_string, /* Inside a string literal. */
+
+ /* For embedded XML in programming code, like E4X in JavaScript. */
+ lc_xml_open_tag, /* Inside an opening tag of an XML element. */
+ lc_xml_close_tag, /* Inside a closing tag of an XML element. */
+ lc_xml_content /* Inside an XML text node. */
}
lexical_context_ty;
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog
index 33efe8d..42c680b 100644
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,9 @@
+2013-11-20 Daiki Ueno <ueno@gnu.org>
+
+ xgettext: Add E4X support to JavaScript scanner
+ * Makefile.am (TESTS): Add xgettext-javascript-6.
+ * xgettext-javascript-6: New file.
+
2013-11-14 Daiki Ueno <ueno@gnu.org>
* xgettext-javascript-1: Add a test to extract translator comments.
diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am
index e1e0df2..d48aa7b 100644
--- a/gettext-tools/tests/Makefile.am
+++ b/gettext-tools/tests/Makefile.am
@@ -102,7 +102,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \
xgettext-ycp-1 xgettext-ycp-2 xgettext-ycp-3 xgettext-ycp-4 \
xgettext-lua-1 xgettext-lua-2 \
xgettext-javascript-1 xgettext-javascript-2 xgettext-javascript-3 \
- xgettext-javascript-4 xgettext-javascript-5 \
+ xgettext-javascript-4 xgettext-javascript-5 xgettext-javascript-6 \
xgettext-vala-1 \
xgettext-gsettings-1 \
format-awk-1 format-awk-2 \
diff --git a/gettext-tools/tests/xgettext-javascript-6 b/gettext-tools/tests/xgettext-javascript-6
new file mode 100644
index 0000000..a891ebe
--- /dev/null
+++ b/gettext-tools/tests/xgettext-javascript-6
@@ -0,0 +1,75 @@
+#!/bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test of JavaScript E4X support.
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles xg-js-6.js"
+cat <<\EOF > xg-js-6.js
+var x1 = <x1></x1>;
+var s1 = _("Expected translation string #1");
+var s2 = "foo";
+var x2 = <{s2}>foo {s2} bar</{s2}>;
+var x3 = <x3 a1="/"><x4>{_("Expected translation string #2")}</x4></x3>;
+var x4 = <x5 a2='/'><x{_("Expected translation string #3")}>
+</x{_("Expected translation string #3")}></x5>;
+var x4 = <![CDATA[
+ _("Unexpected translation string #1")
+]]>;
+var x5 = <!-- - _("Unexpected translation string #2") - -->;
+var s6 = _("Expected translation string #4");
+var x6 = <? _("Unexpected translation string #3") ?>;
+var x7 = <!--- this is a comment --> <foo>
+</foo>;
+EOF
+
+tmpfiles="$tmpfiles xg-js-6.err xg-js-6.tmp xg-js-6.pot"
+: ${XGETTEXT=xgettext}
+${XGETTEXT} --add-comments --no-location -o xg-js-6.tmp xg-js-6.js 2>xg-js-6.err
+test $? = 0 || { cat xg-js-6.err; rm -fr $tmpfiles; exit 1; }
+# Don't simplify this to "grep ... < xg-js-6.tmp", otherwise OpenBSD 4.0 grep
+# only outputs "Binary file (standard input) matches".
+cat xg-js-6.tmp | grep -v 'POT-Creation-Date' | LC_ALL=C tr -d '\r' > xg-js-6.pot
+
+tmpfiles="$tmpfiles xg-js-6.ok"
+cat <<\EOF > xg-js-6.ok
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"Report-Msgid-Bugs-To: \n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"Language: \n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+msgid "Expected translation string #1"
+msgstr ""
+
+msgid "Expected translation string #2"
+msgstr ""
+
+msgid "Expected translation string #3"
+msgstr ""
+
+msgid "Expected translation string #4"
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} xg-js-6.ok xg-js-6.pot
+result=$?
+
+rm -fr $tmpfiles
+
+exit $result