xgettext: Add E4X support to JavaScript scanner

Reported by Piotr Drąg at: <https://savannah.gnu.org/bugs/?40125>. * src/xgettext.h (enum lexical_context_ty): New enumeration items lc_xml_open_tag, lc_xml_close_tag, lc_xml_content. * src/x-javascript.c (phase5_scan_xml_markup): New function. (phase5_get): Handle '<', '>', '/', '=', '{', and '}' specially to support E4X. (enum token_type_ty): New enumeration item token_type_equal. (xml_element_depth): New variable. (inside_embedded_js_in_xml): New variable. (extract_javascript): Initialize those variables. * tests/Makefile.am (TESTS): Add xgettext-javascript-6. * tests/xgettext-javascript-6: New file.
author: Daiki Ueno <ueno@gnu.org> 2013-11-20 12:41:20 +0900
committer: Daiki Ueno <ueno@gnu.org> 2013-12-04 19:53:11 +0900
commit: 992d6a594fa5ca3e914bc3f3f18431caee657db2 (patch)
tree: 4b045e79e21a275b0f7bcc5c6eb4e4669b1f7cc8
parent: 4142e63b88032c25572fd50b6518dadcfeee2c15 (diff)
download: external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.zip
external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.tar.gz
external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.tar.bz2
6 files changed, 307 insertions, 3 deletions
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index 97c3c1f..64ea86d 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,17 @@
+2013-11-20  Daiki Ueno  <ueno@gnu.org>
+
+	xgettext: Add E4X support to JavaScript scanner
+	Reported by Piotr Drąg at: <https://savannah.gnu.org/bugs/?40125>.
+	* xgettext.h (enum lexical_context_ty): New enumeration items
+	lc_xml_open_tag, lc_xml_close_tag, lc_xml_content.
+	* x-javascript.c (phase5_scan_xml_markup): New function.
+	(phase5_get): Handle '<', '>', '/', '=', '{', and '}' specially
+	to support E4X.
+	(enum token_type_ty): New enumeration item token_type_equal.
+	(xml_element_depth): New variable.
+	(inside_embedded_js_in_xml): New variable.
+	(extract_javascript): Initialize those variables.
+
 2013-11-14  Daiki Ueno  <ueno@gnu.org>
 
 	* x-javascript.c (phase3_getc): Make sure to call comment_line_end
diff --git a/gettext-tools/src/x-javascript.c b/gettext-tools/src/x-javascript.c
index c1503ce..59bbcbe 100644
--- a/gettext-tools/src/x-javascript.c
+++ b/gettext-tools/src/x-javascript.c
@@ -926,6 +926,7 @@ enum token_type_ty
   token_type_plus,              /* + */
   token_type_regexp,            /* /.../ */
   token_type_operator,          /* - * / % . < > = ~ ! | & ? : ^ */
+  token_type_equal,             /* = */
   token_type_string,            /* "abc", 'abc' */
   token_type_keyword,           /* return, else */
   token_type_symbol,            /* symbol, number */
@@ -1160,6 +1161,114 @@ phase5_scan_regexp ()
       phase2_ungetc (c);
 }
 
+static int xml_element_depth = 0;
+static bool inside_embedded_js_in_xml = false;
+
+static bool
+phase5_scan_xml_markup (token_ty *tp)
+{
+  struct
+  {
+    const char *start;
+    const char *end;
+  } markers[] =
+      {
+        { "!--", "--" },
+        { "![CDATA[", "]]" },
+        { "?", "?" }
+      };
+  int i;
+
+  for (i = 0; i < SIZEOF (markers); i++)
+    {
+      const char *start = markers[i].start;
+      const char *end = markers[i].end;
+      int j;
+
+      /* Look for a start marker; on mismatch, push the input back.  */
+      for (j = 0; start[j] != '\0'; j++)
+        {
+          int c;
+
+          assert (phase2_pushback_length + j < SIZEOF (phase2_pushback));
+          c = phase2_getc ();
+          if (c == UEOF)
+            goto eof;
+          if (c != start[j])
+            {
+              int k = j;
+
+              phase2_ungetc (c);
+              k--;
+
+              for (; k >= 0; k--)
+                phase2_ungetc (start[k]);
+              break;
+            }
+        }
+
+      if (start[j] != '\0')
+        continue;
+
+      /* Skip until the end marker, which must be followed by '>'.  */
+      for (;;)
+        {
+          int c;
+
+          for (j = 0; end[j] != '\0'; j++)
+            {
+              assert (phase2_pushback_length + 1 < SIZEOF (phase2_pushback));
+              c = phase2_getc ();
+              if (c == UEOF)
+                goto eof;
+              if (c != end[j])
+                {
+                  /* Don't push the first character back, so that the next
+                     iteration starts from the second character.  */
+                  if (j > 0)
+                    {
+                      int k = j;
+
+                      phase2_ungetc (c);
+                      k--;
+
+                      for (; k > 0; k--)
+                        phase2_ungetc (end[k]);
+                    }
+                  break;
+                }
+            }
+
+          if (end[j] != '\0')
+            continue;
+
+          c = phase2_getc ();
+          if (c == UEOF)
+            goto eof;
+          if (c != '>')
+            {
+              error_with_progname = false;
+              error (0, 0,
+                     _("%s:%d: warning: %s is not allowed"),
+                     logical_file_name, line_number,
+                     end);
+              error_with_progname = true;
+              return false;
+            }
+          return true;
+        }
+    }
+  return false;
+
+ eof:
+  error_with_progname = false;
+  error (0, 0,
+         _("%s:%d: warning: unterminated XML markup"),
+         logical_file_name, line_number);
+  error_with_progname = true;
+  return false;
+}
+
 static void
 phase5_get (token_ty *tp)
 {
@@ -1314,13 +1423,93 @@ phase5_get (token_ty *tp)
         /* Identify operators. The multiple character ones are simply ignored
          * as they are recognized here and are otherwise not relevant. */
         case '-': case '*': /* '+' and '/' are not listed here! */
-        case '%': case '<': case '>': case '=':
+        case '%':
         case '~': case '!': case '|': case '&': case '^':
         case '?': case ':':
           tp->type = last_token_type = token_type_operator;
           return;
 
+        case '=':
+          tp->type = last_token_type = token_type_equal;
+          return;
+
+        case '<':
+          {
+            /* We assume:
+               - XMLMarkup and XMLElement are only allowed after '=' or '('
+               - embedded JavaScript expressions in XML do not recurse
+             */
+            if (xml_element_depth > 0
+                || (!inside_embedded_js_in_xml
+                    && (last_token_type == token_type_equal
+                        || last_token_type == token_type_lparen)))
+              {
+                /* Comments, PI, or CDATA.  */
+                if (phase5_scan_xml_markup (tp))
+                  return;
+                c = phase2_getc ();
+
+                /* Closing tag.  */
+                if (c == '/')
+                  lexical_context = lc_xml_close_tag;
+
+                /* Opening element.  */
+                else
+                  {
+                    phase2_ungetc (c);
+                    lexical_context = lc_xml_open_tag;
+                    xml_element_depth++;
+                  }
+
+                tp->type = last_token_type = token_type_other;
+              }
+            else
+              tp->type = last_token_type = token_type_operator;
+          }
+          return;
+
+        case '>':
+          if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
+            {
+              switch (lexical_context)
+                {
+                case lc_xml_open_tag:
+                  lexical_context = lc_xml_content;
+                  break;
+
+                case lc_xml_close_tag:
+                  if (xml_element_depth-- > 0)
+                    lexical_context = lc_xml_content;
+                  else
+                    lexical_context = lc_outside;
+                  break;
+
+                default:
+                  break;
+                }
+              tp->type = last_token_type = token_type_other;
+            }
+          else
+            tp->type = last_token_type = token_type_operator;
+          return;
+
         case '/':
+          if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
+            {
+              /* If it appears in an opening tag of an XML element, it's
+                 part of '/>'.  */
+              if (lexical_context == lc_xml_open_tag)
+                {
+                  c = phase2_getc ();
+                  if (c == '>')
+                    lexical_context = lc_outside;
+                  else
+                    phase2_ungetc (c);
+                }
+              tp->type = last_token_type = token_type_other;
+              return;
+            }
+
           /* Either a division operator or the start of a regular
              expression literal.  If the '/' token is spotted after a
              symbol it's a division, otherwise it's a regular
@@ -1336,6 +1525,18 @@ phase5_get (token_ty *tp)
             }
           return;
 
+        case '{':
+          if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
+            inside_embedded_js_in_xml = true;
+          tp->type = last_token_type = token_type_other;
+          return;
+
+        case '}':
+          if (xml_element_depth > 0 && inside_embedded_js_in_xml)
+            inside_embedded_js_in_xml = false;
+          tp->type = last_token_type = token_type_other;
+          return;
+
         case '(':
           tp->type = last_token_type = token_type_lparen;
           return;
@@ -1598,6 +1799,7 @@ extract_balanced (message_list_ty *mlp,
         case token_type_plus:
         case token_type_regexp:
         case token_type_operator:
+        case token_type_equal:
         case token_type_other:
           next_context_iter = null_context_list_iterator;
           state = 0;
@@ -1628,6 +1830,8 @@ extract_javascript (FILE *f,
   last_comment_line = -1;
   last_non_comment_line = -1;
 
+  xml_element_depth = 0;
+
   xgettext_current_file_source_encoding = xgettext_global_source_encoding;
 #if HAVE_ICONV
   xgettext_current_file_source_iconv = xgettext_global_source_iconv;
diff --git a/gettext-tools/src/xgettext.h b/gettext-tools/src/xgettext.h
index 16540fe..2f8a084 100644
--- a/gettext-tools/src/xgettext.h
+++ b/gettext-tools/src/xgettext.h
@@ -144,7 +144,12 @@ typedef enum
   {
     lc_outside, /* Initial context: outside of comments and strings.  */
     lc_comment, /* Inside a comment.  */
-    lc_string   /* Inside a string literal.  */
+    lc_string,  /* Inside a string literal.  */
+
+    /* For embedded XML in programming code, like E4X in JavaScript.  */
+    lc_xml_open_tag,   /* Inside an opening tag of an XML element.  */
+    lc_xml_close_tag,  /* Inside a closing tag of an XML element.  */
+    lc_xml_content     /* Inside an XML text node.  */
   }
   lexical_context_ty;
 
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog
index 33efe8d..42c680b 100644
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,9 @@
+2013-11-20  Daiki Ueno  <ueno@gnu.org>
+
+	xgettext: Add E4X support to JavaScript scanner
+	* Makefile.am (TESTS): Add xgettext-javascript-6.
+	* xgettext-javascript-6: New file.
+
 2013-11-14  Daiki Ueno  <ueno@gnu.org>
 
 	* xgettext-javascript-1: Add a test to extract translator comments.
diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am
index e1e0df2..d48aa7b 100644
--- a/gettext-tools/tests/Makefile.am
+++ b/gettext-tools/tests/Makefile.am
@@ -102,7 +102,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \
 	xgettext-ycp-1 xgettext-ycp-2 xgettext-ycp-3 xgettext-ycp-4 \
 	xgettext-lua-1 xgettext-lua-2 \
 	xgettext-javascript-1 xgettext-javascript-2 xgettext-javascript-3 \
-	xgettext-javascript-4 xgettext-javascript-5 \
+	xgettext-javascript-4 xgettext-javascript-5 xgettext-javascript-6 \
 	xgettext-vala-1 \
 	xgettext-gsettings-1 \
 	format-awk-1 format-awk-2 \
diff --git a/gettext-tools/tests/xgettext-javascript-6 b/gettext-tools/tests/xgettext-javascript-6
new file mode 100644
index 0000000..a891ebe
--- /dev/null
+++ b/gettext-tools/tests/xgettext-javascript-6
@@ -0,0 +1,75 @@
+#!/bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test of JavaScript E4X support.
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles xg-js-6.js"
+cat <<\EOF > xg-js-6.js
+var x1 = <x1></x1>;
+var s1 = _("Expected translation string #1");
+var s2 = "foo";
+var x2 = <{s2}>foo {s2} bar</{s2}>;
+var x3 = <x3 a1="/"><x4>{_("Expected translation string #2")}</x4></x3>;
+var x4 = <x5 a2='/'><x{_("Expected translation string #3")}>
+</x{_("Expected translation string #3")}></x5>;
+var x4 = <![CDATA[
+  _("Unexpected translation string #1")
+]]>;
+var x5 = <!-- - _("Unexpected translation string #2") - -->;
+var s6 = _("Expected translation string #4");
+var x6 = <? _("Unexpected translation string #3") ?>;
+var x7 = <!--- this is a comment --> <foo>
+</foo>;
+EOF
+
+tmpfiles="$tmpfiles xg-js-6.err xg-js-6.tmp xg-js-6.pot"
+: ${XGETTEXT=xgettext}
+${XGETTEXT} --add-comments --no-location -o xg-js-6.tmp xg-js-6.js 2>xg-js-6.err
+test $? = 0 || { cat xg-js-6.err; rm -fr $tmpfiles; exit 1; }
+# Don't simplify this to "grep ... < xg-js-6.tmp", otherwise OpenBSD 4.0 grep
+# only outputs "Binary file (standard input) matches".
+cat xg-js-6.tmp | grep -v 'POT-Creation-Date' | LC_ALL=C tr -d '\r' > xg-js-6.pot
+
+tmpfiles="$tmpfiles xg-js-6.ok"
+cat <<\EOF > xg-js-6.ok
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"Report-Msgid-Bugs-To: \n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"Language: \n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+msgid "Expected translation string #1"
+msgstr ""
+
+msgid "Expected translation string #2"
+msgstr ""
+
+msgid "Expected translation string #3"
+msgstr ""
+
+msgid "Expected translation string #4"
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} xg-js-6.ok xg-js-6.pot
+result=$?
+
+rm -fr $tmpfiles
+
+exit $result
author	Daiki Ueno <ueno@gnu.org>	2013-11-20 12:41:20 +0900
committer	Daiki Ueno <ueno@gnu.org>	2013-12-04 19:53:11 +0900
commit	992d6a594fa5ca3e914bc3f3f18431caee657db2 (patch)
tree	4b045e79e21a275b0f7bcc5c6eb4e4669b1f7cc8
parent	4142e63b88032c25572fd50b6518dadcfeee2c15 (diff)
download	external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.zip external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.tar.gz external_gettext-992d6a594fa5ca3e914bc3f3f18431caee657db2.tar.bz2