summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--gettext-tools/src/ChangeLog12
-rw-r--r--gettext-tools/src/x-vala.c191
-rw-r--r--gettext-tools/tests/ChangeLog4
-rwxr-xr-xgettext-tools/tests/xgettext-vala-17
4 files changed, 153 insertions, 61 deletions
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index f4ead25..29a1f21 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,5 +1,17 @@
2014-05-02 Daiki Ueno <ueno@gnu.org>
+ vala: Support C99-style Unicode character escapes
+ * x-vala.c: Include assert.h and po-charset.h.
+ (P7_QUOTES, P7_QUOTE, P7_NEWLINE): Redefine as a negative integer.
+ (P7_EOF, P7_STRING_END): New definitions.
+ (UNICODE): New macro.
+ (IS_UNICODE): New macro.
+ (UNICODE_VALUE): New macro.
+ (phase7_getc): Recognize "\unnnn" and "\Unnnnnnnn".
+ (phase3_get): Use mixed_string_buffer for parse string literal.
+
+2014-05-02 Daiki Ueno <ueno@gnu.org>
+
xgettext: Factor out commonly used mixed_string_buffer
* x-python.c (init_mixed_string_buffer)
(mixed_string_buffer_append_byte)
diff --git a/gettext-tools/src/x-vala.c b/gettext-tools/src/x-vala.c
index 68c8d9c..cee1deb 100644
--- a/gettext-tools/src/x-vala.c
+++ b/gettext-tools/src/x-vala.c
@@ -23,6 +23,7 @@
/* Specification. */
#include "x-vala.h"
+#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
@@ -36,6 +37,7 @@
#include "xalloc.h"
#include "xvasprintf.h"
#include "hash.h"
+#include "po-charset.h"
#include "gettext.h"
#define _(s) gettext(s)
@@ -377,12 +379,28 @@ free_token (token_ty *tp)
}
+/* Return value of phase7_getc when EOF is reached. */
+#define P7_EOF (-1)
+#define P7_STRING_END (-2)
+
/* Replace escape sequences within character strings with their single
character equivalents. */
+#define P7_QUOTES (-3)
+#define P7_QUOTE (-4)
+#define P7_NEWLINE (-5)
+
+/* Convert an UTF-16 or UTF-32 code point to a return value that can be
+ distinguished from a single-byte return value. */
+#define UNICODE(code) (0x100 + (code))
+
+/* Test a return value of phase7_getuc whether it designates an UTF-16 or
+ UTF-32 code point. */
+#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
+
+/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
+ IS_UNICODE. */
+#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
-#define P7_QUOTES (1000 + '"')
-#define P7_QUOTE (1000 + '\'')
-#define P7_NEWLINE (1000 + '\n')
static int
phase7_getc ()
@@ -514,6 +532,47 @@ phase7_getc ()
}
phase1_ungetc (c);
return n;
+
+ case 'U': case 'u':
+ {
+ unsigned char buf[8];
+
+ n = 0;
+ for (j = 0; j < (c == 'u' ? 4 : 8); j++)
+ {
+ int c1 = phase1_getc ();
+
+ if (c1 >= '0' && c1 <= '9')
+ n = (n << 4) + (c1 - '0');
+ else if (c1 >= 'A' && c1 <= 'F')
+ n = (n << 4) + (c1 - 'A' + 10);
+ else if (c1 >= 'a' && c1 <= 'f')
+ n = (n << 4) + (c1 - 'a' + 10);
+ else
+ {
+ phase1_ungetc (c1);
+ while (--j >= 0)
+ phase1_ungetc (buf[j]);
+ phase1_ungetc (c);
+ return '\\';
+ }
+
+ buf[j] = c1;
+ }
+
+ if (n < 0x110000)
+ return UNICODE (n);
+
+ error_with_progname = false;
+ error (0, 0, _("%s:%d: warning: invalid Unicode character"),
+ logical_file_name, line_number);
+ error_with_progname = true;
+
+ while (--j >= 0)
+ phase1_ungetc (buf[j]);
+ phase1_ungetc (c);
+ return '\\';
+ }
}
}
@@ -802,7 +861,9 @@ phase3_get (token_ty *tp)
/* FALLTHROUGH */
case '"':
{
+ struct mixed_string_buffer *bp;
int c2 = phase2_getc ();
+
if (c2 == '"')
{
int c3 = phase2_getc ();
@@ -816,65 +877,67 @@ phase3_get (token_ty *tp)
}
else
phase2_ungetc (c2);
- }
- bufpos = 0;
- for (;;)
- {
- c = phase7_getc ();
- if (c == P7_NEWLINE)
- {
- if (verbatim)
- c = '\n';
- else
- {
- error_with_progname = false;
- error (0, 0, _("%s:%d: warning: unterminated string literal"),
- logical_file_name, line_number - 1);
- error_with_progname = true;
- phase7_ungetc ('\n');
+ /* Start accumulating the string. */
+ bp = mixed_string_buffer_alloc (lc_string,
+ logical_file_name,
+ line_number);
+ for (;;)
+ {
+ c = phase7_getc ();
+ if (c == P7_NEWLINE)
+ {
+ if (verbatim)
+ c = '\n';
+ else
+ {
+ error_with_progname = false;
+ error (0, 0, _("\
+%s:%d: warning: unterminated string literal"),
+ logical_file_name, line_number - 1);
+ error_with_progname = true;
+ phase7_ungetc ('\n');
+ break;
+ }
+ }
+ if (c == P7_QUOTES)
+ {
+ if (verbatim)
+ {
+ int c2 = phase2_getc ();
+ if (c2 == '"')
+ {
+ int c3 = phase2_getc ();
+ if (c3 == '"')
+ break;
+ phase2_ungetc (c3);
+ }
+ phase2_ungetc (c2);
+ c = '"';
+ }
+ else
break;
- }
- }
- if (c == P7_QUOTES)
- {
- if (verbatim)
- {
- int c2 = phase2_getc ();
- if (c2 == '"')
- {
- int c3 = phase2_getc ();
- if (c3 == '"')
- break;
- phase2_ungetc (c3);
- }
- phase2_ungetc (c2);
- c = '"';
- }
- else
- break;
- }
- if (c == EOF)
- break;
- if (c == P7_QUOTE)
- c = '\'';
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos++] = c;
- }
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos] = 0;
- tp->type = last_token_type = template ? token_type_string_template : token_type_string_literal;
- tp->string = xstrdup (buffer);
- tp->comment = add_reference (savable_comment);
- return;
+ }
+ if (c == EOF)
+ break;
+ if (c == P7_QUOTE)
+ c = '\'';
+ if (IS_UNICODE (c))
+ {
+ assert (UNICODE_VALUE (c) >= 0
+ && UNICODE_VALUE (c) < 0x110000);
+ mixed_string_buffer_append_unicode (bp,
+ UNICODE_VALUE (c));
+ }
+ else
+ mixed_string_buffer_append_char (bp, c);
+ }
+ tp->type = last_token_type = template
+ ? token_type_string_template : token_type_string_literal;
+ tp->string = xstrdup (mixed_string_buffer_done (bp));
+ tp->comment = add_reference (savable_comment);
+ return;
+ }
case '/':
switch (last_token_type)
@@ -1192,7 +1255,9 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
arglist_parser_alloc (mlp,
state ? next_shapes : NULL)))
{
+ xgettext_current_source_encoding = po_charset_utf8;
arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
return true;
}
next_context_iter = null_context_list_iterator;
@@ -1202,7 +1267,9 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
case token_type_rparen:
if (delim == token_type_rparen || delim == token_type_eof)
{
+ xgettext_current_source_encoding = po_charset_utf8;
arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
return false;
}
@@ -1221,7 +1288,9 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
continue;
case token_type_eof:
+ xgettext_current_source_encoding = po_charset_utf8;
arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
return true;
case token_type_string_literal:
@@ -1230,6 +1299,7 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
pos.file_name = logical_file_name;
pos.line_number = token.line_number;
+ xgettext_current_source_encoding = po_charset_utf8;
if (extract_all)
remember_a_message (mlp, NULL, token.string, inner_context,
&pos, NULL, token.comment);
@@ -1251,6 +1321,7 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
inner_context, pos.file_name,
pos.line_number, token.comment);
}
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
}
drop_reference (token.comment);
next_context_iter = null_context_list_iterator;
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog
index 740bf5a..ddfdd62 100644
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,7 @@
+2014-05-02 Daiki Ueno <ueno@gnu.org>
+
+ * xgettext-vala-1: Test Unicode character escapes.
+
2014-04-30 Daiki Ueno <ueno@gnu.org>
* xgettext-scheme-4: New file.
diff --git a/gettext-tools/tests/xgettext-vala-1 b/gettext-tools/tests/xgettext-vala-1
index ebc769a..e176d17 100755
--- a/gettext-tools/tests/xgettext-vala-1
+++ b/gettext-tools/tests/xgettext-vala-1
@@ -16,6 +16,8 @@ int main (string[] args) {
var s4 = _("""Extract this
""
fourth string""");
+
+ var s5 = _("Extract this \u2464th string");
return 0;
}
EOF
@@ -43,7 +45,7 @@ msgstr ""
"Language-Team: LANGUAGE <LL@li.org>\n"
"Language: \n"
"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
msgid "Extract this first string"
@@ -60,6 +62,9 @@ msgid ""
" \"\"\n"
" fourth string"
msgstr ""
+
+msgid "Extract this ⑤th string"
+msgstr ""
EOF
: ${DIFF=diff}