summaryrefslogtreecommitdiffstats
path: root/gettext-tools/src/write-qt.c
diff options
context:
space:
mode:
authorBruno Haible <bruno@clisp.org>2003-10-28 16:10:35 +0000
committerBruno Haible <bruno@clisp.org>2009-06-23 12:11:06 +0200
commit2f09ddb5c812f820b8d9f2cf521262fcbfc4732d (patch)
tree64b68ea17dd7a4d8e1050b5dc857a7cbce7b5944 /gettext-tools/src/write-qt.c
parent06784fb7630f0d8511682df4d72e0e006e60e8b4 (diff)
downloadexternal_gettext-2f09ddb5c812f820b8d9f2cf521262fcbfc4732d.zip
external_gettext-2f09ddb5c812f820b8d9f2cf521262fcbfc4732d.tar.gz
external_gettext-2f09ddb5c812f820b8d9f2cf521262fcbfc4732d.tar.bz2
Support for Qt message catalog format and Qt format strings.
Diffstat (limited to 'gettext-tools/src/write-qt.c')
-rw-r--r--gettext-tools/src/write-qt.c539
1 files changed, 539 insertions, 0 deletions
diff --git a/gettext-tools/src/write-qt.c b/gettext-tools/src/write-qt.c
new file mode 100644
index 0000000..95fabe3
--- /dev/null
+++ b/gettext-tools/src/write-qt.c
@@ -0,0 +1,539 @@
+/* Writing Qt .qm files.
+ Copyright (C) 2003 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2003.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software Foundation,
+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+/* Specification. */
+#include "write-qt.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "error.h"
+#include "xerror.h"
+#include "message.h"
+#include "po-charset.h"
+#include "msgl-iconv.h"
+#include "hash-string.h"
+#include "utf8-ucs4.h"
+#include "xalloc.h"
+#include "obstack.h"
+#include "binary-io.h"
+#include "fwriteerror.h"
+#include "exit.h"
+#include "gettext.h"
+
+#define _(str) gettext (str)
+
+/* Qt .qm files are read by the QTranslator::load() function and written
+ by the Qt QTranslator::save() function.
+
+ The Qt tool 'msg2qm' uses the latter function and can convert PO files
+ to .qm files. But since 'msg2qm' is marked as an "old" tool in Qt 3.0.5's
+ i18n.html documentation and therefore likely to disappear, we provide the
+ same functionality here.
+
+ The format of .qm files, as reverse engineered from the functions
+ QTranslator::save(const QString& filename, SaveMode mode)
+ QTranslator::squeeze(SaveMode mode)
+ QTranslatorMessage::write(QDataStream& stream, bool strip, Prefix prefix)
+ elfHash(const char* name)
+ in qt-3.0.5, is as follows:
+
+ It's a binary data format. Elements are u8 (byte), u16, u32. They are
+ written in big-endian order.
+
+ The file starts with a magic string of 16 bytes:
+ 3C B8 64 18 CA EF 9C 95 CD 21 1C BF 60 A1 BD DD
+
+ Then come three sections. Each of the three sections is optional. Each
+ has this structure:
+ struct {
+ u8 section_type; // 0x42 = hashes, 0x69 = messages, 0x2f = contexts
+ u32 length; // number of bytes of the data
+ u8 data[length];
+ };
+
+ In the first section, the hashes section, the data has the following
+ structure:
+ It's a sorted array of
+ struct {
+ u32 hashcode; // elfHash of the concatenation of msgid and
+ // disambiguating-comment
+ u32 offset; // offset within the data[] of the messages section
+ };
+ It's sorted in ascending order by hashcode as primary sorting criterion
+ and - when the hashcodes are the same - by offset as secondary criterion.
+
+ In the second section, the messages section, the data has the following
+ structure:
+ It's a sequence of records, each representing a message, in no
+ particular order. Each record is a sequence of subsections, each
+ introduced by a particular subsection tag. The possible subsection tags
+ are (and they usually occur in this order):
+ - 03: Translation. Followed by the msgstr in UCS-2 or UTF-16 format:
+ struct {
+ u32 length;
+ u16 chars[length/2];
+ };
+ - 08: Disambiguating-comment. Followed by the NUL-terminated,
+ ISO-8859-1 encoded, disambiguating-comment string:
+ struct {
+ u32 length; // number of bytes including the NUL at the end
+ u8 chars[length];
+ };
+ - 06: SourceText, i.e. msgid. Followed by the NUL-terminated,
+ ISO-8859-1 encoded, msgid:
+ struct {
+ u32 length; // number of bytes including the NUL at the end
+ u8 chars[length];
+ };
+ - 02: SourceText16, i.e. msgid. Encoded as UCS-2, but must actually
+ be ISO-8859-1.
+ struct {
+ u32 length;
+ u16 chars[length/2];
+ };
+ This subsection tag is obsoleted by SourceText.
+ - 07: Context. Followed by the NUL-terminated, ISO-8859-1 encoded,
+ context string (usually a C++ class name or empty):
+ struct {
+ u32 length; // number of bytes including the NUL at the end
+ u8 chars[length];
+ };
+ - 04: Context16. Encoded as UCS-2, but must actually be ISO-8859-1.
+ struct {
+ u32 length;
+ u16 chars[length/2];
+ };
+ This subsection tag is obsoleted by Context.
+ - 05: Hash. Followed by
+ struct {
+ u32 hashcode; // elfHash of the concatenation of msgid and
+ // disambiguating-comment
+ };
+ - 01: End. Designates the end of the record. No further data.
+ Usually the following subsections are written, but some of them are
+ optional:
+ - 03: Translation.
+ - 08: Disambiguating-comment (optional).
+ - 06: SourceText (optional).
+ - 07: Context (optional).
+ - 05: Hash.
+ - 01: End.
+ A subsection can be omitted if the value to be output is the same as
+ for the previous record.
+
+ In the third section, the contexts section, the data contains a hash
+ table. Quite complicated.
+
+ The elfHash function is the same as our hash_string function, except that
+ at the end it maps a hash code of 0x00000000 to 0x00000001.
+
+ When we convert from PO file format, all disambiguating-comments and
+ contexts are empty, and therefore the contexts section can be omitted. */
+
+
+/* Emit VALUE to OUTPUT_FILE as a single byte (a u8).
+   The result of the stdio call is not checked here.  */
+static inline void
+write_u8 (FILE *output_file, unsigned char value)
+{
+  unsigned char bytes[1];
+
+  bytes[0] = value;
+
+  fwrite (bytes, sizeof (bytes), 1, output_file);
+}
+
+/* Emit the low 16 bits of VALUE to OUTPUT_FILE as two bytes, most
+   significant byte first.  The result of fwrite is not checked here.  */
+static inline void
+write_u16 (FILE *output_file, unsigned short value)
+{
+  unsigned char hi = (value >> 8) & 0xff;
+  unsigned char lo = value & 0xff;
+  unsigned char bytes[2];
+
+  bytes[0] = hi;
+  bytes[1] = lo;
+
+  fwrite (bytes, sizeof (bytes), 1, output_file);
+}
+
+/* Emit the low 32 bits of VALUE to OUTPUT_FILE as four bytes, most
+   significant byte first.  The result of fwrite is not checked here.  */
+static inline void
+write_u32 (FILE *output_file, unsigned int value)
+{
+  unsigned char bytes[4];
+  size_t i;
+
+  /* Fill the buffer from its last byte backwards, peeling 8 bits off VALUE
+     in each round.  */
+  for (i = sizeof (bytes); i-- > 0; value >>= 8)
+    bytes[i] = value & 0xff;
+
+  fwrite (bytes, sizeof (bytes), 1, output_file);
+}
+
+
+#define obstack_chunk_alloc xmalloc
+#define obstack_chunk_free free
+
+/* Grow the object under construction in MEMPOOL by the single byte VALUE.  */
+static void
+append_u8 (struct obstack *mempool, unsigned char value)
+{
+  /* VALUE is a local copy, so its address can be handed out directly.  */
+  obstack_grow (mempool, &value, sizeof (value));
+}
+
+/* Grow the object under construction in MEMPOOL by the low 16 bits of
+   VALUE, stored as two bytes with the most significant byte first.  */
+static void
+append_u16 (struct obstack *mempool, unsigned short value)
+{
+  unsigned char hi = (value >> 8) & 0xff;
+  unsigned char lo = value & 0xff;
+  unsigned char bytes[2];
+
+  bytes[0] = hi;
+  bytes[1] = lo;
+
+  obstack_grow (mempool, bytes, sizeof (bytes));
+}
+
+/* Grow the object under construction in MEMPOOL by the low 32 bits of
+   VALUE, stored as four bytes with the most significant byte first.  */
+static void
+append_u32 (struct obstack *mempool, unsigned int value)
+{
+  unsigned char bytes[4];
+  size_t i;
+
+  /* Fill the buffer from its last byte backwards, peeling 8 bits off VALUE
+     in each round.  */
+  for (i = sizeof (bytes); i-- > 0; value >>= 8)
+    bytes[i] = value & 0xff;
+
+  obstack_grow (mempool, bytes, sizeof (bytes));
+}
+
+/* Add the NUL-terminated byte string STRING (ISO-8859-1 encoded) to
+   MEMPOOL: first its size as a u32, then its bytes.  Both the size and
+   the data include the terminating NUL.  */
+static void
+append_base_string (struct obstack *mempool, const char *string)
+{
+  size_t nbytes = 1 + strlen (string);
+
+  append_u32 (mempool, nbytes);
+  obstack_grow (mempool, string, nbytes);
+}
+
+/* Add the LENGTH 16-bit units at STRING (UTF-16 encoded) to MEMPOOL:
+   first the size in bytes (2 * LENGTH) as a u32, then every unit as a
+   u16.  No terminator is added.  */
+static void
+append_unicode_string (struct obstack *mempool, const unsigned short *string,
+                       size_t length)
+{
+  size_t i;
+
+  append_u32 (mempool, length * 2);
+  for (i = 0; i < length; i++)
+    append_u16 (mempool, string[i]);
+}
+
+/* Retrieve a 4-byte integer, stored most significant byte first, from the
+   four bytes starting at P.
+   Each byte is widened to 'unsigned int' before it is shifted.  Without
+   that, the integer promotions turn it into a signed 'int', and shifting a
+   byte >= 0x80 left by 24 moves a one bit into the sign bit, which is
+   undefined behaviour.  */
+static inline unsigned int
+peek_u32 (const unsigned char *p)
+{
+  return ((unsigned int) p[0] << 24)
+         | ((unsigned int) p[1] << 16)
+         | ((unsigned int) p[2] << 8)
+         | (unsigned int) p[3];
+}
+
+/* Convert the UTF-8 encoded STRING to ISO-8859-1, without error checking.
+   Return a NUL-terminated string allocated with xmalloc.  The program is
+   aborted if a character >= 0x100 is encountered.  */
+static char *
+conv_to_iso_8859_1 (const char *string)
+{
+  size_t length = strlen (string);
+  const unsigned char *in = (const unsigned char *) string;
+  const unsigned char *in_end = in + length;
+  /* Every character takes at least one byte in UTF-8 and exactly one byte
+     in ISO-8859-1, so the result cannot be longer than the input.  */
+  char *result = (char *) xmalloc (length + 1);
+  char *out = result;
+
+  for (; in < in_end; out++)
+    {
+      unsigned int uc;
+
+      in += u8_mbtouc (&uc, in, in_end - in);
+      /* It has already been verified that the string fits in ISO-8859-1.  */
+      if (uc >= 0x100)
+        abort ();
+      /* Store as ISO-8859-1.  */
+      *out = (unsigned char) uc;
+    }
+  *out = '\0';
+  assert (out - result <= length);
+
+  return result;
+}
+
+/* Convert the UTF-8 encoded STRING to UTF-16.
+   Return an array of 16-bit code units allocated with xmalloc (it is not
+   NUL-terminated) and store the number of code units - not the number of
+   characters - in *SIZEP.  Characters >= 0x10000 occupy two code units
+   (a surrogate pair).  */
+static unsigned short *
+conv_to_utf16 (const char *string, size_t *sizep)
+{
+  size_t length = strlen (string);
+  const char *str = string;
+  const char *str_limit = string + length;
+  /* Room for LENGTH code units.  This suffices as long as u8_mbtouc
+     consumes at least one byte per character and at least four bytes for a
+     character >= 0x10000, as is the case for well-formed UTF-8: then no
+     code unit is produced without at least one input byte.  */
+  unsigned short *result =
+    (unsigned short *) xmalloc (length * sizeof (unsigned short));
+  unsigned short *q = result;
+
+  while (str < str_limit)
+    {
+      unsigned int uc;
+
+      str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
+      if (uc < 0x10000)
+        /* UCS-2 character.  */
+        *q++ = (unsigned short) uc;
+      else
+        {
+          /* UTF-16 surrogate pair: high surrogate, then low surrogate.  */
+          *q++ = 0xd800 + ((uc - 0x10000) >> 10);
+          *q++ = 0xdc00 + ((uc - 0x10000) & 0x3ff);
+        }
+    }
+  /* Q counts code units, so compare against the capacity in code units.
+     A bound of 2 * LENGTH would be twice the size of the buffer and could
+     not reveal an overrun.  */
+  assert ((size_t) (q - result) <= length);
+
+  *sizep = q - result;
+  return result;
+}
+
+/* Return the Qt hash code of STR: the value of hash_string (STR), except
+   that a result of 0 is replaced by 1.  */
+static unsigned int
+string_hashcode (const char *str)
+{
+  unsigned int code = hash_string (str);
+
+  return (code != 0 ? code : 1);
+}
+
+/* Comparison function, suitable for qsort, for the 8-byte entries of the
+   hashes section.  An entry consists of a u32 hashcode followed by a u32
+   offset, both read with peek_u32.  Entries are ordered by hashcode first
+   and by offset second, both ascending.  Returns -1, 0 or 1.  */
+static int
+cmp_hashes (const void *va, const void *vb)
+{
+  const unsigned char *entry_a = (const unsigned char *) va;
+  const unsigned char *entry_b = (const unsigned char *) vb;
+  unsigned int key_a;
+  unsigned int key_b;
+
+  /* Primary key: the hashcode in the first four bytes.  */
+  key_a = peek_u32 (entry_a);
+  key_b = peek_u32 (entry_b);
+  if (key_a == key_b)
+    {
+      /* Tie: fall back to the offset in the last four bytes.  */
+      key_a = peek_u32 (entry_a + 4);
+      key_b = peek_u32 (entry_b + 4);
+      if (key_a == key_b)
+        return 0;
+    }
+
+  return (key_a < key_b ? -1 : 1);
+}
+
+
+/* Write one section to OUTPUT_FILE: the one-byte TAG, then SIZE as a u32,
+   then the SIZE bytes at DATA.  Nothing at all is written when SIZE is 0,
+   because an empty section can be omitted.  */
+static void
+write_section (FILE *output_file, unsigned char tag, void *data, size_t size)
+{
+  if (size == 0)
+    return;
+
+  write_u8 (output_file, tag);
+  write_u32 (output_file, size);
+  fwrite (data, size, 1, output_file);
+}
+
+
+/* Write an entire .qm file for the messages of MLP to OUTPUT_FILE.
+   The output consists of the 16-byte magic string, the hashes section
+   (tag 0x42) and the messages section (tag 0x69).  The contexts section
+   is not written.  Messages with an empty msgid (the header entry) are
+   skipped.  Each msgid is passed to conv_to_iso_8859_1 and each msgstr to
+   conv_to_utf16, so both must be UTF-8 encoded and every msgid must fit
+   in ISO-8859-1.  */
+static void
+write_qm (FILE *output_file, message_list_ty *mlp)
+{
+  /* The magic string is only ever read, hence 'const'.  */
+  static const unsigned char magic[16] =
+    {
+      0x3C, 0xB8, 0x64, 0x18, 0xCA, 0xEF, 0x9C, 0x95,
+      0xCD, 0x21, 0x1C, 0xBF, 0x60, 0xA1, 0xBD, 0xDD
+    };
+  struct obstack hashes_pool;
+  struct obstack messages_pool;
+  size_t nstrings;
+  size_t j;
+
+  obstack_init (&hashes_pool);
+  obstack_init (&messages_pool);
+
+  /* Prepare the hashes section and the messages section.  */
+  for (j = 0; j < mlp->nitems; j++)
+    {
+      message_ty *mp = mlp->item[j];
+      char *msgid_as_iso_8859_1;
+      unsigned short *msgstr_as_utf16;
+      size_t msgstr_len;
+      unsigned int hashcode;
+      unsigned int offset;
+
+      /* No need to emit the header entry, it's not needed at runtime.  */
+      if (mp->msgid[0] == '\0')
+        continue;
+
+      msgid_as_iso_8859_1 = conv_to_iso_8859_1 (mp->msgid);
+      msgstr_as_utf16 = conv_to_utf16 (mp->msgstr, &msgstr_len);
+      hashcode = string_hashcode (msgid_as_iso_8859_1);
+      /* The record starts at the current end of the messages data, so the
+         offset must be taken before anything of the record is appended.  */
+      offset = obstack_object_size (&messages_pool);
+
+      /* Add a record to the hashes section.  */
+      append_u32 (&hashes_pool, hashcode);
+      append_u32 (&hashes_pool, offset);
+
+      /* Add a record to the messages section.  */
+
+      /* 03: Translation.  */
+      append_u8 (&messages_pool, 0x03);
+      append_unicode_string (&messages_pool, msgstr_as_utf16, msgstr_len);
+
+      /* 08: Disambiguating-comment, always empty here.  */
+      append_u8 (&messages_pool, 0x08);
+      append_base_string (&messages_pool, "");
+
+      /* 06: SourceText, i.e. the msgid.  */
+      append_u8 (&messages_pool, 0x06);
+      append_base_string (&messages_pool, msgid_as_iso_8859_1);
+
+      /* 07: Context, always empty here.  */
+      append_u8 (&messages_pool, 0x07);
+      append_base_string (&messages_pool, "");
+
+      /* 05: Hash.  */
+      append_u8 (&messages_pool, 0x05);
+      append_u32 (&messages_pool, hashcode);
+
+      /* 01: End of the record.  */
+      append_u8 (&messages_pool, 0x01);
+
+      free (msgstr_as_utf16);
+      free (msgid_as_iso_8859_1);
+    }
+
+  /* Sort the hashes section.  Each of its entries occupies 8 bytes.  */
+  nstrings = obstack_object_size (&hashes_pool) / 8;
+  if (nstrings > 0)
+    qsort (obstack_base (&hashes_pool), nstrings, 8, cmp_hashes);
+
+  /* Write the magic number.  */
+  fwrite (magic, sizeof (magic), 1, output_file);
+
+  /* Write the hashes section.  */
+  write_section (output_file, 0x42, obstack_base (&hashes_pool),
+                 obstack_object_size (&hashes_pool));
+
+  /* Write the messages section.  */
+  write_section (output_file, 0x69, obstack_base (&messages_pool),
+                 obstack_object_size (&messages_pool));
+
+  /* The contexts section (tag 0x2f) is omitted.  */
+
+  obstack_free (&messages_pool, NULL);
+  obstack_free (&hashes_pool, NULL);
+}
+
+
+/* Write the messages of MLP as a Qt .qm file.
+   The messages are first converted with iconv_message_list, which is given
+   CANON_ENCODING and po_charset_utf8.  The output goes to standard output
+   if DOMAIN_NAME is "-", otherwise to the file FILE_NAME.  If MLP has no
+   entries, no file is created.
+   Return 0 if successful or if there was nothing to write.  Return 1,
+   after reporting an error, if MLP contains plural forms, if a msgid does
+   not fit in ISO-8859-1, or if the output file cannot be opened.  A failure
+   while writing or closing the output terminates the program through
+   error (EXIT_FAILURE, ...).  */
+int
+msgdomain_write_qt (message_list_ty *mlp, const char *canon_encoding,
+                    const char *domain_name, const char *file_name)
+{
+  FILE *output_file;
+  size_t j;
+
+  /* If no entry for this domain don't even create the file.  */
+  if (mlp->nitems == 0)
+    return 0;
+
+  /* Refuse catalogs with plural entries.  The first one found is enough.  */
+  for (j = 0; j < mlp->nitems; j++)
+    if (mlp->item[j]->msgid_plural != NULL)
+      {
+        multiline_error (xstrdup (""),
+                         xstrdup (_("\
+message catalog has plural form translations\n\
+but the Qt message catalog format doesn't support plural handling\n")));
+        return 1;
+      }
+
+  /* Convert the messages to Unicode.  */
+  iconv_message_list (mlp, canon_encoding, po_charset_utf8, NULL);
+
+  /* Refuse catalogs with non-ISO-8859-1 msgid entries.  */
+  for (j = 0; j < mlp->nitems; j++)
+    {
+      const char *string = mlp->item[j]->msgid;
+
+      /* An UTF-8 encoded string fits in ISO-8859-1 if and only if all
+         its bytes are < 0xc4.  */
+      for (; *string; string++)
+        if ((unsigned char) *string >= 0xc4)
+          {
+            multiline_error (xstrdup (""),
+                             xstrdup (_("\
+message catalog has msgid strings containing characters outside ISO-8859-1\n\
+but the Qt message catalog format supports Unicode only in the translated\n\
+strings, not in the untranslated strings\n")));
+            return 1;
+          }
+    }
+
+  if (strcmp (domain_name, "-") == 0)
+    {
+      output_file = stdout;
+      SET_BINARY (fileno (output_file));
+    }
+  else
+    {
+      output_file = fopen (file_name, "wb");
+      if (output_file == NULL)
+        {
+          error (0, errno, _("error while opening \"%s\" for writing"),
+                 file_name);
+          return 1;
+        }
+    }
+
+  write_qm (output_file, mlp);
+
+  /* Make sure nothing went wrong.  */
+  if (fwriteerror (output_file))
+    error (EXIT_FAILURE, errno, _("error while writing \"%s\" file"),
+           file_name);
+
+  /* Closing can fail too, so its result must not be ignored.  The existing
+     message is reused so that no new translatable string is needed.  */
+  if (output_file != stdout && fclose (output_file) != 0)
+    error (EXIT_FAILURE, errno, _("error while writing \"%s\" file"),
+           file_name);
+
+  return 0;
+}