summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBruno Haible <bruno@clisp.org>2006-03-13 12:36:06 +0000
committerBruno Haible <bruno@clisp.org>2009-06-23 12:13:02 +0200
commitade34ef0abc5f549de72090f106e23c027739886 (patch)
tree2e548dabf3b242d3a6f8649d3236650a9a4bc8a3
parent92066d515918136e5620b47797f3a17a3458ded3 (diff)
downloadexternal_gettext-ade34ef0abc5f549de72090f106e23c027739886.zip
external_gettext-ade34ef0abc5f549de72090f106e23c027739886.tar.gz
external_gettext-ade34ef0abc5f549de72090f106e23c027739886.tar.bz2
Speed up msgmerge with large compendia by use of fast fuzzy search.
-rw-r--r--NEWS2
-rw-r--r--gettext-tools/ChangeLog5
-rw-r--r--gettext-tools/configure.ac20
-rw-r--r--gettext-tools/lib/ChangeLog5
-rw-r--r--gettext-tools/lib/hash.c30
-rw-r--r--gettext-tools/lib/hash.h9
-rw-r--r--gettext-tools/src/ChangeLog47
-rw-r--r--gettext-tools/src/Makefile.am7
-rw-r--r--gettext-tools/src/Makefile.msvc5
-rw-r--r--gettext-tools/src/Makefile.vms5
-rw-r--r--gettext-tools/src/message.c54
-rw-r--r--gettext-tools/src/message.h22
-rw-r--r--gettext-tools/src/msgfmt.c2
-rw-r--r--gettext-tools/src/msgmerge.c193
-rw-r--r--gettext-tools/src/po-charset.c264
-rw-r--r--gettext-tools/src/po-charset.h10
-rw-r--r--gettext-tools/tests/ChangeLog9
-rw-r--r--gettext-tools/tests/Makefile.am3
18 files changed, 630 insertions, 62 deletions
diff --git a/NEWS b/NEWS
index e664f58..9ff6ae4 100644
--- a/NEWS
+++ b/NEWS
@@ -28,6 +28,8 @@
* msggrep has a new option -v/--invert-match that acts like grep's -v option.
+* msgmerge is much faster now, when using a large compendium.
+
* Programming languages support:
- C++ with Boost:
diff --git a/gettext-tools/ChangeLog b/gettext-tools/ChangeLog
index b7f285d..efb2363 100644
--- a/gettext-tools/ChangeLog
+++ b/gettext-tools/ChangeLog
@@ -1,3 +1,8 @@
+2006-03-11 Bruno Haible <bruno@clisp.org>
+
+ * configure.ac (MSGMERGE_LIBM): New variable. Test for presence of
+ ceil() and sqrt().
+
2005-11-23 Bruno Haible <bruno@clisp.org>
Cygwin portability.
diff --git a/gettext-tools/configure.ac b/gettext-tools/configure.ac
index f42667c..82bb4b3 100644
--- a/gettext-tools/configure.ac
+++ b/gettext-tools/configure.ac
@@ -1,5 +1,5 @@
dnl Configuration for the gettext-tools directory of GNU gettext
-dnl Copyright (C) 1995-1999, 2000-2005 Free Software Foundation, Inc.
+dnl Copyright (C) 1995-1999, 2000-2006 Free Software Foundation, Inc.
dnl
dnl This program is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU General Public License as published by
@@ -114,6 +114,24 @@ AM_GNU_GETTEXT(use-libtool, need-ngettext)
dnl This line internationalizes the bison generated parsers.
BISON_I18N
+dnl Test whether msgmerge must be linked against libm. This is the case on
+dnl most systems; but BeOS has all <math.h> functions in libc and doesn't have
+dnl a libm.
+MSGMERGE_LIBM=?
+AC_TRY_LINK([#include <math.h>], [static double x; x = ceil(x); x = sqrt(x);],
+ [MSGMERGE_LIBM=])
+if test "$MSGMERGE_LIBM" = "?"; then
+ save_LIBS="$LIBS"
+ LIBS="$LIBS -lm"
+ AC_TRY_LINK([#include <math.h>], [static double x; x = ceil(x); x = sqrt(x);],
+ [MSGMERGE_LIBM="-lm"])
+ LIBS="$save_LIBS"
+fi
+if test "$MSGMERGE_LIBM" = "?"; then
+ MSGMERGE_LIBM=
+fi
+AC_SUBST([MSGMERGE_LIBM])
+
dnl Checks for header files.
AC_HEADER_STDC
AC_CHECK_HEADERS(limits.h malloc.h pwd.h string.h unistd.h utime.h values.h)
diff --git a/gettext-tools/lib/ChangeLog b/gettext-tools/lib/ChangeLog
index ce72ca9..004fa37 100644
--- a/gettext-tools/lib/ChangeLog
+++ b/gettext-tools/lib/ChangeLog
@@ -1,3 +1,8 @@
+2006-03-11 Bruno Haible <bruno@clisp.org>
+
+ * hash.h (hash_iterate_modify): New declaration.
+ * hash.c (hash_iterate_modify): New function.
+
2006-01-10 Bruno Haible <bruno@clisp.org>
* localcharset.c: Assume ANSI C. Fixes a gcc warning.
diff --git a/gettext-tools/lib/hash.c b/gettext-tools/lib/hash.c
index cb9c6bb..56725fa 100644
--- a/gettext-tools/lib/hash.c
+++ b/gettext-tools/lib/hash.c
@@ -346,3 +346,33 @@ hash_iterate (hash_table *htab, void **ptr, const void **key, size_t *keylen,
*data = ((hash_entry *) *ptr)->data;
return 0;
}
+
+
+/* Steps *PTR forward to the next used entry in the given hash table. *PTR
+ should be initially set to NULL. Store information about the next entry
+ in *KEY, *KEYLEN, *DATAP. *DATAP is set to point to the storage of the
+ value; modifying **DATAP will modify the value of the entry.
+ Return 0 normally, -1 when the whole hash table has been traversed. */
+int
+hash_iterate_modify (hash_table *htab, void **ptr,
+ const void **key, size_t *keylen,
+ void ***datap)
+{
+ if (*ptr == NULL)
+ {
+ if (htab->first == NULL)
+ return -1;
+ *ptr = (void *) ((hash_entry *) htab->first)->next;
+ }
+ else
+ {
+ if (*ptr == htab->first)
+ return -1;
+ *ptr = (void *) ((hash_entry *) *ptr)->next;
+ }
+
+ *key = ((hash_entry *) *ptr)->key;
+ *keylen = ((hash_entry *) *ptr)->keylen;
+ *datap = &((hash_entry *) *ptr)->data;
+ return 0;
+}
diff --git a/gettext-tools/lib/hash.h b/gettext-tools/lib/hash.h
index 6c4e71a..5563bd0 100644
--- a/gettext-tools/lib/hash.h
+++ b/gettext-tools/lib/hash.h
@@ -71,6 +71,15 @@ extern int hash_iterate (hash_table *htab, void **ptr,
const void **key, size_t *keylen,
void **data);
+/* Steps *PTR forward to the next used entry in the given hash table. *PTR
+ should be initially set to NULL. Store information about the next entry
+ in *KEY, *KEYLEN, *DATAP. *DATAP is set to point to the storage of the
+ value; modifying **DATAP will modify the value of the entry.
+ Return 0 normally, -1 when the whole hash table has been traversed. */
+extern int hash_iterate_modify (hash_table *htab, void **ptr,
+ const void **key, size_t *keylen,
+ void ***datap);
+
/* Given SEED > 1, return the smallest odd prime number >= SEED. */
extern unsigned long int next_prime (unsigned long int seed);
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index e22fa00..dfe393b 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,50 @@
+2006-03-11 Bruno Haible <bruno@clisp.org>
+
+ Speed up msgmerge with large compendia.
+ * message.h (message_list_free): Add keep_messages argument.
+ (message_list_list_free): Add keep_level argument.
+ (fuzzy_search_goal_function): New declaration.
+ (FUZZY_THRESHOLD): New macro.
+ * message.c (message_list_free): Add keep_messages argument.
+ (fuzzy_search_goal_function): New function, extracted from
+ message_list_search_fuzzy_inner.
+ (message_list_search_fuzzy_inner): Use it.
+ (message_list_search_fuzzy): Use symbolic value FUZZY_THRESHOLD.
+ (message_list_list_free): Comment in. Add keep_level argument.
+ (message_list_list_search_fuzzy): Comment out. Use symbolic value
+ FUZZY_THRESHOLD.
+ (msgdomain_free): Update.
+ (msgdomain_list_search_fuzzy): Use symbolic value FUZZY_THRESHOLD.
+ * po-charset.h: Include stddef.h.
+ (character_iterator_t): New type.
+ (po_charset_character_iterator): New declaration.
+ * po-charset.c (char_iterator, euc_character_iterator,
+ euc_jp_character_iterator, euc_tw_character_iterator,
+ big5_character_iterator, big5hkscs_character_iterator,
+ gbk_character_iterator, gb18030_character_iterator,
+ shift_jis_character_iterator, johab_character_iterator,
+ utf8_character_iterator, po_charset_character_iterator): New functions.
+ * msgl-fsearch.h: New file.
+ * msgl-fsearch.c: New file.
+ * msgmerge.c: Include po-charset.h, msgl-fsearch.h.
+ (compendium_filenames): New variable.
+ (compendium): Also put the filename into compendium_filenames.
+ (definitions_ty): New structure type.
+ (definitions_init, definitions_init_findex, definitions_current_list,
+ definitions_set_current_list, definitions_search,
+ definitions_search_fuzzy, definitions_destroy): New functions.
+ (match_domain): Change type of 'definitions' argument.
+ (merge): Also convert the compendium to UTF-8. Use definitions_init,
+ definitions_set_current_list, definitions_destroy.
+ * msgfmt.c (main): Update.
+ * Makefile.am (noinst_HEADERS): Add msgl-fsearch.h.
+ (msgmerge_SOURCES): Add msgl-fsearch.c.
+ (msgmerge_LDADD): Link with MSGMERGE_LIBM.
+ * Makefile.msvc (msgmerge_OBJECTS): Add msgl-fsearch.obj.
+ (msgl-fsearch.obj): New rule.
+ * Makefile.vms (msgmerge_OBJECTS): Add msgl-fsearch.obj.
+ (msgl-fsearch.obj): New rule.
+
2006-03-09 Bruno Haible <bruno@clisp.org>
* Makefile.am (CXXLINK) [!mingw]: Overwrite automake's value. Fixes
diff --git a/gettext-tools/src/Makefile.am b/gettext-tools/src/Makefile.am
index 386cf60..b30d34f 100644
--- a/gettext-tools/src/Makefile.am
+++ b/gettext-tools/src/Makefile.am
@@ -41,7 +41,8 @@ str-list.h \
write-po.h write-properties.h write-stringtable.h \
dir-list.h file-list.h po-gram-gen.h po-gram-gen2.h \
msgl-charset.h msgl-equal.h msgl-iconv.h msgl-ascii.h msgl-cat.h \
-msgl-english.h msgl-check.h msgfmt.h msgunfmt.h plural-count.h plural-eval.h \
+msgl-english.h msgl-check.h msgl-fsearch.h msgfmt.h msgunfmt.h \
+plural-count.h plural-eval.h \
read-mo.h write-mo.h \
read-java.h write-java.h \
read-csharp.h write-csharp.h \
@@ -146,7 +147,7 @@ msgmerge_SOURCES = msgmerge.c
else
msgmerge_SOURCES = ../mingw/c++msgmerge.cc
endif
-msgmerge_SOURCES += plural-count.c
+msgmerge_SOURCES += msgl-fsearch.c plural-count.c
msgunfmt_SOURCES = msgunfmt.c
msgunfmt_SOURCES += \
read-mo.c read-java.c read-csharp.c read-resources.c read-tcl.c
@@ -247,7 +248,7 @@ libgettextpo_la_DEPENDENCIES = libgettextsrc.la
# For msginit, it is also needed because of localename.c.
msgcmp_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@
msgfmt_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@
-msgmerge_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@
+msgmerge_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ @MSGMERGE_LIBM@
msgunfmt_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@
xgettext_LDADD = $(LIBUNINAME) libgettextsrc.la @INTL_MACOSX_LIBS@ @LTLIBEXPAT@
msgattrib_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@
diff --git a/gettext-tools/src/Makefile.msvc b/gettext-tools/src/Makefile.msvc
index e1e37e4..01285c6 100644
--- a/gettext-tools/src/Makefile.msvc
+++ b/gettext-tools/src/Makefile.msvc
@@ -153,7 +153,7 @@ OBJECTS = \
msgcmp_OBJECTS = msgcmp.obj
msgfmt_OBJECTS = msgfmt.obj write-mo.obj write-java.obj write-csharp.obj write-resources.obj write-tcl.obj write-qt.obj plural-eval.obj hash-string.obj
-msgmerge_OBJECTS = msgmerge.obj plural-count.obj
+msgmerge_OBJECTS = msgmerge.obj msgl-fsearch.obj plural-count.obj
msgunfmt_OBJECTS = msgunfmt.obj read-mo.obj read-java.obj read-csharp.obj read-resources.obj read-tcl.obj
xgettext_OBJECTS = xgettext.obj x-c.obj x-po.obj x-sh.obj x-python.obj x-lisp.obj x-elisp.obj x-librep.obj x-scheme.obj x-smalltalk.obj x-java.obj x-csharp.obj x-awk.obj x-ycp.obj x-tcl.obj x-perl.obj x-php.obj x-rst.obj x-glade.obj
msgattrib_OBJECTS = msgattrib.obj
@@ -361,6 +361,9 @@ hash-string.obj : ..\..\gettext-runtime\intl\hash-string.c
msgmerge.obj : msgmerge.c
$(CC) $(INCLUDES) $(CFLAGS) -DINSTALLPREFIX=\"$(IIprefix)\" -DINSTALLDIR=\"$(IIbindir)\" -c -Tp msgmerge.c
+msgl-fsearch.obj : msgl-fsearch.c
+ $(CC) $(INCLUDES) $(CFLAGS) -c msgl-fsearch.c
+
plural-count.obj : plural-count.c
$(CC) $(INCLUDES) $(CFLAGS) -c plural-count.c
diff --git a/gettext-tools/src/Makefile.vms b/gettext-tools/src/Makefile.vms
index cc6ba1d..eccfb82 100644
--- a/gettext-tools/src/Makefile.vms
+++ b/gettext-tools/src/Makefile.vms
@@ -99,7 +99,7 @@ OBJECTS = \
msgcmp_OBJECTS = msgcmp.obj
msgfmt_OBJECTS = msgfmt.obj, write-mo.obj, write-java.obj, write-csharp.obj, write-resources.obj, write-tcl.obj, write-qt.obj, plural-eval.obj, hash-string.obj
-msgmerge_OBJECTS = msgmerge.obj, plural-count.obj
+msgmerge_OBJECTS = msgmerge.obj, msgl-fsearch.obj, plural-count.obj
msgunfmt_OBJECTS = msgunfmt.obj, read-mo.obj, read-java.obj, read-csharp.obj, read-resources.obj, read-tcl.obj
xgettext_OBJECTS = xgettext.obj, x-c.obj, x-po.obj, x-sh.obj, x-python.obj, x-lisp.obj, x-elisp.obj, x-librep.obj, x-scheme.obj, x-smalltalk.obj, x-java.obj, x-csharp.obj, x-awk.obj, x-ycp.obj, x-tcl.obj, x-perl.obj, x-php.obj, x-rst.obj, x-glade.obj
msgattrib_OBJECTS = msgattrib.obj
@@ -291,6 +291,9 @@ hash-string.obj : [-.-.gettext-runtime.intl]hash-string.c
msgmerge.obj : msgmerge.c
$(CC) $(INCLUDES) $(CFLAGS) /define=($(DEFS),"INSTALLPREFIX=""$(prefix)]""","INSTALLDIR=""$(bindir)]""") msgmerge.c
+msgl-fsearch.obj : msgl-fsearch.c
+ $(CC) $(INCLUDES) $(CFLAGS) /define=($(DEFS)) msgl-fsearch.c
+
plural-count.obj : plural-count.c
$(CC) $(INCLUDES) $(CFLAGS) /define=($(DEFS)) plural-count.c
diff --git a/gettext-tools/src/message.c b/gettext-tools/src/message.c
index 7c80acf..120b795 100644
--- a/gettext-tools/src/message.c
+++ b/gettext-tools/src/message.c
@@ -237,12 +237,13 @@ message_list_alloc (bool use_hashtable)
void
-message_list_free (message_list_ty *mlp)
+message_list_free (message_list_ty *mlp, int keep_messages)
{
size_t j;
- for (j = 0; j < mlp->nitems; ++j)
- message_free (mlp->item[j]);
+ if (keep_messages == 0)
+ for (j = 0; j < mlp->nitems; ++j)
+ message_free (mlp->item[j]);
if (mlp->item)
free (mlp->item);
if (mlp->use_hashtable)
@@ -494,6 +495,23 @@ message_list_search (message_list_ty *mlp,
}
+double
+fuzzy_search_goal_function (const message_ty *mp,
+ const char *msgctxt, const char *msgid)
+{
+ return
+ fstrcmp (msgid, mp->msgid)
+ /* A translation for a context is a good proposal also for
+ another. But give mp a small advantage if mp is valid
+ regardless of any context or has the same context as the
+ one being looked up. */
+ + ((mp->msgctxt == NULL
+ || (msgctxt != NULL && strcmp (msgctxt, mp->msgctxt) == 0))
+ ? 0.00001
+ : 0);
+}
+
+
static message_ty *
message_list_search_fuzzy_inner (message_list_ty *mlp,
const char *msgctxt, const char *msgid,
@@ -511,16 +529,7 @@ message_list_search_fuzzy_inner (message_list_ty *mlp,
if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
{
- double weight =
- fstrcmp (msgid, mp->msgid)
- /* A translation for a context is a good proposal also for
- another. But give mp a small advantage if mp is valid
- regardless of any context or has the same context as the
- one being looked up. */
- + ((mp->msgctxt == NULL
- || (msgctxt != NULL && strcmp (msgctxt, mp->msgctxt) == 0))
- ? 0.00001
- : 0);
+ double weight = fuzzy_search_goal_function (mp, msgctxt, msgid);
if (weight > *best_weight_p)
{
*best_weight_p = weight;
@@ -538,7 +547,7 @@ message_list_search_fuzzy (message_list_ty *mlp,
{
double best_weight;
- best_weight = 0.6;
+ best_weight = FUZZY_THRESHOLD;
return message_list_search_fuzzy_inner (mlp, msgctxt, msgid, &best_weight);
}
@@ -556,19 +565,18 @@ message_list_list_alloc ()
}
-#if 0 /* unused */
void
-message_list_list_free (message_list_list_ty *mllp)
+message_list_list_free (message_list_list_ty *mllp, int keep_level)
{
size_t j;
- for (j = 0; j < mllp->nitems; ++j)
- message_list_free (mllp->item[j]);
+ if (keep_level < 2)
+ for (j = 0; j < mllp->nitems; ++j)
+ message_list_free (mllp->item[j], keep_level);
if (mllp->item)
free (mllp->item);
free (mllp);
}
-#endif
void
@@ -628,6 +636,7 @@ message_list_list_search (message_list_list_ty *mllp,
}
+#if 0 /* unused */
message_ty *
message_list_list_search_fuzzy (message_list_list_ty *mllp,
const char *msgctxt, const char *msgid)
@@ -636,7 +645,7 @@ message_list_list_search_fuzzy (message_list_list_ty *mllp,
double best_weight;
message_ty *best_mp;
- best_weight = 0.6;
+ best_weight = FUZZY_THRESHOLD;
best_mp = NULL;
for (j = 0; j < mllp->nitems; ++j)
{
@@ -650,6 +659,7 @@ message_list_list_search_fuzzy (message_list_list_ty *mllp,
}
return best_mp;
}
+#endif
msgdomain_ty*
@@ -667,7 +677,7 @@ msgdomain_alloc (const char *domain, bool use_hashtable)
void
msgdomain_free (msgdomain_ty *mdp)
{
- message_list_free (mdp->messages);
+ message_list_free (mdp->messages, 0);
free (mdp);
}
@@ -783,7 +793,7 @@ msgdomain_list_search_fuzzy (msgdomain_list_ty *mdlp,
double best_weight;
message_ty *best_mp;
- best_weight = 0.6;
+ best_weight = FUZZY_THRESHOLD;
best_mp = NULL;
for (j = 0; j < mdlp->nitems; ++j)
{
diff --git a/gettext-tools/src/message.h b/gettext-tools/src/message.h
index c6d01ad..5a07301 100644
--- a/gettext-tools/src/message.h
+++ b/gettext-tools/src/message.h
@@ -196,8 +196,11 @@ struct message_list_ty
known that the message list will not contain duplicate msgids. */
extern message_list_ty *
message_list_alloc (bool use_hashtable);
+/* Free a message list.
+ If keep_messages = 0, also free the messages. If keep_messages = 1, don't
+ free the messages. */
extern void
- message_list_free (message_list_ty *mlp);
+ message_list_free (message_list_ty *mlp, int keep_messages);
extern void
message_list_append (message_list_ty *mlp, message_ty *mp);
extern void
@@ -232,8 +235,12 @@ struct message_list_list_ty
extern message_list_list_ty *
message_list_list_alloc (void);
+/* Free a list of message lists.
+ If keep_level = 0, also free the messages. If keep_level = 1, don't free
+ the messages but free the lists. If keep_level = 2, free neither the
+ messages nor the lists. */
extern void
- message_list_list_free (message_list_list_ty *mllp);
+ message_list_list_free (message_list_list_ty *mllp, int keep_level);
extern void
message_list_list_append (message_list_list_ty *mllp,
message_list_ty *mlp);
@@ -291,6 +298,17 @@ extern message_ty *
const char *msgctxt, const char *msgid);
+/* The goal function used in fuzzy search.
+ Higher values indicate a closer match. */
+extern double
+ fuzzy_search_goal_function (const message_ty *mp,
+ const char *msgctxt, const char *msgid);
+
+/* The threshold for fuzzy-searching.
+ A message is considered only if fstrcmp (msg, given) > FUZZY_THRESHOLD. */
+#define FUZZY_THRESHOLD 0.6
+
+
#ifdef __cplusplus
}
#endif
diff --git a/gettext-tools/src/msgfmt.c b/gettext-tools/src/msgfmt.c
index 8b871bf..e8d5653 100644
--- a/gettext-tools/src/msgfmt.c
+++ b/gettext-tools/src/msgfmt.c
@@ -588,7 +588,7 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
}
/* List is not used anymore. */
- message_list_free (domain->mlp);
+ message_list_free (domain->mlp, 0);
}
/* Print statistics if requested. */
diff --git a/gettext-tools/src/msgmerge.c b/gettext-tools/src/msgmerge.c
index 18e3e9b..8c5d068 100644
--- a/gettext-tools/src/msgmerge.c
+++ b/gettext-tools/src/msgmerge.c
@@ -47,8 +47,10 @@
#include "c-strcase.h"
#include "stpcpy.h"
#include "stpncpy.h"
+#include "po-charset.h"
#include "msgl-iconv.h"
#include "msgl-equal.h"
+#include "msgl-fsearch.h"
#include "plural-count.h"
#include "backupfile.h"
#include "copy-file.h"
@@ -78,6 +80,9 @@ static bool use_fuzzy_matching = true;
/* List of user-specified compendiums. */
static message_list_list_ty *compendiums;
+/* List of corresponding filenames. */
+static string_list_ty *compendium_filenames;
+
/* Update mode. */
static bool update_mode = false;
static const char *version_control_string;
@@ -560,10 +565,132 @@ compendium (const char *filename)
size_t k;
mdlp = read_po_file (filename);
- if (!compendiums)
- compendiums = message_list_list_alloc ();
+ if (compendiums == NULL)
+ {
+ compendiums = message_list_list_alloc ();
+ compendium_filenames = string_list_alloc ();
+ }
for (k = 0; k < mdlp->nitems; k++)
- message_list_list_append (compendiums, mdlp->item[k]->messages);
+ {
+ message_list_list_append (compendiums, mdlp->item[k]->messages);
+ string_list_append (compendium_filenames, filename);
+ }
+}
+
+
+/* Data structure representing the messages with known translations.
+ They are composed of
+ - A message list from def.po,
+ - The compendiums.
+ The data structure is optimized for exact and fuzzy searches. */
+typedef struct definitions_ty definitions_ty;
+struct definitions_ty
+{
+ /* A list of message lists. The first comes from def.po, the other ones
+ from the compendiums. Each message list has a built-in hash table,
+ for speed when doing the exact searches. */
+ message_list_list_ty *lists;
+ /* A fuzzy index of the compendiums, for speed when doing fuzzy searches.
+ Used only if use_fuzzy_matching is true and compendiums != NULL. */
+ message_fuzzy_index_ty *findex;
+ /* The canonical encoding of the compendiums. */
+ const char *canon_charset;
+};
+
+static inline void
+definitions_init (definitions_ty *definitions, const char *canon_charset)
+{
+ definitions->lists = message_list_list_alloc ();
+ message_list_list_append (definitions->lists, NULL);
+ if (compendiums != NULL)
+ message_list_list_append_list (definitions->lists, compendiums);
+ definitions->findex = NULL;
+ definitions->canon_charset = canon_charset;
+}
+
+/* Create the fuzzy index.
+ Used only if use_fuzzy_matching is true and compendiums != NULL. */
+static inline void
+definitions_init_findex (definitions_ty *definitions)
+{
+ /* Combine all the compendium message lists into a single one. Don't
+ bother checking for duplicates. */
+ message_list_ty *all_compendium;
+ size_t i;
+
+ all_compendium = message_list_alloc (false);
+ for (i = 0; i < compendiums->nitems; i++)
+ {
+ message_list_ty *mlp = compendiums->item[i];
+ size_t j;
+
+ for (j = 0; j < mlp->nitems; j++)
+ message_list_append (all_compendium, mlp->item[j]);
+ }
+
+ /* Create the fuzzy index from it. */
+ definitions->findex =
+ message_fuzzy_index_alloc (all_compendium, definitions->canon_charset);
+}
+
+/* Return the current list of non-compendium messages. */
+static inline message_list_ty *
+definitions_current_list (const definitions_ty *definitions)
+{
+ return definitions->lists->item[0];
+}
+
+/* Set the current list of non-compendium messages. */
+static inline void
+definitions_set_current_list (definitions_ty *definitions, message_list_ty *mlp)
+{
+ definitions->lists->item[0] = mlp;
+}
+
+/* Exact search. */
+static inline message_ty *
+definitions_search (const definitions_ty *definitions,
+ const char *msgctxt, const char *msgid)
+{
+ return message_list_list_search (definitions->lists, msgctxt, msgid);
+}
+
+/* Fuzzy search.
+ Used only if use_fuzzy_matching is true. */
+static inline message_ty *
+definitions_search_fuzzy (definitions_ty *definitions,
+ const char *msgctxt, const char *msgid)
+{
+ message_ty *mp1 =
+ message_list_search_fuzzy (definitions_current_list (definitions),
+ msgctxt, msgid);
+ if (compendiums != NULL)
+ {
+ message_ty *mp2;
+
+ /* Create the fuzzy index lazily. */
+ if (definitions->findex == NULL)
+ definitions_init_findex (definitions);
+
+ mp2 = message_fuzzy_index_search (definitions->findex, msgctxt, msgid);
+
+ /* Choose the best among mp1, mp2. */
+ if (mp1 == NULL
+ || (mp2 != NULL
+ && (fuzzy_search_goal_function (mp2, msgctxt, msgid)
+ > fuzzy_search_goal_function (mp1, msgctxt, msgid))))
+ mp1 = mp2;
+ }
+
+ return mp1;
+}
+
+static inline void
+definitions_destroy (definitions_ty *definitions)
+{
+ message_list_list_free (definitions->lists, 2);
+ if (definitions->findex != NULL)
+ message_fuzzy_index_free (definitions->findex);
}
@@ -907,7 +1034,7 @@ message_merge (message_ty *def, message_ty *ref)
static void
match_domain (const char *fn1, const char *fn2,
- message_list_list_ty *definitions, message_list_ty *refmlp,
+ definitions_ty *definitions, message_list_ty *refmlp,
message_list_ty *resultmlp,
struct statistics *stats, unsigned int *processed)
{
@@ -916,7 +1043,8 @@ match_domain (const char *fn1, const char *fn2,
char *untranslated_plural_msgstr;
size_t j;
- header_entry = message_list_search (definitions->item[0], NULL, "");
+ header_entry =
+ message_list_search (definitions_current_list (definitions), NULL, "");
nplurals = get_plural_count (header_entry ? header_entry->msgstr : NULL);
untranslated_plural_msgstr = (char *) xmalloc (nplurals);
memset (untranslated_plural_msgstr, '\0', nplurals);
@@ -934,8 +1062,7 @@ match_domain (const char *fn1, const char *fn2,
refmsg = refmlp->item[j];
/* See if it is in the other file. */
- defmsg =
- message_list_list_search (definitions, refmsg->msgctxt, refmsg->msgid);
+ defmsg = definitions_search (definitions, refmsg->msgctxt, refmsg->msgid);
if (defmsg)
{
/* Merge the reference with the definition: take the #. and
@@ -958,9 +1085,9 @@ match_domain (const char *fn1, const char *fn2,
help. */
if (use_fuzzy_matching
&& ((defmsg =
- message_list_list_search_fuzzy (definitions,
- refmsg->msgctxt,
- refmsg->msgid)) != NULL))
+ definitions_search_fuzzy (definitions,
+ refmsg->msgctxt,
+ refmsg->msgid)) != NULL))
{
message_ty *mp;
@@ -1117,7 +1244,7 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp)
unsigned int processed;
struct statistics stats;
msgdomain_list_ty *result;
- message_list_list_ty *definitions;
+ definitions_ty definitions;
message_list_ty *empty_list;
stats.merged = stats.fuzzied = stats.missing = stats.obsolete = 0;
@@ -1125,15 +1252,6 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp)
/* This is the definitions file, created by a human. */
def = read_po_file (fn1);
- /* Create the set of places to look for message definitions: a list
- whose first element will be definitions for the current domain, and
- whose other elements come from the compendiums. */
- definitions = message_list_list_alloc ();
- message_list_list_append (definitions, NULL);
- if (compendiums)
- message_list_list_append_list (definitions, compendiums);
- empty_list = message_list_alloc (false);
-
/* This is the references file, created by groping the sources with
the xgettext program. */
ref = read_po_file (fn2);
@@ -1148,7 +1266,8 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp)
}
/* The references file can be either in ASCII or in UTF-8. If it is
- in UTF-8, we have to convert the definitions to UTF-8 as well. */
+ in UTF-8, we have to convert the definitions and the compendiums to
+ UTF-8 as well. */
{
bool was_utf8 = false;
for (k = 0; k < ref->nitems; k++)
@@ -1178,9 +1297,23 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp)
}
}
if (was_utf8)
- def = iconv_msgdomain_list (def, "UTF-8", fn1);
+ {
+ def = iconv_msgdomain_list (def, "UTF-8", fn1);
+ if (compendiums != NULL)
+ for (k = 0; k < compendiums->nitems; k++)
+ iconv_message_list (compendiums->item[k], NULL, po_charset_utf8,
+ compendium_filenames->item[k]);
+ }
+ else
+ {
+ /* TODO: Convert all compendiums->item[k] to the same encoding. */
+ }
}
+ /* Initialize and preprocess the total set of message definitions. */
+ definitions_init (&definitions, po_charset_utf8);
+ empty_list = message_list_alloc (false);
+
result = msgdomain_list_alloc (false);
processed = 0;
@@ -1192,12 +1325,14 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp)
message_list_ty *refmlp = ref->item[k]->messages;
message_list_ty *resultmlp =
msgdomain_list_sublist (result, domain, true);
+ message_list_ty *defmlp;
- definitions->item[0] = msgdomain_list_sublist (def, domain, false);
- if (definitions->item[0] == NULL)
- definitions->item[0] = empty_list;
+ defmlp = msgdomain_list_sublist (def, domain, false);
+ if (defmlp == NULL)
+ defmlp = empty_list;
+ definitions_set_current_list (&definitions, defmlp);
- match_domain (fn1, fn2, definitions, refmlp, resultmlp,
+ match_domain (fn1, fn2, &definitions, refmlp, resultmlp,
&stats, &processed);
}
else
@@ -1217,14 +1352,16 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp)
message_list_ty *resultmlp =
msgdomain_list_sublist (result, domain, true);
- definitions->item[0] = defmlp;
+ definitions_set_current_list (&definitions, defmlp);
- match_domain (fn1, fn2, definitions, refmlp, resultmlp,
+ match_domain (fn1, fn2, &definitions, refmlp, resultmlp,
&stats, &processed);
}
}
}
+ definitions_destroy (&definitions);
+
/* Look for messages in the definition file, which are not present
in the reference file, indicating messages which defined but not
used in the program. Don't scan the compendium(s). */
diff --git a/gettext-tools/src/po-charset.c b/gettext-tools/src/po-charset.c
index cff5826..23d29f0 100644
--- a/gettext-tools/src/po-charset.c
+++ b/gettext-tools/src/po-charset.c
@@ -1,5 +1,5 @@
/* Charset handling while reading PO files.
- Copyright (C) 2001-2005 Free Software Foundation, Inc.
+ Copyright (C) 2001-2006 Free Software Foundation, Inc.
Written by Bruno Haible <haible@clisp.cons.org>, 2001.
This program is free software; you can redistribute it and/or modify
@@ -169,6 +169,268 @@ bool po_is_charset_weird_cjk (const char *canon_charset)
return false;
}
+/* Hardcoded iterator functions for all kinds of encodings.
+ We could also implement a general iterator function with iconv(),
+ but we need a fast one. */
+
+/* Character iterator for 8-bit encodings. */
+static size_t
+char_iterator (const char *s)
+{
+ return 1;
+}
+
+/* Character iterator for GB2312. See libiconv/lib/euc_cn.h. */
+/* Character iterator for EUC-KR. See libiconv/lib/euc_kr.h. */
+static size_t
+euc_character_iterator (const char *s)
+{
+ unsigned char c = *s;
+ if (c >= 0xa1 && c < 0xff)
+ {
+ unsigned char c2 = s[1];
+ if (c2 >= 0xa1 && c2 < 0xff)
+ return 2;
+ }
+ return 1;
+}
+
+/* Character iterator for EUC-JP. See libiconv/lib/euc_jp.h. */
+static size_t
+euc_jp_character_iterator (const char *s)
+{
+ unsigned char c = *s;
+ if (c >= 0xa1 && c < 0xff)
+ {
+ unsigned char c2 = s[1];
+ if (c2 >= 0xa1 && c2 < 0xff)
+ return 2;
+ }
+ else if (c == 0x8e)
+ {
+ unsigned char c2 = s[1];
+ if (c2 >= 0xa1 && c2 < 0xe0)
+ return 2;
+ }
+ else if (c == 0x8f)
+ {
+ unsigned char c2 = s[1];
+ if (c2 >= 0xa1 && c2 < 0xff)
+ {
+ unsigned char c3 = s[2];
+ if (c3 >= 0xa1 && c3 < 0xff)
+ return 3;
+ }
+ }
+ return 1;
+}
+
+/* Character iterator for EUC-TW. See libiconv/lib/euc_tw.h. */
+static size_t
+euc_tw_character_iterator (const char *s)
+{
+ unsigned char c = *s;
+ if (c >= 0xa1 && c < 0xff)
+ {
+ unsigned char c2 = s[1];
+ if (c2 >= 0xa1 && c2 < 0xff)
+ return 2;
+ }
+ else if (c == 0x8e)
+ {
+ unsigned char c2 = s[1];
+ if (c2 >= 0xa1 && c2 <= 0xb0)
+ {
+ unsigned char c3 = s[2];
+ if (c3 >= 0xa1 && c3 < 0xff)
+ {
+ unsigned char c4 = s[3];
+ if (c4 >= 0xa1 && c4 < 0xff)
+ return 4;
+ }
+ }
+ }
+ return 1;
+}
+
+/* Character iterator for BIG5. See libiconv/lib/ces_big5.h. */
+static size_t
+big5_character_iterator (const char *s)
+{
+ unsigned char c = *s;
+ if (c >= 0xa1 && c < 0xff)
+ {
+ unsigned char c2 = s[1];
+ if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
+ return 2;
+ }
+ return 1;
+}
+
+/* Character iterator for BIG5-HKSCS. See libiconv/lib/big5hkscs.h. */
+static size_t
+big5hkscs_character_iterator (const char *s)
+{
+ unsigned char c = *s;
+ if (c >= 0x88 && c < 0xff)
+ {
+ unsigned char c2 = s[1];
+ if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
+ return 2;
+ }
+ return 1;
+}
+
+/* Character iterator for GBK. See libiconv/lib/ces_gbk.h and
+ libiconv/lib/gbk.h. */
+static size_t
+gbk_character_iterator (const char *s)
+{
+ unsigned char c = *s;
+ if (c >= 0x81 && c < 0xff)
+ {
+ unsigned char c2 = s[1];
+ if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
+ return 2;
+ }
+ return 1;
+}
+
+/* Character iterator for GB18030. See libiconv/lib/gb18030.h. */
+static size_t
+gb18030_character_iterator (const char *s)
+{
+ unsigned char c = *s;
+ if (c >= 0x81 && c < 0xff)
+ {
+ unsigned char c2 = s[1];
+ if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
+ return 2;
+ }
+ if (c >= 0x81 && c <= 0x84)
+ {
+ unsigned char c2 = s[1];
+ if (c2 >= 0x30 && c2 <= 0x39)
+ {
+ unsigned char c3 = s[2];
+ if (c3 >= 0x81 && c3 < 0xff)
+ {
+ unsigned char c4 = s[3];
+ if (c4 >= 0x30 && c4 <= 0x39)
+ return 4;
+ }
+ }
+ }
+ return 1;
+}
+
+/* Character iterator for SHIFT_JIS. See libiconv/lib/sjis.h. */
+static size_t
+shift_jis_character_iterator (const char *s)
+{
+ unsigned char c = *s;
+ if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xf9))
+ {
+ unsigned char c2 = s[1];
+ if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc))
+ return 2;
+ }
+ return 1;
+}
+
+/* Character iterator for JOHAB. See libiconv/lib/johab.h and
+ libiconv/lib/johab_hangul.h. */
+static size_t
+johab_character_iterator (const char *s)
+{
+ unsigned char c = *s;
+ if (c >= 0x84 && c <= 0xd3)
+ {
+ unsigned char c2 = s[1];
+ if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff))
+ return 2;
+ }
+ else if (c >= 0xd9 && c <= 0xf9)
+ {
+ unsigned char c2 = s[1];
+ if ((c2 >= 0x31 && c2 <= 0x7e) || (c2 >= 0x91 && c2 <= 0xfe))
+ return 2;
+ }
+ return 1;
+}
+
+/* Character iterator for UTF-8. See libiconv/lib/utf8.h. */
+static size_t
+utf8_character_iterator (const char *s)
+{
+ unsigned char c = *s;
+ if (c >= 0xc2)
+ {
+ if (c < 0xe0)
+ {
+ unsigned char c2 = s[1];
+ if (c2 >= 0x80 && c2 < 0xc0)
+ return 2;
+ }
+ else if (c < 0xf0)
+ {
+ unsigned char c2 = s[1];
+ if (c2 >= 0x80 && c2 < 0xc0)
+ {
+ unsigned char c3 = s[2];
+ if (c3 >= 0x80 && c3 < 0xc0)
+ return 3;
+ }
+ }
+ else if (c < 0xf8)
+ {
+ unsigned char c2 = s[1];
+ if (c2 >= 0x80 && c2 < 0xc0)
+ {
+ unsigned char c3 = s[2];
+ if (c3 >= 0x80 && c3 < 0xc0)
+ {
+ unsigned char c4 = s[3];
+ if (c4 >= 0x80 && c4 < 0xc0)
+ return 4;
+ }
+ }
+ }
+ }
+ return 1;
+}
+
+/* Returns a character iterator for a given encoding.
+ Given a pointer into a string, it returns the number of bytes occupied by
+ the next single character. If the piece of string is not valid or if
+ *s == '\0', it returns 1. */
+character_iterator_t
+po_charset_character_iterator (const char *canon_charset)
+{
+ if (canon_charset == utf8)
+ return utf8_character_iterator;
+ if (strcmp (canon_charset, "GB2312") == 0
+ || strcmp (canon_charset, "EUC-KR") == 0)
+ return euc_character_iterator;
+ if (strcmp (canon_charset, "EUC-JP") == 0)
+ return euc_jp_character_iterator;
+ if (strcmp (canon_charset, "EUC-TW") == 0)
+ return euc_tw_character_iterator;
+ if (strcmp (canon_charset, "BIG5") == 0)
+ return big5_character_iterator;
+ if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
+ return big5hkscs_character_iterator;
+ if (strcmp (canon_charset, "GBK") == 0)
+ return gbk_character_iterator;
+ if (strcmp (canon_charset, "GB18030") == 0)
+ return gb18030_character_iterator;
+ if (strcmp (canon_charset, "SHIFT_JIS") == 0)
+ return shift_jis_character_iterator;
+ if (strcmp (canon_charset, "JOHAB") == 0)
+ return johab_character_iterator;
+ return char_iterator;
+}
+
/* The PO file's encoding, as specified in the header entry. */
const char *po_lex_charset;
diff --git a/gettext-tools/src/po-charset.h b/gettext-tools/src/po-charset.h
index a75ec72..ce0bfd8 100644
--- a/gettext-tools/src/po-charset.h
+++ b/gettext-tools/src/po-charset.h
@@ -1,5 +1,5 @@
/* Charset handling while reading PO files.
- Copyright (C) 2001-2003 Free Software Foundation, Inc.
+ Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc.
Written by Bruno Haible <haible@clisp.cons.org>, 2001.
This program is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
#define _PO_CHARSET_H
#include <stdbool.h>
+#include <stddef.h>
#if HAVE_ICONV
#include <iconv.h>
@@ -55,6 +56,13 @@ extern bool po_is_charset_weird (const char *canon_charset);
0x{80..FF}{30..FF}. */
extern bool po_is_charset_weird_cjk (const char *canon_charset);
+/* Returns a character iterator for a given encoding.
+   The iterator, given a pointer into a string, returns the number of bytes
+   occupied by the next single character.  If the piece of string is not
+   valid or if *s == '\0', it returns 1.  */
+typedef size_t (*character_iterator_t) (const char *s);
+extern character_iterator_t po_charset_character_iterator (const char *canon_charset);
+
/* The PO file's encoding, as specified in the header entry. */
extern DLL_VARIABLE const char *po_lex_charset;
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog
index 118cc6b..23b6935 100644
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,12 @@
+2006-03-11 Bruno Haible <bruno@clisp.org>
+
+ * msgmerge-compendium-5: New file.
+ * msgmerge-v.comp.po: New file, from Clytie Siddall.
+ * msgmerge-v.pot: New file, from KDE project.
+ * msgmerge-v.out: New file.
+ * Makefile.am (TESTS): Add msgmerge-compendium-5.
+ (EXTRA_DIST): Add msgmerge-v.comp.po, msgmerge-v.pot, msgmerge-v.out.
+
2006-02-12 Bruno Haible <bruno@clisp.org>
* xgettext-c-14: New file.
diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am
index 0b0e103..baa8567 100644
--- a/gettext-tools/tests/Makefile.am
+++ b/gettext-tools/tests/Makefile.am
@@ -50,7 +50,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \
msgmerge-13 msgmerge-14 msgmerge-15 msgmerge-16 msgmerge-17 \
msgmerge-18 \
msgmerge-compendium-1 msgmerge-compendium-2 msgmerge-compendium-3 \
- msgmerge-compendium-4 \
+ msgmerge-compendium-4 msgmerge-compendium-5 \
msgmerge-properties-1 msgmerge-properties-2 \
msgmerge-update-1 msgmerge-update-2 msgmerge-update-3 \
msgunfmt-1 \
@@ -114,6 +114,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \
EXTRA_DIST += $(TESTS) \
test.mo xg-c-1.ok.po mex-test2.ok \
+ msgmerge-v.comp.po msgmerge-v.pot msgmerge-v.out \
msguniq-a.in msguniq-a.inp msguniq-a.out \
qttest_pl.po qttest_pl.qm \
qttest2_de.po qttest2_de.qm qttest2_de.ts \