diff options
author | Bruno Haible <bruno@clisp.org> | 2006-03-13 12:36:06 +0000 |
---|---|---|
committer | Bruno Haible <bruno@clisp.org> | 2009-06-23 12:13:02 +0200 |
commit | ade34ef0abc5f549de72090f106e23c027739886 (patch) | |
tree | 2e548dabf3b242d3a6f8649d3236650a9a4bc8a3 | |
parent | 92066d515918136e5620b47797f3a17a3458ded3 (diff) | |
download | external_gettext-ade34ef0abc5f549de72090f106e23c027739886.zip external_gettext-ade34ef0abc5f549de72090f106e23c027739886.tar.gz external_gettext-ade34ef0abc5f549de72090f106e23c027739886.tar.bz2 |
Speed up msgmerge with large compendia by use of fast fuzzy search.
-rw-r--r-- | NEWS | 2 | ||||
-rw-r--r-- | gettext-tools/ChangeLog | 5 | ||||
-rw-r--r-- | gettext-tools/configure.ac | 20 | ||||
-rw-r--r-- | gettext-tools/lib/ChangeLog | 5 | ||||
-rw-r--r-- | gettext-tools/lib/hash.c | 30 | ||||
-rw-r--r-- | gettext-tools/lib/hash.h | 9 | ||||
-rw-r--r-- | gettext-tools/src/ChangeLog | 47 | ||||
-rw-r--r-- | gettext-tools/src/Makefile.am | 7 | ||||
-rw-r--r-- | gettext-tools/src/Makefile.msvc | 5 | ||||
-rw-r--r-- | gettext-tools/src/Makefile.vms | 5 | ||||
-rw-r--r-- | gettext-tools/src/message.c | 54 | ||||
-rw-r--r-- | gettext-tools/src/message.h | 22 | ||||
-rw-r--r-- | gettext-tools/src/msgfmt.c | 2 | ||||
-rw-r--r-- | gettext-tools/src/msgmerge.c | 193 | ||||
-rw-r--r-- | gettext-tools/src/po-charset.c | 264 | ||||
-rw-r--r-- | gettext-tools/src/po-charset.h | 10 | ||||
-rw-r--r-- | gettext-tools/tests/ChangeLog | 9 | ||||
-rw-r--r-- | gettext-tools/tests/Makefile.am | 3 |
18 files changed, 630 insertions, 62 deletions
@@ -28,6 +28,8 @@ * msggrep has a new option -v/--invert-match that acts like grep's -v option. +* msgmerge is much faster now, when using a large compendium. + * Programming languages support: - C++ with Boost: diff --git a/gettext-tools/ChangeLog b/gettext-tools/ChangeLog index b7f285d..efb2363 100644 --- a/gettext-tools/ChangeLog +++ b/gettext-tools/ChangeLog @@ -1,3 +1,8 @@ +2006-03-11 Bruno Haible <bruno@clisp.org> + + * configure.ac (MSGMERGE_LIBM): New variable. Test for presence of + ceil() and sqrt(). + 2005-11-23 Bruno Haible <bruno@clisp.org> Cygwin portability. diff --git a/gettext-tools/configure.ac b/gettext-tools/configure.ac index f42667c..82bb4b3 100644 --- a/gettext-tools/configure.ac +++ b/gettext-tools/configure.ac @@ -1,5 +1,5 @@ dnl Configuration for the gettext-tools directory of GNU gettext -dnl Copyright (C) 1995-1999, 2000-2005 Free Software Foundation, Inc. +dnl Copyright (C) 1995-1999, 2000-2006 Free Software Foundation, Inc. dnl dnl This program is free software; you can redistribute it and/or modify dnl it under the terms of the GNU General Public License as published by @@ -114,6 +114,24 @@ AM_GNU_GETTEXT(use-libtool, need-ngettext) dnl This line internationalizes the bison generated parsers. BISON_I18N +dnl Test whether msgmerge must be linked against libm. This is the case on +dnl most systems; but BeOS has all <math.h> functions in libc and doesn't have +dnl a libm. +MSGMERGE_LIBM=? +AC_TRY_LINK([#include <math.h>], [static double x; x = ceil(x); x = sqrt(x);], + [MSGMERGE_LIBM=]) +if test "$MSGMERGE_LIBM" = "?"; then + save_LIBS="$LIBS" + LIBS="$LIBS -lm" + AC_TRY_LINK([#include <math.h>], [static double x; x = ceil(x); x = sqrt(x);], + [MSGMERGE_LIBM="-lm"]) + LIBS="$save_LIBS" +fi +if test "$MSGMERGE_LIBM" = "?"; then + MSGMERGE_LIBM= +fi +AC_SUBST([MSGMERGE_LIBM]) + dnl Checks for header files. 
AC_HEADER_STDC AC_CHECK_HEADERS(limits.h malloc.h pwd.h string.h unistd.h utime.h values.h) diff --git a/gettext-tools/lib/ChangeLog b/gettext-tools/lib/ChangeLog index ce72ca9..004fa37 100644 --- a/gettext-tools/lib/ChangeLog +++ b/gettext-tools/lib/ChangeLog @@ -1,3 +1,8 @@ +2006-03-11 Bruno Haible <bruno@clisp.org> + + * hash.h (hash_iterate_modify): New declaration. + * hash.c (hash_iterate_modify): New function. + 2006-01-10 Bruno Haible <bruno@clisp.org> * localcharset.c: Assume ANSI C. Fixes a gcc warning. diff --git a/gettext-tools/lib/hash.c b/gettext-tools/lib/hash.c index cb9c6bb..56725fa 100644 --- a/gettext-tools/lib/hash.c +++ b/gettext-tools/lib/hash.c @@ -346,3 +346,33 @@ hash_iterate (hash_table *htab, void **ptr, const void **key, size_t *keylen, *data = ((hash_entry *) *ptr)->data; return 0; } + + +/* Steps *PTR forward to the next used entry in the given hash table. *PTR + should be initially set to NULL. Store information about the next entry + in *KEY, *KEYLEN, *DATAP. *DATAP is set to point to the storage of the + value; modifying **DATAP will modify the value of the entry. + Return 0 normally, -1 when the whole hash table has been traversed. 
*/ +int +hash_iterate_modify (hash_table *htab, void **ptr, + const void **key, size_t *keylen, + void ***datap) +{ + if (*ptr == NULL) + { + if (htab->first == NULL) + return -1; + *ptr = (void *) ((hash_entry *) htab->first)->next; + } + else + { + if (*ptr == htab->first) + return -1; + *ptr = (void *) ((hash_entry *) *ptr)->next; + } + + *key = ((hash_entry *) *ptr)->key; + *keylen = ((hash_entry *) *ptr)->keylen; + *datap = &((hash_entry *) *ptr)->data; + return 0; +} diff --git a/gettext-tools/lib/hash.h b/gettext-tools/lib/hash.h index 6c4e71a..5563bd0 100644 --- a/gettext-tools/lib/hash.h +++ b/gettext-tools/lib/hash.h @@ -71,6 +71,15 @@ extern int hash_iterate (hash_table *htab, void **ptr, const void **key, size_t *keylen, void **data); +/* Steps *PTR forward to the next used entry in the given hash table. *PTR + should be initially set to NULL. Store information about the next entry + in *KEY, *KEYLEN, *DATAP. *DATAP is set to point to the storage of the + value; modifying **DATAP will modify the value of the entry. + Return 0 normally, -1 when the whole hash table has been traversed. */ +extern int hash_iterate_modify (hash_table *htab, void **ptr, + const void **key, size_t *keylen, + void ***datap); + /* Given SEED > 1, return the smallest odd prime number >= SEED. */ extern unsigned long int next_prime (unsigned long int seed); diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index e22fa00..dfe393b 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,50 @@ +2006-03-11 Bruno Haible <bruno@clisp.org> + + Speed up msgmerge with large compendia. + * message.h (message_list_free): Add keep_messages argument. + (message_list_list_free): Add keep_level argument. + (fuzzy_search_goal_function): New declaration. + (FUZZY_THRESHOLD): New macro. + * message.c (message_list_free): Add keep_messages argument. + (fuzzy_search_goal_function): New function, extracted from + message_list_search_fuzzy_inner. 
+ (message_list_search_fuzzy_inner): Use it. + (message_list_search_fuzzy): Use symbolic value FUZZY_THRESHOLD. + (message_list_list_free): Comment in. Add keep_level argument. + (message_list_list_search_fuzzy): Comment out. Use symbolic value + FUZZY_THRESHOLD. + (msgdomain_free): Update. + (msgdomain_list_search_fuzzy): Use symbolic value FUZZY_THRESHOLD. + * po-charset.h: Include stddef.h. + (character_iterator_t): New type. + (po_charset_character_iterator): New declaration. + * po-charset.c (char_iterator, euc_character_iterator, + euc_jp_character_iterator, euc_tw_character_iterator, + big5_character_iterator, big5hkscs_character_iterator, + gbk_character_iterator, gb18030_character_iterator, + shift_jis_character_iterator, johab_character_iterator, + utf8_character_iterator, po_charset_character_iterator): New functions. + * msgl-fsearch.h: New file. + * msgl-fsearch.c: New file. + * msgmerge.c: Include po-charset.h, msgl-fsearch.h. + (compendium_filenames): New variable. + (compendium): Also put the filename into compendium_filenames. + (definitions_ty): New structure type. + (definitions_init, definitions_init_findex, definitions_current_list, + definitions_set_current_list, definitions_search, + definitions_search_fuzzy, definitions_destroy): New functions. + (match_domain): Change type of 'definitions' argument. + (merge): Also convert the compendium to UTF-8. Use definitions_init, + definitions_set_current_list, definitions_destroy. + * msgfmt.c (main): Update. + * Makefile.am (noinst_HEADERS): Add msgl-fsearch.h. + (msgmerge_SOURCES): Add msgl-fsearch.c. + (msgmerge_LDADD): Link with MSGMERGE_LIBM. + * Makefile.msvc (msgmerge_OBJECTS): Add msgl-fsearch.obj. + (msgl-fsearch.obj): New rule. + * Makefile.vms (msgmerge_OBJECTS): Add msgl-fsearch.obj. + (msgl-fsearch.obj): New rule. + 2006-03-09 Bruno Haible <bruno@clisp.org> * Makefile.am (CXXLINK) [!mingw]: Overwrite automake's value. 
Fixes diff --git a/gettext-tools/src/Makefile.am b/gettext-tools/src/Makefile.am index 386cf60..b30d34f 100644 --- a/gettext-tools/src/Makefile.am +++ b/gettext-tools/src/Makefile.am @@ -41,7 +41,8 @@ str-list.h \ write-po.h write-properties.h write-stringtable.h \ dir-list.h file-list.h po-gram-gen.h po-gram-gen2.h \ msgl-charset.h msgl-equal.h msgl-iconv.h msgl-ascii.h msgl-cat.h \ -msgl-english.h msgl-check.h msgfmt.h msgunfmt.h plural-count.h plural-eval.h \ +msgl-english.h msgl-check.h msgl-fsearch.h msgfmt.h msgunfmt.h \ +plural-count.h plural-eval.h \ read-mo.h write-mo.h \ read-java.h write-java.h \ read-csharp.h write-csharp.h \ @@ -146,7 +147,7 @@ msgmerge_SOURCES = msgmerge.c else msgmerge_SOURCES = ../mingw/c++msgmerge.cc endif -msgmerge_SOURCES += plural-count.c +msgmerge_SOURCES += msgl-fsearch.c plural-count.c msgunfmt_SOURCES = msgunfmt.c msgunfmt_SOURCES += \ read-mo.c read-java.c read-csharp.c read-resources.c read-tcl.c @@ -247,7 +248,7 @@ libgettextpo_la_DEPENDENCIES = libgettextsrc.la # For msginit, it is also needed because of localename.c. 
msgcmp_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ msgfmt_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ -msgmerge_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ +msgmerge_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ @MSGMERGE_LIBM@ msgunfmt_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ xgettext_LDADD = $(LIBUNINAME) libgettextsrc.la @INTL_MACOSX_LIBS@ @LTLIBEXPAT@ msgattrib_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ diff --git a/gettext-tools/src/Makefile.msvc b/gettext-tools/src/Makefile.msvc index e1e37e4..01285c6 100644 --- a/gettext-tools/src/Makefile.msvc +++ b/gettext-tools/src/Makefile.msvc @@ -153,7 +153,7 @@ OBJECTS = \ msgcmp_OBJECTS = msgcmp.obj msgfmt_OBJECTS = msgfmt.obj write-mo.obj write-java.obj write-csharp.obj write-resources.obj write-tcl.obj write-qt.obj plural-eval.obj hash-string.obj -msgmerge_OBJECTS = msgmerge.obj plural-count.obj +msgmerge_OBJECTS = msgmerge.obj msgl-fsearch.obj plural-count.obj msgunfmt_OBJECTS = msgunfmt.obj read-mo.obj read-java.obj read-csharp.obj read-resources.obj read-tcl.obj xgettext_OBJECTS = xgettext.obj x-c.obj x-po.obj x-sh.obj x-python.obj x-lisp.obj x-elisp.obj x-librep.obj x-scheme.obj x-smalltalk.obj x-java.obj x-csharp.obj x-awk.obj x-ycp.obj x-tcl.obj x-perl.obj x-php.obj x-rst.obj x-glade.obj msgattrib_OBJECTS = msgattrib.obj @@ -361,6 +361,9 @@ hash-string.obj : ..\..\gettext-runtime\intl\hash-string.c msgmerge.obj : msgmerge.c $(CC) $(INCLUDES) $(CFLAGS) -DINSTALLPREFIX=\"$(IIprefix)\" -DINSTALLDIR=\"$(IIbindir)\" -c -Tp msgmerge.c +msgl-fsearch.obj : msgl-fsearch.c + $(CC) $(INCLUDES) $(CFLAGS) -c msgl-fsearch.c + plural-count.obj : plural-count.c $(CC) $(INCLUDES) $(CFLAGS) -c plural-count.c diff --git a/gettext-tools/src/Makefile.vms b/gettext-tools/src/Makefile.vms index cc6ba1d..eccfb82 100644 --- a/gettext-tools/src/Makefile.vms +++ b/gettext-tools/src/Makefile.vms @@ -99,7 +99,7 @@ OBJECTS = \ msgcmp_OBJECTS = msgcmp.obj msgfmt_OBJECTS = msgfmt.obj, write-mo.obj, write-java.obj, write-csharp.obj, 
write-resources.obj, write-tcl.obj, write-qt.obj, plural-eval.obj, hash-string.obj -msgmerge_OBJECTS = msgmerge.obj, plural-count.obj +msgmerge_OBJECTS = msgmerge.obj, msgl-fsearch.obj, plural-count.obj msgunfmt_OBJECTS = msgunfmt.obj, read-mo.obj, read-java.obj, read-csharp.obj, read-resources.obj, read-tcl.obj xgettext_OBJECTS = xgettext.obj, x-c.obj, x-po.obj, x-sh.obj, x-python.obj, x-lisp.obj, x-elisp.obj, x-librep.obj, x-scheme.obj, x-smalltalk.obj, x-java.obj, x-csharp.obj, x-awk.obj, x-ycp.obj, x-tcl.obj, x-perl.obj, x-php.obj, x-rst.obj, x-glade.obj msgattrib_OBJECTS = msgattrib.obj @@ -291,6 +291,9 @@ hash-string.obj : [-.-.gettext-runtime.intl]hash-string.c msgmerge.obj : msgmerge.c $(CC) $(INCLUDES) $(CFLAGS) /define=($(DEFS),"INSTALLPREFIX=""$(prefix)]""","INSTALLDIR=""$(bindir)]""") msgmerge.c +msgl-fsearch.obj : msgl-fsearch.c + $(CC) $(INCLUDES) $(CFLAGS) /define=($(DEFS)) msgl-fsearch.c + plural-count.obj : plural-count.c $(CC) $(INCLUDES) $(CFLAGS) /define=($(DEFS)) plural-count.c diff --git a/gettext-tools/src/message.c b/gettext-tools/src/message.c index 7c80acf..120b795 100644 --- a/gettext-tools/src/message.c +++ b/gettext-tools/src/message.c @@ -237,12 +237,13 @@ message_list_alloc (bool use_hashtable) void -message_list_free (message_list_ty *mlp) +message_list_free (message_list_ty *mlp, int keep_messages) { size_t j; - for (j = 0; j < mlp->nitems; ++j) - message_free (mlp->item[j]); + if (keep_messages == 0) + for (j = 0; j < mlp->nitems; ++j) + message_free (mlp->item[j]); if (mlp->item) free (mlp->item); if (mlp->use_hashtable) @@ -494,6 +495,23 @@ message_list_search (message_list_ty *mlp, } +double +fuzzy_search_goal_function (const message_ty *mp, + const char *msgctxt, const char *msgid) +{ + return + fstrcmp (msgid, mp->msgid) + /* A translation for a context is a good proposal also for + another. But give mp a small advantage if mp is valid + regardless of any context or has the same context as the + one being looked up. 
*/ + + ((mp->msgctxt == NULL + || (msgctxt != NULL && strcmp (msgctxt, mp->msgctxt) == 0)) + ? 0.00001 + : 0); +} + + static message_ty * message_list_search_fuzzy_inner (message_list_ty *mlp, const char *msgctxt, const char *msgid, @@ -511,16 +529,7 @@ message_list_search_fuzzy_inner (message_list_ty *mlp, if (mp->msgstr != NULL && mp->msgstr[0] != '\0') { - double weight = - fstrcmp (msgid, mp->msgid) - /* A translation for a context is a good proposal also for - another. But give mp a small advantage if mp is valid - regardless of any context or has the same context as the - one being looked up. */ - + ((mp->msgctxt == NULL - || (msgctxt != NULL && strcmp (msgctxt, mp->msgctxt) == 0)) - ? 0.00001 - : 0); + double weight = fuzzy_search_goal_function (mp, msgctxt, msgid); if (weight > *best_weight_p) { *best_weight_p = weight; @@ -538,7 +547,7 @@ message_list_search_fuzzy (message_list_ty *mlp, { double best_weight; - best_weight = 0.6; + best_weight = FUZZY_THRESHOLD; return message_list_search_fuzzy_inner (mlp, msgctxt, msgid, &best_weight); } @@ -556,19 +565,18 @@ message_list_list_alloc () } -#if 0 /* unused */ void -message_list_list_free (message_list_list_ty *mllp) +message_list_list_free (message_list_list_ty *mllp, int keep_level) { size_t j; - for (j = 0; j < mllp->nitems; ++j) - message_list_free (mllp->item[j]); + if (keep_level < 2) + for (j = 0; j < mllp->nitems; ++j) + message_list_free (mllp->item[j], keep_level); if (mllp->item) free (mllp->item); free (mllp); } -#endif void @@ -628,6 +636,7 @@ message_list_list_search (message_list_list_ty *mllp, } +#if 0 /* unused */ message_ty * message_list_list_search_fuzzy (message_list_list_ty *mllp, const char *msgctxt, const char *msgid) @@ -636,7 +645,7 @@ message_list_list_search_fuzzy (message_list_list_ty *mllp, double best_weight; message_ty *best_mp; - best_weight = 0.6; + best_weight = FUZZY_THRESHOLD; best_mp = NULL; for (j = 0; j < mllp->nitems; ++j) { @@ -650,6 +659,7 @@ 
message_list_list_search_fuzzy (message_list_list_ty *mllp, } return best_mp; } +#endif msgdomain_ty* @@ -667,7 +677,7 @@ msgdomain_alloc (const char *domain, bool use_hashtable) void msgdomain_free (msgdomain_ty *mdp) { - message_list_free (mdp->messages); + message_list_free (mdp->messages, 0); free (mdp); } @@ -783,7 +793,7 @@ msgdomain_list_search_fuzzy (msgdomain_list_ty *mdlp, double best_weight; message_ty *best_mp; - best_weight = 0.6; + best_weight = FUZZY_THRESHOLD; best_mp = NULL; for (j = 0; j < mdlp->nitems; ++j) { diff --git a/gettext-tools/src/message.h b/gettext-tools/src/message.h index c6d01ad..5a07301 100644 --- a/gettext-tools/src/message.h +++ b/gettext-tools/src/message.h @@ -196,8 +196,11 @@ struct message_list_ty known that the message list will not contain duplicate msgids. */ extern message_list_ty * message_list_alloc (bool use_hashtable); +/* Free a message list. + If keep_messages = 0, also free the messages. If keep_messages = 1, don't + free the messages. */ extern void - message_list_free (message_list_ty *mlp); + message_list_free (message_list_ty *mlp, int keep_messages); extern void message_list_append (message_list_ty *mlp, message_ty *mp); extern void @@ -232,8 +235,12 @@ struct message_list_list_ty extern message_list_list_ty * message_list_list_alloc (void); +/* Free a list of message lists. + If keep_level = 0, also free the messages. If keep_level = 1, don't free + the messages but free the lists. If keep_level = 2, don't free the + the messages and the lists. */ extern void - message_list_list_free (message_list_list_ty *mllp); + message_list_list_free (message_list_list_ty *mllp, int keep_level); extern void message_list_list_append (message_list_list_ty *mllp, message_list_ty *mlp); @@ -291,6 +298,17 @@ extern message_ty * const char *msgctxt, const char *msgid); +/* The goal function used in fuzzy search. + Higher values indicate a closer match. 
*/ +extern double + fuzzy_search_goal_function (const message_ty *mp, + const char *msgctxt, const char *msgid); + +/* The threshold for fuzzy-searching. + A message is considered only if fstrcmp (msg, given) > FUZZY_THRESHOLD. */ +#define FUZZY_THRESHOLD 0.6 + + #ifdef __cplusplus } #endif diff --git a/gettext-tools/src/msgfmt.c b/gettext-tools/src/msgfmt.c index 8b871bf..e8d5653 100644 --- a/gettext-tools/src/msgfmt.c +++ b/gettext-tools/src/msgfmt.c @@ -588,7 +588,7 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\ } /* List is not used anymore. */ - message_list_free (domain->mlp); + message_list_free (domain->mlp, 0); } /* Print statistics if requested. */ diff --git a/gettext-tools/src/msgmerge.c b/gettext-tools/src/msgmerge.c index 18e3e9b..8c5d068 100644 --- a/gettext-tools/src/msgmerge.c +++ b/gettext-tools/src/msgmerge.c @@ -47,8 +47,10 @@ #include "c-strcase.h" #include "stpcpy.h" #include "stpncpy.h" +#include "po-charset.h" #include "msgl-iconv.h" #include "msgl-equal.h" +#include "msgl-fsearch.h" #include "plural-count.h" #include "backupfile.h" #include "copy-file.h" @@ -78,6 +80,9 @@ static bool use_fuzzy_matching = true; /* List of user-specified compendiums. */ static message_list_list_ty *compendiums; +/* List of corresponding filenames. */ +static string_list_ty *compendium_filenames; + /* Update mode. 
*/ static bool update_mode = false; static const char *version_control_string; @@ -560,10 +565,132 @@ compendium (const char *filename) size_t k; mdlp = read_po_file (filename); - if (!compendiums) - compendiums = message_list_list_alloc (); + if (compendiums == NULL) + { + compendiums = message_list_list_alloc (); + compendium_filenames = string_list_alloc (); + } for (k = 0; k < mdlp->nitems; k++) - message_list_list_append (compendiums, mdlp->item[k]->messages); + { + message_list_list_append (compendiums, mdlp->item[k]->messages); + string_list_append (compendium_filenames, filename); + } +} + + +/* Data structure representing the messages with known translations. + They are composed of + - A message list from def.po, + - The compendiums. + The data structure is optimized for exact and fuzzy searches. */ +typedef struct definitions_ty definitions_ty; +struct definitions_ty +{ + /* A list of message lists. The first comes from def.po, the other ones + from the compendiums. Each message list has a built-in hash table, + for speed when doing the exact searches. */ + message_list_list_ty *lists; + /* A fuzzy index of the compendiums, for speed when doing fuzzy searches. + Used only if use_fuzzy_matching is true and compendiums != NULL. */ + message_fuzzy_index_ty *findex; + /* The canonical encoding of the compendiums. */ + const char *canon_charset; +}; + +static inline void +definitions_init (definitions_ty *definitions, const char *canon_charset) +{ + definitions->lists = message_list_list_alloc (); + message_list_list_append (definitions->lists, NULL); + if (compendiums != NULL) + message_list_list_append_list (definitions->lists, compendiums); + definitions->findex = NULL; + definitions->canon_charset = canon_charset; +} + +/* Create the fuzzy index. + Used only if use_fuzzy_matching is true and compendiums != NULL. */ +static inline void +definitions_init_findex (definitions_ty *definitions) +{ + /* Combine all the compendium message lists into a single one. 
Don't + bother checking for duplicates. */ + message_list_ty *all_compendium; + size_t i; + + all_compendium = message_list_alloc (false); + for (i = 0; i < compendiums->nitems; i++) + { + message_list_ty *mlp = compendiums->item[i]; + size_t j; + + for (j = 0; j < mlp->nitems; j++) + message_list_append (all_compendium, mlp->item[j]); + } + + /* Create the fuzzy index from it. */ + definitions->findex = + message_fuzzy_index_alloc (all_compendium, definitions->canon_charset); +} + +/* Return the current list of non-compendium messages. */ +static inline message_list_ty * +definitions_current_list (const definitions_ty *definitions) +{ + return definitions->lists->item[0]; +} + +/* Set the current list of non-compendium messages. */ +static inline void +definitions_set_current_list (definitions_ty *definitions, message_list_ty *mlp) +{ + definitions->lists->item[0] = mlp; +} + +/* Exact search. */ +static inline message_ty * +definitions_search (const definitions_ty *definitions, + const char *msgctxt, const char *msgid) +{ + return message_list_list_search (definitions->lists, msgctxt, msgid); +} + +/* Fuzzy search. + Used only if use_fuzzy_matching is true. */ +static inline message_ty * +definitions_search_fuzzy (definitions_ty *definitions, + const char *msgctxt, const char *msgid) +{ + message_ty *mp1 = + message_list_search_fuzzy (definitions_current_list (definitions), + msgctxt, msgid); + if (compendiums != NULL) + { + message_ty *mp2; + + /* Create the fuzzy index lazily. */ + if (definitions->findex == NULL) + definitions_init_findex (definitions); + + mp2 = message_fuzzy_index_search (definitions->findex, msgctxt, msgid); + + /* Choose the best among mp1, mp2. 
*/ + if (mp1 == NULL + || (mp2 != NULL + && (fuzzy_search_goal_function (mp2, msgctxt, msgid) + > fuzzy_search_goal_function (mp1, msgctxt, msgid)))) + mp1 = mp2; + } + + return mp1; +} + +static inline void +definitions_destroy (definitions_ty *definitions) +{ + message_list_list_free (definitions->lists, 2); + if (definitions->findex != NULL) + message_fuzzy_index_free (definitions->findex); } @@ -907,7 +1034,7 @@ message_merge (message_ty *def, message_ty *ref) static void match_domain (const char *fn1, const char *fn2, - message_list_list_ty *definitions, message_list_ty *refmlp, + definitions_ty *definitions, message_list_ty *refmlp, message_list_ty *resultmlp, struct statistics *stats, unsigned int *processed) { @@ -916,7 +1043,8 @@ match_domain (const char *fn1, const char *fn2, char *untranslated_plural_msgstr; size_t j; - header_entry = message_list_search (definitions->item[0], NULL, ""); + header_entry = + message_list_search (definitions_current_list (definitions), NULL, ""); nplurals = get_plural_count (header_entry ? header_entry->msgstr : NULL); untranslated_plural_msgstr = (char *) xmalloc (nplurals); memset (untranslated_plural_msgstr, '\0', nplurals); @@ -934,8 +1062,7 @@ match_domain (const char *fn1, const char *fn2, refmsg = refmlp->item[j]; /* See if it is in the other file. */ - defmsg = - message_list_list_search (definitions, refmsg->msgctxt, refmsg->msgid); + defmsg = definitions_search (definitions, refmsg->msgctxt, refmsg->msgid); if (defmsg) { /* Merge the reference with the definition: take the #. and @@ -958,9 +1085,9 @@ match_domain (const char *fn1, const char *fn2, help. 
*/ if (use_fuzzy_matching && ((defmsg = - message_list_list_search_fuzzy (definitions, - refmsg->msgctxt, - refmsg->msgid)) != NULL)) + definitions_search_fuzzy (definitions, + refmsg->msgctxt, + refmsg->msgid)) != NULL)) { message_ty *mp; @@ -1117,7 +1244,7 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp) unsigned int processed; struct statistics stats; msgdomain_list_ty *result; - message_list_list_ty *definitions; + definitions_ty definitions; message_list_ty *empty_list; stats.merged = stats.fuzzied = stats.missing = stats.obsolete = 0; @@ -1125,15 +1252,6 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp) /* This is the definitions file, created by a human. */ def = read_po_file (fn1); - /* Create the set of places to look for message definitions: a list - whose first element will be definitions for the current domain, and - whose other elements come from the compendiums. */ - definitions = message_list_list_alloc (); - message_list_list_append (definitions, NULL); - if (compendiums) - message_list_list_append_list (definitions, compendiums); - empty_list = message_list_alloc (false); - /* This is the references file, created by groping the sources with the xgettext program. */ ref = read_po_file (fn2); @@ -1148,7 +1266,8 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp) } /* The references file can be either in ASCII or in UTF-8. If it is - in UTF-8, we have to convert the definitions to UTF-8 as well. */ + in UTF-8, we have to convert the definitions and the compendiums to + UTF-8 as well. 
*/ { bool was_utf8 = false; for (k = 0; k < ref->nitems; k++) @@ -1178,9 +1297,23 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp) } } if (was_utf8) - def = iconv_msgdomain_list (def, "UTF-8", fn1); + { + def = iconv_msgdomain_list (def, "UTF-8", fn1); + if (compendiums != NULL) + for (k = 0; k < compendiums->nitems; k++) + iconv_message_list (compendiums->item[k], NULL, po_charset_utf8, + compendium_filenames->item[k]); + } + else + { + /* TODO: Convert all compendiums->item[k] to the same encoding. */ + } } + /* Initialize and preprocess the total set of message definitions. */ + definitions_init (&definitions, po_charset_utf8); + empty_list = message_list_alloc (false); + result = msgdomain_list_alloc (false); processed = 0; @@ -1192,12 +1325,14 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp) message_list_ty *refmlp = ref->item[k]->messages; message_list_ty *resultmlp = msgdomain_list_sublist (result, domain, true); + message_list_ty *defmlp; - definitions->item[0] = msgdomain_list_sublist (def, domain, false); - if (definitions->item[0] == NULL) - definitions->item[0] = empty_list; + defmlp = msgdomain_list_sublist (def, domain, false); + if (defmlp == NULL) + defmlp = empty_list; + definitions_set_current_list (&definitions, defmlp); - match_domain (fn1, fn2, definitions, refmlp, resultmlp, + match_domain (fn1, fn2, &definitions, refmlp, resultmlp, &stats, &processed); } else @@ -1217,14 +1352,16 @@ merge (const char *fn1, const char *fn2, msgdomain_list_ty **defp) message_list_ty *resultmlp = msgdomain_list_sublist (result, domain, true); - definitions->item[0] = defmlp; + definitions_set_current_list (&definitions, defmlp); - match_domain (fn1, fn2, definitions, refmlp, resultmlp, + match_domain (fn1, fn2, &definitions, refmlp, resultmlp, &stats, &processed); } } } + definitions_destroy (&definitions); + /* Look for messages in the definition file, which are not present in the reference file, indicating messages 
which defined but not used in the program. Don't scan the compendium(s). */ diff --git a/gettext-tools/src/po-charset.c b/gettext-tools/src/po-charset.c index cff5826..23d29f0 100644 --- a/gettext-tools/src/po-charset.c +++ b/gettext-tools/src/po-charset.c @@ -1,5 +1,5 @@ /* Charset handling while reading PO files. - Copyright (C) 2001-2005 Free Software Foundation, Inc. + Copyright (C) 2001-2006 Free Software Foundation, Inc. Written by Bruno Haible <haible@clisp.cons.org>, 2001. This program is free software; you can redistribute it and/or modify @@ -169,6 +169,268 @@ bool po_is_charset_weird_cjk (const char *canon_charset) return false; } +/* Hardcoded iterator functions for all kinds of encodings. + We could also implement a general iterator function with iconv(), + but we need a fast one. */ + +/* Character iterator for 8-bit encodings. */ +static size_t +char_iterator (const char *s) +{ + return 1; +} + +/* Character iterator for GB2312. See libiconv/lib/euc_cn.h. */ +/* Character iterator for EUC-KR. See libiconv/lib/euc_kr.h. */ +static size_t +euc_character_iterator (const char *s) +{ + unsigned char c = *s; + if (c >= 0xa1 && c < 0xff) + { + unsigned char c2 = s[1]; + if (c2 >= 0xa1 && c2 < 0xff) + return 2; + } + return 1; +} + +/* Character iterator for EUC-JP. See libiconv/lib/euc_jp.h. */ +static size_t +euc_jp_character_iterator (const char *s) +{ + unsigned char c = *s; + if (c >= 0xa1 && c < 0xff) + { + unsigned char c2 = s[1]; + if (c2 >= 0xa1 && c2 < 0xff) + return 2; + } + else if (c == 0x8e) + { + unsigned char c2 = s[1]; + if (c2 >= 0xa1 && c2 < 0xe0) + return 2; + } + else if (c == 0x8f) + { + unsigned char c2 = s[1]; + if (c2 >= 0xa1 && c2 < 0xff) + { + unsigned char c3 = s[2]; + if (c3 >= 0xa1 && c3 < 0xff) + return 3; + } + } + return 1; +} + +/* Character iterator for EUC-TW. See libiconv/lib/euc_tw.h. 
*/ +static size_t +euc_tw_character_iterator (const char *s) +{ + unsigned char c = *s; + if (c >= 0xa1 && c < 0xff) + { + unsigned char c2 = s[1]; + if (c2 >= 0xa1 && c2 < 0xff) + return 2; + } + else if (c == 0x8e) + { + unsigned char c2 = s[1]; + if (c2 >= 0xa1 && c2 <= 0xb0) + { + unsigned char c3 = s[2]; + if (c3 >= 0xa1 && c3 < 0xff) + { + unsigned char c4 = s[3]; + if (c4 >= 0xa1 && c4 < 0xff) + return 4; + } + } + } + return 1; +} + +/* Character iterator for BIG5. See libiconv/lib/ces_big5.h. */ +static size_t +big5_character_iterator (const char *s) +{ + unsigned char c = *s; + if (c >= 0xa1 && c < 0xff) + { + unsigned char c2 = s[1]; + if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff)) + return 2; + } + return 1; +} + +/* Character iterator for BIG5-HKSCS. See libiconv/lib/big5hkscs.h. */ +static size_t +big5hkscs_character_iterator (const char *s) +{ + unsigned char c = *s; + if (c >= 0x88 && c < 0xff) + { + unsigned char c2 = s[1]; + if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff)) + return 2; + } + return 1; +} + +/* Character iterator for GBK. See libiconv/lib/ces_gbk.h and + libiconv/lib/gbk.h. */ +static size_t +gbk_character_iterator (const char *s) +{ + unsigned char c = *s; + if (c >= 0x81 && c < 0xff) + { + unsigned char c2 = s[1]; + if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff)) + return 2; + } + return 1; +} + +/* Character iterator for GB18030. See libiconv/lib/gb18030.h. */ +static size_t +gb18030_character_iterator (const char *s) +{ + unsigned char c = *s; + if (c >= 0x81 && c < 0xff) + { + unsigned char c2 = s[1]; + if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff)) + return 2; + } + if (c >= 0x81 && c <= 0x84) + { + unsigned char c2 = s[1]; + if (c2 >= 0x30 && c2 <= 0x39) + { + unsigned char c3 = s[2]; + if (c3 >= 0x81 && c3 < 0xff) + { + unsigned char c4 = s[3]; + if (c4 >= 0x30 && c4 <= 0x39) + return 4; + } + } + } + return 1; +} + +/* Character iterator for SHIFT_JIS. 
See libiconv/lib/sjis.h. */ +static size_t +shift_jis_character_iterator (const char *s) +{ + unsigned char c = *s; + if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xf9)) + { + unsigned char c2 = s[1]; + if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc)) + return 2; + } + return 1; +} + +/* Character iterator for JOHAB. See libiconv/lib/johab.h and + libiconv/lib/johab_hangul.h. */ +static size_t +johab_character_iterator (const char *s) +{ + unsigned char c = *s; + if (c >= 0x84 && c <= 0xd3) + { + unsigned char c2 = s[1]; + if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff)) + return 2; + } + else if (c >= 0xd9 && c <= 0xf9) + { + unsigned char c2 = s[1]; + if ((c2 >= 0x31 && c2 <= 0x7e) || (c2 >= 0x91 && c2 <= 0xfe)) + return 2; + } + return 1; +} + +/* Character iterator for UTF-8. See libiconv/lib/utf8.h. */ +static size_t +utf8_character_iterator (const char *s) +{ + unsigned char c = *s; + if (c >= 0xc2) + { + if (c < 0xe0) + { + unsigned char c2 = s[1]; + if (c2 >= 0x80 && c2 < 0xc0) + return 2; + } + else if (c < 0xf0) + { + unsigned char c2 = s[1]; + if (c2 >= 0x80 && c2 < 0xc0) + { + unsigned char c3 = s[2]; + if (c3 >= 0x80 && c3 < 0xc0) + return 3; + } + } + else if (c < 0xf8) + { + unsigned char c2 = s[1]; + if (c2 >= 0x80 && c2 < 0xc0) + { + unsigned char c3 = s[2]; + if (c3 >= 0x80 && c3 < 0xc0) + { + unsigned char c4 = s[3]; + if (c4 >= 0x80 && c4 < 0xc0) + return 4; + } + } + } + } + return 1; +} + +/* Returns a character iterator for a given encoding. + Given a pointer into a string, it returns the number occupied by the next + single character. If the piece of string is not valid or if the *s == '\0', + it returns 1. 
*/
+character_iterator_t
+po_charset_character_iterator (const char *canon_charset)
+{
+  if (canon_charset == utf8)            /* compared with ==, not strcmp; utf8 is defined outside this hunk */
+    return utf8_character_iterator;
+  if (strcmp (canon_charset, "GB2312") == 0
+      || strcmp (canon_charset, "EUC-KR") == 0)         /* these two names share one iterator */
+    return euc_character_iterator;
+  if (strcmp (canon_charset, "EUC-JP") == 0)
+    return euc_jp_character_iterator;
+  if (strcmp (canon_charset, "EUC-TW") == 0)
+    return euc_tw_character_iterator;
+  if (strcmp (canon_charset, "BIG5") == 0)
+    return big5_character_iterator;
+  if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
+    return big5hkscs_character_iterator;
+  if (strcmp (canon_charset, "GBK") == 0)
+    return gbk_character_iterator;
+  if (strcmp (canon_charset, "GB18030") == 0)
+    return gb18030_character_iterator;
+  if (strcmp (canon_charset, "SHIFT_JIS") == 0)
+    return shift_jis_character_iterator;
+  if (strcmp (canon_charset, "JOHAB") == 0)
+    return johab_character_iterator;
+  return char_iterator;                 /* used when none of the names above matched */
+}
+
 /* The PO file's encoding, as specified in the header entry.  */
 const char *po_lex_charset;

diff --git a/gettext-tools/src/po-charset.h b/gettext-tools/src/po-charset.h
index a75ec72..ce0bfd8 100644
--- a/gettext-tools/src/po-charset.h
+++ b/gettext-tools/src/po-charset.h
@@ -1,5 +1,5 @@
 /* Charset handling while reading PO files.
-   Copyright (C) 2001-2003 Free Software Foundation, Inc.
+   Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc.
    Written by Bruno Haible <haible@clisp.cons.org>, 2001.

    This program is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
 #define _PO_CHARSET_H

 #include <stdbool.h>
+#include <stddef.h>

 #if HAVE_ICONV
 #include <iconv.h>
@@ -55,6 +56,13 @@ extern bool po_is_charset_weird (const char *canon_charset);
    0x{80..FF}{30..FF}.  */
 extern bool po_is_charset_weird_cjk (const char *canon_charset);

+/* Returns a character iterator for a given encoding.
+   Given a pointer into a string, it returns the number of bytes occupied by
+   the next single character. 
If the piece of string is not valid or if *s == '\0', + it returns 1. */ +typedef size_t (*character_iterator_t) (const char *s); +extern character_iterator_t po_charset_character_iterator (const char *canon_charset); + /* The PO file's encoding, as specified in the header entry. */ extern DLL_VARIABLE const char *po_lex_charset; diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog index 118cc6b..23b6935 100644 --- a/gettext-tools/tests/ChangeLog +++ b/gettext-tools/tests/ChangeLog @@ -1,3 +1,12 @@ +2006-03-11 Bruno Haible <bruno@clisp.org> + + * msgmerge-compendium-5: New file. + * msgmerge-v.comp.po: New file, from Clytie Siddall. + * msgmerge-v.pot: New file, from KDE project. + * msgmerge-v.out: New file. + * Makefile.am (TESTS): Add msgmerge-compendium-5. + (EXTRA_DIST): Add msgmerge-v.comp.po, msgmerge-v.pot, msgmerge-v.out. + 2006-02-12 Bruno Haible <bruno@clisp.org> * xgettext-c-14: New file. diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index 0b0e103..baa8567 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -50,7 +50,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \ msgmerge-13 msgmerge-14 msgmerge-15 msgmerge-16 msgmerge-17 \ msgmerge-18 \ msgmerge-compendium-1 msgmerge-compendium-2 msgmerge-compendium-3 \ - msgmerge-compendium-4 \ + msgmerge-compendium-4 msgmerge-compendium-5 \ msgmerge-properties-1 msgmerge-properties-2 \ msgmerge-update-1 msgmerge-update-2 msgmerge-update-3 \ msgunfmt-1 \ @@ -114,6 +114,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \ EXTRA_DIST += $(TESTS) \ test.mo xg-c-1.ok.po mex-test2.ok \ + msgmerge-v.comp.po msgmerge-v.pot msgmerge-v.out \ msguniq-a.in msguniq-a.inp msguniq-a.out \ qttest_pl.po qttest_pl.qm \ qttest2_de.po qttest2_de.qm qttest2_de.ts \ |