/* Message list concatenation and duplicate handling. Copyright (C) 2001-2003, 2005 Free Software Foundation, Inc. Written by Bruno Haible , 2001. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifdef HAVE_CONFIG_H # include "config.h" #endif #include /* Specification. */ #include "msgl-cat.h" #include #include #include #include "error.h" #include "xerror.h" #include "message.h" #include "read-po.h" #include "po-charset.h" #include "msgl-ascii.h" #include "msgl-equal.h" #include "msgl-iconv.h" #include "xalloc.h" #include "xallocsa.h" #include "strstr.h" #include "basename.h" #include "exit.h" #include "gettext.h" #define _(str) gettext (str) /* These variables control which messages are selected. */ int more_than; int less_than; /* If true, use the first available translation. If false, merge all available translations into one and fuzzy it. */ bool use_first; /* If true, merge like msgcomm. If false, merge like msgcat and msguniq. */ bool msgcomm_mode = false; /* If true, omit the header entry. If false, keep the header entry present in the input. */ bool omit_header = false; static bool is_message_selected (const message_ty *tmp) { int used = (tmp->used >= 0 ? tmp->used : - tmp->used); return (is_header (tmp) ? !omit_header /* keep the header entry */ : (used > more_than && used < less_than)); } static bool is_message_needed (const message_ty *mp) { if (!msgcomm_mode && ((!is_header (mp) && mp->is_fuzzy) || mp->msgstr[0] == '\0')) /* Weak translation. Needed if there are only weak translations. */ return mp->tmp->used < 0 && is_message_selected (mp->tmp); else /* Good translation. */ return is_message_selected (mp->tmp); } /* The use_first logic. */ static bool is_message_first_needed (const message_ty *mp) { if (mp->tmp->obsolete && is_message_needed (mp)) { mp->tmp->obsolete = false; return true; } else return false; } msgdomain_list_ty * catenate_msgdomain_list (string_list_ty *file_list, const char *to_code) { const char * const *files = file_list->item; size_t nfiles = file_list->nitems; msgdomain_list_ty **mdlps; const char ***canon_charsets; const char ***identifications; msgdomain_list_ty *total_mdlp; const char *canon_to_code; size_t n, j, k; /* Read input files. */ mdlps = (msgdomain_list_ty **) xmalloc (nfiles * sizeof (msgdomain_list_ty *)); for (n = 0; n < nfiles; n++) mdlps[n] = read_po_file (files[n]); /* Determine the canonical name of each input file's encoding. */ canon_charsets = (const char ***) xmalloc (nfiles * sizeof (const char **)); for (n = 0; n < nfiles; n++) { msgdomain_list_ty *mdlp = mdlps[n]; size_t k; canon_charsets[n] = (const char **) xmalloc (mdlp->nitems * sizeof (const char *)); for (k = 0; k < mdlp->nitems; k++) { message_list_ty *mlp = mdlp->item[k]->messages; const char *canon_from_code = NULL; if (mlp->nitems > 0) { for (j = 0; j < mlp->nitems; j++) if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete) { const char *header = mlp->item[j]->msgstr; if (header != NULL) { const char *charsetstr = strstr (header, "charset="); if (charsetstr != NULL) { size_t len; char *charset; const char *canon_charset; charsetstr += strlen ("charset="); len = strcspn (charsetstr, " \t\n"); charset = (char *) xallocsa (len + 1); memcpy (charset, charsetstr, len); charset[len] = '\0'; canon_charset = po_charset_canonicalize (charset); if (canon_charset == NULL) { /* Don't give an error for POT files, because POT files usually contain only ASCII msgids. */ const char *filename = files[n]; size_t filenamelen = strlen (filename); if (filenamelen >= 4 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0 && strcmp (charset, "CHARSET") == 0) canon_charset = po_charset_ascii; else error (EXIT_FAILURE, 0, _("\ present charset \"%s\" is not a portable encoding name"), charset); } freesa (charset); if (canon_from_code == NULL) canon_from_code = canon_charset; else if (canon_from_code != canon_charset) error (EXIT_FAILURE, 0, _("\ two different charsets \"%s\" and \"%s\" in input file"), canon_from_code, canon_charset); } } } if (canon_from_code == NULL) { if (is_ascii_message_list (mlp)) canon_from_code = po_charset_ascii; else if (mdlp->encoding != NULL) canon_from_code = mdlp->encoding; else { if (k == 0) error (EXIT_FAILURE, 0, _("\ input file `%s' doesn't contain a header entry with a charset specification"), files[n]); else error (EXIT_FAILURE, 0, _("\ domain \"%s\" in input file `%s' doesn't contain a header entry with a charset specification"), mdlp->item[k]->domain, files[n]); } } } canon_charsets[n][k] = canon_from_code; } } /* Determine textual identifications of each file/domain combination. */ identifications = (const char ***) xmalloc (nfiles * sizeof (const char **)); for (n = 0; n < nfiles; n++) { const char *filename = basename (files[n]); msgdomain_list_ty *mdlp = mdlps[n]; size_t k; identifications[n] = (const char **) xmalloc (mdlp->nitems * sizeof (const char *)); for (k = 0; k < mdlp->nitems; k++) { const char *domain = mdlp->item[k]->domain; message_list_ty *mlp = mdlp->item[k]->messages; char *project_id = NULL; for (j = 0; j < mlp->nitems; j++) if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete) { const char *header = mlp->item[j]->msgstr; if (header != NULL) { const char *cp = strstr (header, "Project-Id-Version:"); if (cp != NULL) { const char *endp; cp += sizeof ("Project-Id-Version:") - 1; endp = strchr (cp, '\n'); if (endp == NULL) endp = cp + strlen (cp); while (cp < endp && *cp == ' ') cp++; if (cp < endp) { size_t len = endp - cp; project_id = (char *) xmalloc (len + 1); memcpy (project_id, cp, len); project_id[len] = '\0'; } break; } } } identifications[n][k] = (project_id != NULL ? (k > 0 ? xasprintf ("%s:%s (%s)", filename, domain, project_id) : xasprintf ("%s (%s)", filename, project_id)) : (k > 0 ? xasprintf ("%s:%s", filename, domain) : xasprintf ("%s", filename))); } } /* Create list of resulting messages, but don't fill it. Only count the number of translations for each message. If for a message, there is at least one non-fuzzy, non-empty translation, use only the non-fuzzy, non-empty translations. Otherwise use the fuzzy or empty translations as well. */ total_mdlp = msgdomain_list_alloc (true); for (n = 0; n < nfiles; n++) { msgdomain_list_ty *mdlp = mdlps[n]; for (k = 0; k < mdlp->nitems; k++) { const char *domain = mdlp->item[k]->domain; message_list_ty *mlp = mdlp->item[k]->messages; message_list_ty *total_mlp; total_mlp = msgdomain_list_sublist (total_mdlp, domain, true); for (j = 0; j < mlp->nitems; j++) { message_ty *mp = mlp->item[j]; message_ty *tmp; size_t i; tmp = message_list_search (total_mlp, mp->msgctxt, mp->msgid); if (tmp == NULL) { tmp = message_alloc (mp->msgctxt, mp->msgid, mp->msgid_plural, NULL, 0, &mp->pos); tmp->is_fuzzy = true; /* may be set to false later */ for (i = 0; i < NFORMATS; i++) tmp->is_format[i] = undecided; /* may be set to yes/no later */ tmp->do_wrap = yes; /* may be set to no later */ tmp->obsolete = true; /* may be set to false later */ tmp->alternative_count = 0; tmp->alternative = NULL; message_list_append (total_mlp, tmp); } if (!msgcomm_mode && ((!is_header (mp) && mp->is_fuzzy) || mp->msgstr[0] == '\0')) /* Weak translation. Counted as negative tmp->used. */ { if (tmp->used <= 0) tmp->used--; } else /* Good translation. Counted as positive tmp->used. */ { if (tmp->used < 0) tmp->used = 0; tmp->used++; } mp->tmp = tmp; } } } /* Remove messages that are not used and need not be converted. */ for (n = 0; n < nfiles; n++) { msgdomain_list_ty *mdlp = mdlps[n]; for (k = 0; k < mdlp->nitems; k++) { message_list_ty *mlp = mdlp->item[k]->messages; message_list_remove_if_not (mlp, use_first ? is_message_first_needed : is_message_needed); /* If no messages are remaining, drop the charset. */ if (mlp->nitems == 0) canon_charsets[n][k] = NULL; } } for (k = 0; k < total_mdlp->nitems; k++) { message_list_ty *mlp = total_mdlp->item[k]->messages; message_list_remove_if_not (mlp, is_message_selected); } /* Determine the common known a-priori encoding, if any. */ if (nfiles > 0) { bool all_same_encoding = true; for (n = 1; n < nfiles; n++) if (mdlps[n]->encoding != mdlps[0]->encoding) { all_same_encoding = false; break; } if (all_same_encoding) total_mdlp->encoding = mdlps[0]->encoding; } /* Determine the target encoding for the remaining messages. */ if (to_code != NULL) { /* Canonicalize target encoding. */ canon_to_code = po_charset_canonicalize (to_code); if (canon_to_code == NULL) error (EXIT_FAILURE, 0, _("target charset \"%s\" is not a portable encoding name."), to_code); } else { /* No target encoding was specified. Test whether the messages are all in a single encoding. If so, conversion is not needed. */ const char *first = NULL; const char *second = NULL; bool with_ASCII = false; bool with_UTF8 = false; bool all_ASCII_compatible = true; for (n = 0; n < nfiles; n++) { msgdomain_list_ty *mdlp = mdlps[n]; for (k = 0; k < mdlp->nitems; k++) if (canon_charsets[n][k] != NULL) { if (canon_charsets[n][k] == po_charset_ascii) with_ASCII = true; else { if (first == NULL) first = canon_charsets[n][k]; else if (canon_charsets[n][k] != first && second == NULL) second = canon_charsets[n][k]; if (strcmp (canon_charsets[n][k], "UTF-8") == 0) with_UTF8 = true; if (!po_charset_ascii_compatible (canon_charsets[n][k])) all_ASCII_compatible = false; } } } if (with_ASCII && !all_ASCII_compatible) { /* assert (first != NULL); */ if (second == NULL) second = po_charset_ascii; } if (second != NULL) { /* A conversion is needed. Warn the user since he hasn't asked for it and might be surprised. */ if (with_UTF8) multiline_warning (xasprintf (_("warning: ")), xasprintf (_("\ Input files contain messages in different encodings, UTF-8 among others.\n\ Converting the output to UTF-8.\n\ "))); else multiline_warning (xasprintf (_("warning: ")), xasprintf (_("\ Input files contain messages in different encodings, %s and %s among others.\n\ Converting the output to UTF-8.\n\ To select a different output encoding, use the --to-code option.\n\ "), first, second)); canon_to_code = po_charset_utf8; } else if (first != NULL && with_ASCII && all_ASCII_compatible) { /* The conversion is a no-op conversion. Don't warn the user, but still perform the conversion, in order to check that the input was really ASCII. */ canon_to_code = first; } else { /* No conversion needed. */ canon_to_code = NULL; } } /* Now convert the remaining messages to to_code. */ if (canon_to_code != NULL) for (n = 0; n < nfiles; n++) { msgdomain_list_ty *mdlp = mdlps[n]; for (k = 0; k < mdlp->nitems; k++) if (canon_charsets[n][k] != NULL) /* If the user hasn't given a to_code, don't bother doing a noop conversion that would only replace the charset name in the header entry with its canonical equivalent. */ if (!(to_code == NULL && canon_charsets[n][k] == canon_to_code)) if (iconv_message_list (mdlp->item[k]->messages, canon_charsets[n][k], canon_to_code, files[n])) { multiline_error (xstrdup (""), xasprintf (_("\ Conversion of file %s from %s encoding to %s encoding\n\ changes some msgids or msgctxts.\n\ Either change all msgids and msgctxts to be pure ASCII, or ensure they are\n\ UTF-8 encoded from the beginning, i.e. already in your source code files.\n"), files[n], canon_charsets[n][k], canon_to_code)); exit (EXIT_FAILURE); } } /* Fill the resulting messages. */ for (n = 0; n < nfiles; n++) { msgdomain_list_ty *mdlp = mdlps[n]; for (k = 0; k < mdlp->nitems; k++) { message_list_ty *mlp = mdlp->item[k]->messages; for (j = 0; j < mlp->nitems; j++) { message_ty *mp = mlp->item[j]; message_ty *tmp = mp->tmp; size_t i; /* No need to discard unneeded weak translations here; they have already been filtered out above. */ if (use_first || tmp->used == 1 || tmp->used == -1) { /* Copy mp, as only message, into tmp. */ tmp->msgstr = mp->msgstr; tmp->msgstr_len = mp->msgstr_len; tmp->pos = mp->pos; if (mp->comment) for (i = 0; i < mp->comment->nitems; i++) message_comment_append (tmp, mp->comment->item[i]); if (mp->comment_dot) for (i = 0; i < mp->comment_dot->nitems; i++) message_comment_dot_append (tmp, mp->comment_dot->item[i]); for (i = 0; i < mp->filepos_count; i++) message_comment_filepos (tmp, mp->filepos[i].file_name, mp->filepos[i].line_number); tmp->is_fuzzy = mp->is_fuzzy; for (i = 0; i < NFORMATS; i++) tmp->is_format[i] = mp->is_format[i]; tmp->do_wrap = mp->do_wrap; tmp->obsolete = mp->obsolete; } else if (msgcomm_mode) { /* Copy mp, as only message, into tmp. */ if (tmp->msgstr == NULL) { tmp->msgstr = mp->msgstr; tmp->msgstr_len = mp->msgstr_len; tmp->pos = mp->pos; tmp->is_fuzzy = mp->is_fuzzy; } if (mp->comment && tmp->comment == NULL) for (i = 0; i < mp->comment->nitems; i++) message_comment_append (tmp, mp->comment->item[i]); if (mp->comment_dot && tmp->comment_dot == NULL) for (i = 0; i < mp->comment_dot->nitems; i++) message_comment_dot_append (tmp, mp->comment_dot->item[i]); for (i = 0; i < mp->filepos_count; i++) message_comment_filepos (tmp, mp->filepos[i].file_name, mp->filepos[i].line_number); for (i = 0; i < NFORMATS; i++) if (tmp->is_format[i] == undecided) tmp->is_format[i] = mp->is_format[i]; if (tmp->do_wrap == undecided) tmp->do_wrap = mp->do_wrap; tmp->obsolete = false; } else { /* Copy mp, among others, into tmp. */ char *id = xasprintf ("#-#-#-#-# %s #-#-#-#-#", identifications[n][k]); size_t nbytes; if (tmp->alternative_count == 0) tmp->pos = mp->pos; i = tmp->alternative_count; nbytes = (i + 1) * sizeof (struct altstr); tmp->alternative = xrealloc (tmp->alternative, nbytes); tmp->alternative[i].msgstr = mp->msgstr; tmp->alternative[i].msgstr_len = mp->msgstr_len; tmp->alternative[i].msgstr_end = tmp->alternative[i].msgstr + tmp->alternative[i].msgstr_len; tmp->alternative[i].comment = mp->comment; tmp->alternative[i].comment_dot = mp->comment_dot; tmp->alternative[i].id = id; tmp->alternative_count = i + 1; for (i = 0; i < mp->filepos_count; i++) message_comment_filepos (tmp, mp->filepos[i].file_name, mp->filepos[i].line_number); if (!mp->is_fuzzy) tmp->is_fuzzy = false; for (i = 0; i < NFORMATS; i++) if (mp->is_format[i] == yes) tmp->is_format[i] = yes; else if (mp->is_format[i] == no && tmp->is_format[i] == undecided) tmp->is_format[i] = no; if (mp->do_wrap == no) tmp->do_wrap = no; if (!mp->obsolete) tmp->obsolete = false; } } } } for (k = 0; k < total_mdlp->nitems; k++) { message_list_ty *mlp = total_mdlp->item[k]->messages; for (j = 0; j < mlp->nitems; j++) { message_ty *tmp = mlp->item[j]; if (tmp->alternative_count > 0) { /* Test whether all alternative translations are equal. */ struct altstr *first = &tmp->alternative[0]; size_t i; for (i = 0; i < tmp->alternative_count; i++) if (!(tmp->alternative[i].msgstr_len == first->msgstr_len && memcmp (tmp->alternative[i].msgstr, first->msgstr, first->msgstr_len) == 0)) break; if (i == tmp->alternative_count) { /* All alternatives are equal. */ tmp->msgstr = first->msgstr; tmp->msgstr_len = first->msgstr_len; } else { /* Concatenate the alternative msgstrs into a single one, separated by markers. */ size_t len; const char *p; const char *p_end; char *new_msgstr; char *np; len = 0; for (i = 0; i < tmp->alternative_count; i++) { size_t id_len = strlen (tmp->alternative[i].id); len += tmp->alternative[i].msgstr_len; p = tmp->alternative[i].msgstr; p_end = tmp->alternative[i].msgstr_end; for (; p < p_end; p += strlen (p) + 1) len += id_len + 2; } new_msgstr = (char *) xmalloc (len); np = new_msgstr; for (;;) { /* Test whether there's one more plural form to process. */ for (i = 0; i < tmp->alternative_count; i++) if (tmp->alternative[i].msgstr < tmp->alternative[i].msgstr_end) break; if (i == tmp->alternative_count) break; /* Process next plural form. */ for (i = 0; i < tmp->alternative_count; i++) if (tmp->alternative[i].msgstr < tmp->alternative[i].msgstr_end) { if (np > new_msgstr && np[-1] != '\0' && np[-1] != '\n') *np++ = '\n'; len = strlen (tmp->alternative[i].id); memcpy (np, tmp->alternative[i].id, len); np += len; *np++ = '\n'; len = strlen (tmp->alternative[i].msgstr); memcpy (np, tmp->alternative[i].msgstr, len); np += len; tmp->alternative[i].msgstr += len + 1; } /* Plural forms are separated by NUL bytes. */ *np++ = '\0'; } tmp->msgstr = new_msgstr; tmp->msgstr_len = np - new_msgstr; tmp->is_fuzzy = true; } /* Test whether all alternative comments are equal. */ for (i = 0; i < tmp->alternative_count; i++) if (tmp->alternative[i].comment == NULL || !string_list_equal (tmp->alternative[i].comment, first->comment)) break; if (i == tmp->alternative_count) /* All alternatives are equal. */ tmp->comment = first->comment; else /* Concatenate the alternative comments into a single one, separated by markers. */ for (i = 0; i < tmp->alternative_count; i++) { string_list_ty *slp = tmp->alternative[i].comment; if (slp != NULL) { size_t l; message_comment_append (tmp, tmp->alternative[i].id); for (l = 0; l < slp->nitems; l++) message_comment_append (tmp, slp->item[l]); } } /* Test whether all alternative dot comments are equal. */ for (i = 0; i < tmp->alternative_count; i++) if (tmp->alternative[i].comment_dot == NULL || !string_list_equal (tmp->alternative[i].comment_dot, first->comment_dot)) break; if (i == tmp->alternative_count) /* All alternatives are equal. */ tmp->comment_dot = first->comment_dot; else /* Concatenate the alternative dot comments into a single one, separated by markers. */ for (i = 0; i < tmp->alternative_count; i++) { string_list_ty *slp = tmp->alternative[i].comment_dot; if (slp != NULL) { size_t l; message_comment_dot_append (tmp, tmp->alternative[i].id); for (l = 0; l < slp->nitems; l++) message_comment_dot_append (tmp, slp->item[l]); } } } } } return total_mdlp; }