summaryrefslogtreecommitdiffstats
path: root/gettext-tools/src/msgl-fsearch.c
blob: fa69f99540b5b1abe6a3985a6b8a555d1ca5cf04 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
/* Fast fuzzy searching among messages.
   Copyright (C) 2006 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2006.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

/* Specification.  */
#include "msgl-fsearch.h"

#include <math.h>
#include <stdlib.h>

#include "xalloc.h"
#include "po-charset.h"


/* Fuzzy searching of L strings in a large set of N messages (assuming
   they have all the same small size) takes O(L * N) when using repeated
   fstrcmp() calls.  When for example L = 800 and N = 69000, this is slow.

   So we preprocess the set of N messages, yielding a data structure
   that allows to select the similar messages for any given string, and
   apply fstrcmp() only to the search result.  This allows to reduce the
   time to something between O(L * 1) and O(L * N) - depending on the
   structure of the strings.  In the example with L = 800 and N = 69000,
   memory use increases by a factor of 2 and the time decreases from
   9068 s to 230 s.

   The data structure is a hash table mapping each n-gram (here with n=4)
   occurring in the message strings to the list of messages that contains
   it.  For example, if the message list is
      [0] "close"
      [1] "losers"
   the index is a hash table mapping
      "clos" -> { 0 }
      "lose" -> { 0, 1 }
      "oser" -> { 1 }
      "sers" -> { 1 }
   Searching the similar messages to, say, "closest", is done by looking up
   all its 4-grams in the hash table and summing up the results:
      "clos" -> { 0 }
      "lose" -> { 0, 1 }
      "oses" -> {}
      "sest" -> {}
   => total: { 2x0, 1x1 }
   The list is sorted according to decreasing frequency: { 0, 1, ... }
   and only the first few messages from this frequency list are passed to
   fstrcmp().

   The n-gram similarity and the fstrcmp() similarity are quite different
   metrics.  For example, "close" and "c l o s e" have no n-grams in common
   (even for n as small as n = 2); however, fstrcmp() will find that they
   are quite similar (= 0.71).  Conversely, "AAAA BBBB ... ZZZZ" and
   "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
   only 0.02.  Also, repeated n-grams don't have the same effect on the
   two similarity measures.  But all this doesn't matter much in practice.

   We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
   lists are likely too long.  (Well, with ideographic languages like Chinese,
   n = 3 might actually be quite good.  Anyway, n = 4 is not bad in this case
   either.)

   The units are characters in the current encoding.  Not just bytes,
   because 4 consecutive bytes in UTF-8 or GB18030 don't mean much.  */

/* Each message is represented by its index in the message list.  */
typedef unsigned int index_ty;

/* An index list has its allocated size and length tacked at the front.
   The indices are sorted in ascending order.  */
typedef index_ty *index_list_ty;
#define IL_ALLOCATED 0
#define IL_LENGTH 1

/* Create a new index list containing only a given index.  */
static inline index_list_ty
new_index (index_ty idx)
{
  index_ty *list = (index_ty *) xmalloc ((2 + 1) * sizeof (index_ty));
  list[IL_ALLOCATED] = 1;
  list[IL_LENGTH] = 1;
  list[2] = idx;

  return list;
}

/* Add a given index, greater or equal than all previous indices, to an
   index list.
   Return the new index list, if it had to be reallocated, or NULL if it
   didn't change.  */
static inline index_list_ty
addlast_index (index_list_ty list, index_ty idx)
{
  index_list_ty result;
  size_t length = list[IL_LENGTH];

  /* Look whether it should be inserted.  */
  if (length > 0 && list[2 + (length - 1)] == idx)
    return NULL;

  /* Now make room for one more list element.  */
  result = NULL;
  if (length == list[IL_ALLOCATED])
    {
      size_t new_allocated = 2 * length - (length >> 6);
      list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
      list[IL_ALLOCATED] = new_allocated;
      result = list;
    }
  list[2 + length] = idx;
  list[IL_LENGTH] = length + 1;
  return result;
}

/* Add a given index to an index list.
   Return the new index list, if it had to be reallocated, or NULL if it
   didn't change.  */
static inline index_list_ty
add_index (index_list_ty list, index_ty idx)
{
  index_list_ty result;
  size_t length = list[IL_LENGTH];

  /* Look where it should be inserted.  */
  size_t lo = 0;
  size_t hi = length;
  while (lo < hi)
    {
      /* Here we know that idx must be inserted at a position >= lo, <= hi.  */
      size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
      index_ty val = list[2 + mid];
      if (val < idx)
	lo = mid + 1;
      else if (val > idx)
	hi = mid;
      else
	return NULL;
    }

  /* Now make room for one more list element.  */
  result = NULL;
  if (length == list[IL_ALLOCATED])
    {
      size_t new_allocated = 2 * length - (length >> 6);
      list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
      list[IL_ALLOCATED] = new_allocated;
      result = list;
    }
  list[IL_LENGTH] = length + 1;
  for (; length > hi; length--)
    list[2 + length] = list[1 + length];
  list[2 + length] = idx;
  return result;
}

/* We use 4-grams, therefore strings with less than 4 characters cannot be
   handled through the 4-grams table and need to be handled specially.
   Since every character occupies at most 4 bytes (see po-charset.c),
   this means the size of such short strings is bounded by:  */
#define SHORT_STRING_MAX_CHARACTERS (4 - 1)
#define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)

/* Such short strings are handled by direct comparison with all messages
   of appropriate size.  Note that for two strings of length 0 <= l1 <= l2,
   fstrcmp() is <= 2 * l1 / (l1 + l2).  Since we are only interested in
   fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
   limit the search to lengths l' in the range
     l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
   Thus we need the list of the short strings up to length:  */
#define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1))

/* A fuzzy index contains a hash table mapping all n-grams to their
   occurrences list.  */
struct message_fuzzy_index_ty
{
  message_ty **messages;
  character_iterator_t iterator;
  hash_table gram4;
  size_t firstfew;
  message_list_ty *short_messages[SHORT_MSG_MAX + 1];
};

/* Allocate a fuzzy index corresponding to a given list of messages.
   The list of messages and the msgctxt and msgid fields of the messages
   inside it must not be modified while the returned fuzzy index is in use.  */
message_fuzzy_index_ty *
message_fuzzy_index_alloc (const message_list_ty *mlp,
			   const char *canon_charset)
{
  message_fuzzy_index_ty *findex =
    (message_fuzzy_index_ty *) xmalloc (sizeof (message_fuzzy_index_ty));
  size_t count = mlp->nitems;
  size_t j;
  size_t l;

  findex->messages = mlp->item;
  findex->iterator = po_charset_character_iterator (canon_charset);

  /* Setup hash table.  */
  if (hash_init (&findex->gram4, 10 * count) < 0)
    xalloc_die ();
  for (j = 0; j < count; j++)
    {
      message_ty *mp = mlp->item[j];

      if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
	{
	  const char *str = mp->msgid;

	  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
	  const char *p0 = str;
	  if (*p0 != '\0')
	    {
	      const char *p1 = p0 + findex->iterator (p0);
	      if (*p1 != '\0')
		{
		  const char *p2 = p1 + findex->iterator (p1);
		  if (*p2 != '\0')
		    {
		      const char *p3 = p2 + findex->iterator (p2);
		      if (*p3 != '\0')
			{
			  const char *p4 = p3 + findex->iterator (p3);
			  for (;;)
			    {
			      /* The segment from p0 to p4 is a 4-gram of
				 characters.  Add a hash table entry that maps
				 it to the index j, or extend the existing
				 hash table entry accordingly.  */
			      void *found;

			      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
						   &found) == 0)
				{
				  index_list_ty list = (index_list_ty) found;
				  list = addlast_index (list, j);
				  if (list != NULL)
				    hash_set_value (&findex->gram4, p0, p4 - p0,
						    list);
				}
			      else
				hash_insert_entry (&findex->gram4, p0, p4 - p0,
						   new_index (j));

			      /* Advance.  */
			      if (*p4 == '\0')
				break;
			      p0 = p1;
			      p1 = p2;
			      p2 = p3;
			      p3 = p4;
			      p4 = p4 + findex->iterator (p4);
			    }
			}
		    }
		}
	    }
	}
    }

  /* Shrink memory used by the hash table.  */
  {
    void *iter;
    const void *key;
    size_t keylen;
    void **valuep;

    iter = NULL;
    while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
	   == 0)
      {
	index_list_ty list = (index_list_ty) *valuep;
	index_ty length = list[IL_LENGTH];

	if (length < list[IL_ALLOCATED])
	  {
	    list[IL_ALLOCATED] = length;
	    *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
	  }
      }
  }

  findex->firstfew = (int) sqrt ((double) count);
  if (findex->firstfew < 10)
    findex->firstfew = 10;

  /* Setup lists of short messages.  */
  for (l = 0; l <= SHORT_MSG_MAX; l++)
    findex->short_messages[l] = message_list_alloc (false);
  for (j = 0; j < count; j++)
    {
      message_ty *mp = mlp->item[j];

      if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
	{
	  const char *str = mp->msgid;
	  size_t len = strlen (str);

	  if (len <= SHORT_MSG_MAX)
	    message_list_append (findex->short_messages[len], mp);
	}
    }

  /* Shrink memory used by the lists of short messages.  */
  for (l = 0; l <= SHORT_MSG_MAX; l++)
    {
      message_list_ty *mlp = findex->short_messages[l];

      if (mlp->nitems < mlp->nitems_max)
	{
	  mlp->nitems_max = mlp->nitems;
	  mlp->item =
	    (message_ty **)
	    xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
	}
    }

  return findex;
}

/* An index with multiplicity.  */
struct mult_index
{
  index_ty index;
  unsigned int count;
};

/* A list of indices with multiplicity.
   The indices are sorted in ascending order.  */
struct mult_index_list
{
  struct mult_index *item;
  size_t nitems;
  size_t nitems_max;
  /* Work area.  */
  struct mult_index *item2;
  size_t nitems2_max;
};

/* Initialize an empty list of indices with multiplicity.  */
static inline void
mult_index_list_init (struct mult_index_list *accu)
{
  accu->nitems = 0;
  accu->nitems_max = 0;
  accu->item = NULL;
  accu->nitems2_max = 0;
  accu->item2 = NULL;
}

/* Add an index list to a list of indices with multiplicity.  */
static inline void
mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
{
  size_t len1 = accu->nitems;
  size_t len2 = list[IL_LENGTH];
  size_t need = len1 + len2;
  struct mult_index *ptr1;
  struct mult_index *ptr1_end;
  index_ty *ptr2;
  index_ty *ptr2_end;
  struct mult_index *destptr;

  /* Make the work area large enough.  */
  if (accu->nitems2_max < need)
    {
      size_t new_max = 2 * accu->nitems2_max + 1;

      if (new_max < need)
	new_max = need;
      if (accu->item2 != NULL)
	free (accu->item2);
      accu->item2 =
	(struct mult_index *) xmalloc (new_max * sizeof (struct mult_index));
      accu->nitems2_max = new_max;
    }

  /* Make a linear pass through accu and list simultaneously.  */
  ptr1 = accu->item;
  ptr1_end = ptr1 + len1;
  ptr2 = list + 2;
  ptr2_end = ptr2 + len2;
  destptr = accu->item2;
  while (ptr1 < ptr1_end && ptr2 < ptr2_end)
    {
      if (ptr1->index < *ptr2)
	{
	  *destptr = *ptr1;
	  ptr1++;
	}
      else if (ptr1->index > *ptr2)
	{
	  destptr->index = *ptr2;
	  destptr->count = 1;
	  ptr2++;
	}
      else /* ptr1->index == list[2 + i2] */
	{
	  destptr->index = ptr1->index;
	  destptr->count = ptr1->count + 1;
	  ptr1++;
	  ptr2++;
	}
      destptr++;
    }
  while (ptr1 < ptr1_end)
    {
      *destptr = *ptr1;
      ptr1++;
      destptr++;
    }
  while (ptr2 < ptr2_end)
    {
      destptr->index = *ptr2;
      destptr->count = 1;
      ptr2++;
      destptr++;
    }

  /* Swap accu->item and accu->item2.  */
  {
    struct mult_index *dest = accu->item2;
    size_t dest_max = accu->nitems2_max;

    accu->item2 = accu->item;
    accu->nitems2_max = accu->nitems_max;

    accu->item = dest;
    accu->nitems = destptr - dest;
    accu->nitems_max = dest_max;
  }
}

/* Compares two indices with multiplicity, according to their multiplicity.  */
static int
mult_index_compare (const void *p1, const void *p2)
{
  const struct mult_index *ptr1 = (const struct mult_index *) p1;
  const struct mult_index *ptr2 = (const struct mult_index *) p2;

  if (ptr1->count < ptr2->count)
    return 1;
  if (ptr1->count > ptr2->count)
    return -1;
  /* For reproduceable results, also take into account the indices.  */
  if (ptr1->index > ptr2->index)
    return 1;
  if (ptr1->index < ptr2->index)
    return -1;
  return 0;
}

/* Sort a list of indices with multiplicity according to decreasing
   multiplicity.  */
static inline void
mult_index_list_sort (struct mult_index_list *accu)
{
  if (accu->nitems > 1)
    qsort (accu->item, accu->nitems, sizeof (struct mult_index),
	   mult_index_compare);
}

/* Frees a list of indices with multiplicity.  */
static inline void
mult_index_list_free (struct mult_index_list *accu)
{
  if (accu->item != NULL)
    free (accu->item);
  if (accu->item2 != NULL)
    free (accu->item2);
}

/* Find a good match for the given msgctxt and msgid in the given fuzzy index.
   The match does not need to be optimal.  */
message_ty *
message_fuzzy_index_search (message_fuzzy_index_ty *findex,
			    const char *msgctxt, const char *msgid)
{
  const char *str = msgid;

  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
  const char *p0 = str;
  if (*p0 != '\0')
    {
      const char *p1 = p0 + findex->iterator (p0);
      if (*p1 != '\0')
	{
	  const char *p2 = p1 + findex->iterator (p1);
	  if (*p2 != '\0')
	    {
	      const char *p3 = p2 + findex->iterator (p2);
	      if (*p3 != '\0')
		{
		  const char *p4 = p3 + findex->iterator (p3);
		  struct mult_index_list accu;

		  mult_index_list_init (&accu);
		  for (;;)
		    {
		      /* The segment from p0 to p4 is a 4-gram of
			 characters.  Get the hash table entry containing
			 a list of indices, and add it to the accu.  */
		      void *found;

		      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
					   &found) == 0)
			{
			  index_list_ty list = (index_list_ty) found;
			  mult_index_list_accumulate (&accu, list);
			}

		      /* Advance.  */
		      if (*p4 == '\0')
			break;
		      p0 = p1;
		      p1 = p2;
		      p2 = p3;
		      p3 = p4;
		      p4 = p4 + findex->iterator (p4);
		    }

		  /* Sort in decreasing count order.  */
		  mult_index_list_sort (&accu);

		  /* Take the first few messages from this sorted list, and
		     maximize the fstrcmp() result.  */
		  {
		    size_t count;
		    struct mult_index *ptr;
		    message_ty *best_mp;
		    double best_weight;

		    count = findex->firstfew;
		    if (count > accu.nitems)
		      count = accu.nitems;

		    best_weight = FUZZY_THRESHOLD;
		    best_mp = NULL;
		    for (ptr = accu.item; count > 0; ptr++, count--)
		      {
			message_ty *mp = findex->messages[ptr->index];
			double weight =
			  fuzzy_search_goal_function (mp, msgctxt, msgid);

			if (weight > best_weight)
			  {
			    best_weight = weight;
			    best_mp = mp;
			  }
		      }

		    mult_index_list_free (&accu);

		    return best_mp;
		  }
		}
	    }
	}
    }

  /* The string had less than 4 characters.  */
  {
    size_t l = strlen (str);
    size_t lmin, lmax;
    message_ty *best_mp;
    double best_weight;

    if (!(l <= SHORT_STRING_MAX_BYTES))
      abort ();

    /* Walk through those short messages which have an appropriate length.
       See the comment before SHORT_MSG_MAX.  */
    lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
    lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
    if (!(lmax <= SHORT_MSG_MAX))
      abort ();

    best_weight = FUZZY_THRESHOLD;
    best_mp = NULL;
    for (l = lmin; l <= lmax; l++)
      {
	message_list_ty *mlp = findex->short_messages[l];
	size_t j;

	for (j = 0; j < mlp->nitems; j++)
	  {
	    message_ty *mp = mlp->item[j];
	    double weight = fuzzy_search_goal_function (mp, msgctxt, msgid);

	    if (weight > best_weight)
	      {
		best_weight = weight;
		best_mp = mp;
	      }
	  }
      }

    return best_mp;
  }
}

/* Free a fuzzy index.  */
void
message_fuzzy_index_free (message_fuzzy_index_ty *findex)
{
  size_t l;
  void *iter;
  const void *key;
  size_t keylen;
  void *data;

  /* Free the short lists.  */
  for (l = 0; l <= SHORT_MSG_MAX; l++)
    message_list_free (findex->short_messages[l], 1);

  /* Free the index lists occurring as values in the hash tables.  */
  iter = NULL;
  while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
    free ((index_list_ty *) data);
  /* Free the hash table itself.  */
  hash_destroy (&findex->gram4);

  free (findex);
}