summary | refs | log | tree | commit | diff | stats
path: root/third_party/cld
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/cld')
-rw-r--r-- third_party/cld/encodings/compact_lang_det/cldutil.cc 15
1 files changed, 12 insertions, 3 deletions
diff --git a/third_party/cld/encodings/compact_lang_det/cldutil.cc b/third_party/cld/encodings/compact_lang_det/cldutil.cc
index 6da7fce..5bbed82 100644
--- a/third_party/cld/encodings/compact_lang_det/cldutil.cc
+++ b/third_party/cld/encodings/compact_lang_det/cldutil.cc
@@ -570,6 +570,10 @@ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
+ // Run a little cache of last hits to catch overly-repetitive "text"
+ int next_prior = 0;
+ uint32 prior_quads[2] = {0, 0};
+
// Visit all quadgrams
if (src[0] == ' ') {++src;}
while (src < srclimit) {
@@ -594,9 +598,14 @@ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
DbgQuadTermToStderr(quadhash, probs, src, len);
}
if (probs != 0) {
- ProcessProbV25Tote(probs, chunk_tote);
- ++(*tote_grams);
- if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
+ // Filter out recent repeats. If this works out, use in the other lookups
+ if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
+ prior_quads[next_prior] = quadhash;
+ next_prior = (next_prior + 1) & 1;
+ ProcessProbV25Tote(probs, chunk_tote);
+ ++(*tote_grams);
+ if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
+ }
}
// Advance all the way past word if at end-of-word