From 5273ac8ec0daaaa716b40d00661722f803e1198e Mon Sep 17 00:00:00 2001 From: "jcivelli@google.com" Date: Thu, 15 Apr 2010 22:02:02 +0000 Subject: Merging in a fix from Dick Sites that reduces bad detection with random text. BUG=39217 TEST=Open pages in various languages, make sure the infobar appears and reports the right language for the page. Review URL: http://codereview.chromium.org/1559035 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@44711 0039d316-1c4b-4281-b951-d872f2087c98 --- third_party/cld/encodings/compact_lang_det/cldutil.cc | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'third_party/cld') diff --git a/third_party/cld/encodings/compact_lang_det/cldutil.cc b/third_party/cld/encodings/compact_lang_det/cldutil.cc index 6da7fce..5bbed82 100644 --- a/third_party/cld/encodings/compact_lang_det/cldutil.cc +++ b/third_party/cld/encodings/compact_lang_det/cldutil.cc @@ -570,6 +570,10 @@ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj, if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);} + // Run a little cache of last hits to catch overly-repetitive "text" + int next_prior = 0; + uint32 prior_quads[2] = {0, 0}; + // Visit all quadgrams if (src[0] == ' ') {++src;} while (src < srclimit) { @@ -594,9 +598,14 @@ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj, DbgQuadTermToStderr(quadhash, probs, src, len); } if (probs != 0) { - ProcessProbV25Tote(probs, chunk_tote); - ++(*tote_grams); - if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);} + // Filter out recent repeats. If this works out, use in the other lookups + if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) { + prior_quads[next_prior] = quadhash; + next_prior = (next_prior + 1) & 1; + ProcessProbV25Tote(probs, chunk_tote); + ++(*tote_grams); + if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);} + } } // Advance all the way past word if at end-of-word -- cgit v1.1