summaryrefslogtreecommitdiffstats
path: root/third_party/cld
diff options
context:
space:
mode:
author: jcivelli@google.com <jcivelli@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2010-04-15 22:02:02 +0000
committer: jcivelli@google.com <jcivelli@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2010-04-15 22:02:02 +0000
commit: 5273ac8ec0daaaa716b40d00661722f803e1198e (patch)
tree: 77653da67866e861a9d92a638ba928b9ac7483fc /third_party/cld
parent: 0e53755613769c9661d628f1ae267150994c721b (diff)
downloadchromium_src-5273ac8ec0daaaa716b40d00661722f803e1198e.zip
chromium_src-5273ac8ec0daaaa716b40d00661722f803e1198e.tar.gz
chromium_src-5273ac8ec0daaaa716b40d00661722f803e1198e.tar.bz2
Merging in a fix from Dick Sites that reduces bad detection
with random text.

BUG=39217
TEST=Open pages in various languages, make sure the infobar appears and reports the right language for the page.
Review URL: http://codereview.chromium.org/1559035

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@44711 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'third_party/cld')
-rw-r--r-- third_party/cld/encodings/compact_lang_det/cldutil.cc 15
1 files changed, 12 insertions, 3 deletions
diff --git a/third_party/cld/encodings/compact_lang_det/cldutil.cc b/third_party/cld/encodings/compact_lang_det/cldutil.cc
index 6da7fce..5bbed82 100644
--- a/third_party/cld/encodings/compact_lang_det/cldutil.cc
+++ b/third_party/cld/encodings/compact_lang_det/cldutil.cc
@@ -570,6 +570,10 @@ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
+ // Run a little cache of last hits to catch overly-repetitive "text"
+ int next_prior = 0;
+ uint32 prior_quads[2] = {0, 0};
+
// Visit all quadgrams
if (src[0] == ' ') {++src;}
while (src < srclimit) {
@@ -594,9 +598,14 @@ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
DbgQuadTermToStderr(quadhash, probs, src, len);
}
if (probs != 0) {
- ProcessProbV25Tote(probs, chunk_tote);
- ++(*tote_grams);
- if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
+ // Filter out recent repeats. If this works out, use in the other lookups
+ if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
+ prior_quads[next_prior] = quadhash;
+ next_prior = (next_prior + 1) & 1;
+ ProcessProbV25Tote(probs, chunk_tote);
+ ++(*tote_grams);
+ if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
+ }
}
// Advance all the way past word if at end-of-word