diff options
author | jcivelli@google.com <jcivelli@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-04-15 22:02:02 +0000 |
---|---|---|
committer | jcivelli@google.com <jcivelli@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-04-15 22:02:02 +0000 |
commit | 5273ac8ec0daaaa716b40d00661722f803e1198e (patch) | |
tree | 77653da67866e861a9d92a638ba928b9ac7483fc /third_party | |
parent | 0e53755613769c9661d628f1ae267150994c721b (diff) | |
download | chromium_src-5273ac8ec0daaaa716b40d00661722f803e1198e.zip chromium_src-5273ac8ec0daaaa716b40d00661722f803e1198e.tar.gz chromium_src-5273ac8ec0daaaa716b40d00661722f803e1198e.tar.bz2 |
Merging in a fix from Dick Sites that reduces bad language detection
on random text.
BUG=39217
TEST=Open pages in various languages, make sure the infobar
appears and reports the right language for the page.
Review URL: http://codereview.chromium.org/1559035
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@44711 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'third_party')
-rw-r--r-- | third_party/cld/encodings/compact_lang_det/cldutil.cc | 15 |
1 file changed, 12 insertions, 3 deletions
diff --git a/third_party/cld/encodings/compact_lang_det/cldutil.cc b/third_party/cld/encodings/compact_lang_det/cldutil.cc index 6da7fce..5bbed82 100644 --- a/third_party/cld/encodings/compact_lang_det/cldutil.cc +++ b/third_party/cld/encodings/compact_lang_det/cldutil.cc @@ -570,6 +570,10 @@ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj, if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);} + // Run a little cache of last hits to catch overly-repetitive "text" + int next_prior = 0; + uint32 prior_quads[2] = {0, 0}; + // Visit all quadgrams if (src[0] == ' ') {++src;} while (src < srclimit) { @@ -594,9 +598,14 @@ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj, DbgQuadTermToStderr(quadhash, probs, src, len); } if (probs != 0) { - ProcessProbV25Tote(probs, chunk_tote); - ++(*tote_grams); - if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);} + // Filter out recent repeats. If this works out, use in the other lookups + if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) { + prior_quads[next_prior] = quadhash; + next_prior = (next_prior + 1) & 1; + ProcessProbV25Tote(probs, chunk_tote); + ++(*tote_grams); + if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);} + } } // Advance all the way past word if at end-of-word |