Changes query parser to sort and coalesce overlapping match positions.

BUG=5305 TEST=see bug Review URL: http://codereview.chromium.org/13666 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@6595 0039d316-1c4b-4281-b951-d872f2087c98
author: sky@google.com <sky@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-12-09 18:25:38 +0000
committer: sky@google.com <sky@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-12-09 18:25:38 +0000
commit: 32cbfa4f0c14b80fb551530b15051b7d0d5bb8a4 (patch)
tree: 84307691175f3326ec3040d21056f535b6ca0ca0 /chrome/browser/history
parent: 02a742798da28b5d6901821b8c799c5d668e8ae4 (diff)
download: chromium_src-32cbfa4f0c14b80fb551530b15051b7d0d5bb8a4.zip
chromium_src-32cbfa4f0c14b80fb551530b15051b7d0d5bb8a4.tar.gz
chromium_src-32cbfa4f0c14b80fb551530b15051b7d0d5bb8a4.tar.bz2
2 files changed, 53 insertions, 2 deletions
diff --git a/chrome/browser/history/query_parser.cc b/chrome/browser/history/query_parser.cc
index 4d79a4c..ef40601 100644
--- a/chrome/browser/history/query_parser.cc
+++ b/chrome/browser/history/query_parser.cc
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
+#include <algorithm>
+
 #include "chrome/browser/history/query_parser.h"
 
 #include "base/logging.h"
@@ -11,6 +13,49 @@
 #include "chrome/common/scoped_vector.h"
 #include "unicode/uscript.h"
 
+namespace {
+
+// Returns true if |mp1.first| is less than |mp2.first|. This is used to
+// sort match positions.
+int CompareMatchPosition(const Snippet::MatchPosition& mp1,
+                         const Snippet::MatchPosition& mp2) {
+  return mp1.first < mp2.first;
+}
+
+// Returns true if |mp2| intersects |mp1|. This is intended for use by
+// CoalesceMatchesFrom and isn't meant as a general intersectpion comparison
+// function.
+bool SnippetIntersects(const Snippet::MatchPosition& mp1,
+                       const Snippet::MatchPosition& mp2) {
+  return mp2.first >= mp1.first && mp2.first <= mp1.second;
+}
+
+// Coalesces match positions in |matches| after index that intersect the match
+// position at |index|.
+void CoalesceMatchesFrom(size_t index,
+                         Snippet::MatchPositions* matches) {
+  Snippet::MatchPosition& mp = (*matches)[index];
+  for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
+       i != matches->end(); ) {
+    if (SnippetIntersects(mp, *i)) {
+      mp.second = i->second;
+      i = matches->erase(i);
+    } else {
+      return;
+    }
+  }
+}
+
+// Sorts the match positions in |matches| by their first index, then coalesces
+// any match positions that intersect each other.
+void CoalseAndSortMatchPositions(Snippet::MatchPositions* matches) {
+  std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
+  // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
+  // from matches.
+  for (size_t i = 0; i < matches->size(); ++i)
+    CoalesceMatchesFrom(i, matches);
+}
+
 // For CJK ideographs and Korean Hangul, even a single character
 // can be useful in prefix matching, but that may give us too many
 // false positives. Moreover, the current ICU word breaker gives us
@@ -18,7 +63,7 @@
 // point doing anything for them and we only adjust the minimum length
 // to 2 for Korean Hangul while using 3 for others. This is a temporary
 // hack until we have a segmentation support.
-static inline bool IsWordLongEnoughForPrefixSearch(const std::wstring& word)
+inline bool IsWordLongEnoughForPrefixSearch(const std::wstring& word)
 {
   DCHECK(word.size() > 0);
   size_t minimum_length = 3;
@@ -30,6 +75,8 @@ static inline bool IsWordLongEnoughForPrefixSearch(const std::wstring& word)
   return word.size() >= minimum_length;
 }
 
+} // namespace
+
 // Inheritance structure:
 // Queries are represented as trees of QueryNodes.
 // QueryNodes are either a collection of subnodes (a QueryNodeList)
@@ -229,6 +276,7 @@ bool QueryParser::DoesQueryMatch(const std::wstring& text,
     if (!query_nodes[i]->HasMatchIn(query_words, &matches))
       return false;
   }
+  CoalseAndSortMatchPositions(&matches);
   match_positions->swap(matches);
   return true;
 }
diff --git a/chrome/browser/history/query_parser_unittest.cc b/chrome/browser/history/query_parser_unittest.cc
index 7246adc..50cb343 100644
--- a/chrome/browser/history/query_parser_unittest.cc
+++ b/chrome/browser/history/query_parser_unittest.cc
@@ -93,12 +93,15 @@ TEST_F(QueryParserTest, ParseQueryNodesAndMatch) {
     const int m2_start;
     const int m2_end;
   } data[] = {
+    { L"foo foo",       L"foo",              true,  0, 3, 0, 0 },
+    { L"foo fooey",     L"fooey",            true,  0, 5, 0, 0 },
+    { L"foo fooey bar", L"bar fooey",        true,  0, 3, 4, 9 },
     { L"blah",          L"blah",             true,  0, 4, 0, 0 },
     { L"blah",          L"foo",              false, 0, 0, 0, 0 },
     { L"blah",          L"blahblah",         true,  0, 4, 0, 0 },
     { L"blah",          L"foo blah",         true,  4, 8, 0, 0 },
     { L"foo blah",      L"blah",             false, 0, 0, 0, 0 },
-    { L"foo blah",      L"blahx foobar",     true,  6, 9, 0, 4 },
+    { L"foo blah",      L"blahx foobar",     true,  0, 4, 6, 9 },
     { L"\"foo blah\"",  L"foo blah",         true,  0, 8, 0, 0 },
     { L"\"foo blah\"",  L"foox blahx",       false, 0, 0, 0, 0 },
     { L"\"foo blah\"",  L"foo blah",         true,  0, 8, 0, 0 },
author	sky@google.com <sky@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2008-12-09 18:25:38 +0000
committer	sky@google.com <sky@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2008-12-09 18:25:38 +0000
commit	32cbfa4f0c14b80fb551530b15051b7d0d5bb8a4 (patch)
tree	84307691175f3326ec3040d21056f535b6ca0ca0 /chrome/browser/history
parent	02a742798da28b5d6901821b8c799c5d668e8ae4 (diff)
download	chromium_src-32cbfa4f0c14b80fb551530b15051b7d0d5bb8a4.zip chromium_src-32cbfa4f0c14b80fb551530b15051b7d0d5bb8a4.tar.gz chromium_src-32cbfa4f0c14b80fb551530b15051b7d0d5bb8a4.tar.bz2