summaryrefslogtreecommitdiffstats
path: root/chrome/browser/history/query_parser.cc
diff options
context:
space:
mode:
authorinitial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98>2008-07-26 23:55:29 +0000
committerinitial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98>2008-07-26 23:55:29 +0000
commit09911bf300f1a419907a9412154760efd0b7abc3 (patch)
treef131325fb4e2ad12c6d3504ab75b16dd92facfed /chrome/browser/history/query_parser.cc
parent586acc5fe142f498261f52c66862fa417c3d52d2 (diff)
downloadchromium_src-09911bf300f1a419907a9412154760efd0b7abc3.zip
chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.tar.gz
chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.tar.bz2
Add chrome to the repository.
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@15 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/history/query_parser.cc')
-rw-r--r--chrome/browser/history/query_parser.cc317
1 files changed, 317 insertions, 0 deletions
diff --git a/chrome/browser/history/query_parser.cc b/chrome/browser/history/query_parser.cc
new file mode 100644
index 0000000..86285f3
--- /dev/null
+++ b/chrome/browser/history/query_parser.cc
@@ -0,0 +1,317 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "chrome/browser/history/query_parser.h"
+
+#include "base/logging.h"
+#include "base/string_util.h"
+#include "base/word_iterator.h"
+#include "chrome/common/l10n_util.h"
+#include "chrome/common/scoped_vector.h"
+#include "unicode/uscript.h"
+
+// For CJK ideographs and Korean Hangul, even a single character
+// can be useful in prefix matching, but that may give us too many
+// false positives. Moreover, the current ICU word breaker gives us
+// back every single Chinese character as a word so that there's no
+// point doing anything for them and we only adjust the minimum length
+// to 2 for Korean Hangul while using 3 for others. This is a temporary
+// hack until we have a segmentation support.
+static inline bool IsWordLongEnoughForPrefixSearch(const std::wstring& word)
+{
+ DCHECK(word.size() > 0);
+ size_t minimum_length = 3;
+ // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
+ // because they 'behave like' Latin letters. Moreover, we should
+ // normalize the former before reaching here.
+ if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
+ minimum_length = 2;
+ return word.size() >= minimum_length;
+}
+
+// Inheritance structure:
+// Queries are represented as trees of QueryNodes.
+// QueryNodes are either a collection of subnodes (a QueryNodeList)
+// or a single word (a QueryNodeWord).
+
+// A QueryNodeWord is a single word in the query.
+class QueryNodeWord : public QueryNode {
+ public:
+ QueryNodeWord(const std::wstring& word) : word_(word), literal_(false) {}
+ virtual ~QueryNodeWord() {}
+ virtual int AppendToSQLiteQuery(std::wstring* query) const;
+ virtual bool IsWord() const { return true; }
+
+ const std::wstring& word() const { return word_; }
+ void set_literal(bool literal) { literal_ = literal; }
+
+ virtual bool HasMatchIn(const std::vector<std::wstring>& words) const;
+
+ virtual bool Matches(const std::wstring& word, bool exact) const;
+
+ private:
+ std::wstring word_;
+ bool literal_;
+};
+
+bool QueryNodeWord::HasMatchIn(const std::vector<std::wstring>& words) const {
+ for (size_t i = 0; i < words.size(); ++i) {
+ if (Matches(words[i], false))
+ return true;
+ }
+ return false;
+}
+
+bool QueryNodeWord::Matches(const std::wstring& word, bool exact) const {
+ if (exact || !IsWordLongEnoughForPrefixSearch(word_))
+ return word == word_;
+ return word.size() >= word_.size() &&
+ (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
+}
+
+int QueryNodeWord::AppendToSQLiteQuery(std::wstring* query) const {
+ query->append(word_);
+
+ // Use prefix search if we're not literal and long enough.
+ if (!literal_ && IsWordLongEnoughForPrefixSearch(word_))
+ *query += L'*';
+ return 1;
+}
+
+// A QueryNodeList has a collection of child QueryNodes
+// which it cleans up after.
+class QueryNodeList : public QueryNode {
+ public:
+ virtual ~QueryNodeList();
+
+ virtual int AppendToSQLiteQuery(std::wstring* query) const {
+ return AppendChildrenToString(query);
+ }
+ virtual bool IsWord() const { return false; }
+
+ void AddChild(QueryNode* node) { children_.push_back(node); }
+
+ typedef std::vector<QueryNode*> QueryNodeVector;
+ QueryNodeVector* children() { return &children_; }
+
+ // Remove empty subnodes left over from other parsing.
+ void RemoveEmptySubnodes();
+
+ // QueryNodeList is never used with Matches or HasMatchIn.
+ virtual bool Matches(const std::wstring& word, bool exact) const {
+ NOTREACHED();
+ return false;
+ }
+ virtual bool HasMatchIn(const std::vector<std::wstring>& words) const {
+ NOTREACHED();
+ return false;
+ }
+
+ protected:
+ int AppendChildrenToString(std::wstring* query) const;
+
+ QueryNodeVector children_;
+};
+
+QueryNodeList::~QueryNodeList() {
+ for (QueryNodeVector::iterator node = children_.begin();
+ node != children_.end(); ++node)
+ delete *node;
+}
+
+int QueryNodeList::AppendChildrenToString(std::wstring* query) const {
+ int num_words = 0;
+ for (QueryNodeVector::const_iterator node = children_.begin();
+ node != children_.end(); ++node) {
+ if (node != children_.begin())
+ query->push_back(L' ');
+ num_words += (*node)->AppendToSQLiteQuery(query);
+ }
+ return num_words;
+}
+
+// A QueryNodePhrase is a phrase query ("quoted").
+class QueryNodePhrase : public QueryNodeList {
+ public:
+ virtual int AppendToSQLiteQuery(std::wstring* query) const {
+ query->push_back(L'"');
+ int num_words = AppendChildrenToString(query);
+ query->push_back(L'"');
+ return num_words;
+ }
+
+ virtual bool Matches(const std::wstring& word, bool exact) const;
+ virtual bool HasMatchIn(const std::vector<std::wstring>& words) const;
+};
+
+bool QueryNodePhrase::Matches(const std::wstring& word, bool exact) const {
+ NOTREACHED();
+ return false;
+}
+
+bool QueryNodePhrase::HasMatchIn(const std::vector<std::wstring>& words) const {
+ if (words.size() < children_.size())
+ return false;
+
+ for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
+ bool matched_all = true;
+ for (size_t j = 0; j < children_.size(); ++j) {
+ if (!children_[j]->Matches(words[i + j], true)) {
+ matched_all = false;
+ break;
+ }
+ }
+ if (matched_all)
+ return true;
+ }
+ return false;
+}
+
+QueryParser::QueryParser() {
+}
+
+// Returns true if the character is considered a quote.
+static bool IsQueryQuote(wchar_t ch) {
+ return ch == '"' ||
+ ch == 0xab || // left pointing double angle bracket
+ ch == 0xbb || // right pointing double angle bracket
+ ch == 0x201c || // left double quotation mark
+ ch == 0x201d || // right double quotation mark
+ ch == 0x201e; // double low-9 quotation mark
+}
+
+int QueryParser::ParseQuery(const std::wstring& query,
+ std::wstring* sqlite_query) {
+ QueryNodeList root;
+ if (!ParseQueryImpl(query, &root))
+ return 0;
+ return root.AppendToSQLiteQuery(sqlite_query);
+}
+
+void QueryParser::ParseQuery(const std::wstring& query,
+ std::vector<QueryNode*>* nodes) {
+ QueryNodeList root;
+ if (ParseQueryImpl(l10n_util::ToLower(query), &root))
+ nodes->swap(*root.children());
+}
+
+bool QueryParser::DoesQueryMatch(const std::wstring& text,
+ const std::vector<QueryNode*>& query_nodes) {
+ if (query_nodes.empty())
+ return false;
+
+ std::vector<std::wstring> query_words;
+ ExtractWords(l10n_util::ToLower(text), &query_words);
+
+ if (query_words.empty())
+ return false;
+
+ for (size_t i = 0; i < query_nodes.size(); ++i) {
+ if (!query_nodes[i]->HasMatchIn(query_words))
+ return false;
+ }
+ return true;
+}
+
+bool QueryParser::ParseQueryImpl(const std::wstring& query,
+ QueryNodeList* root) {
+ WordIterator iter(query, WordIterator::BREAK_WORD);
+ // TODO(evanm): support a locale here
+ if (!iter.Init())
+ return false;
+
+ // To handle nesting, we maintain a stack of QueryNodeLists.
+ // The last element (back) of the stack contains the current, deepest node.
+ std::vector<QueryNodeList*> query_stack;
+ query_stack.push_back(root);
+
+ bool in_quotes = false; // whether we're currently in a quoted phrase
+ while (iter.Advance()) {
+ // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
+ // is not necessarily a word, but could also be a sequence of punctuation
+ // or whitespace.
+ if (iter.IsWord()) {
+ std::wstring word = iter.GetWord();
+
+ QueryNodeWord* word_node = new QueryNodeWord(word);
+ if (in_quotes)
+ word_node->set_literal(true);
+ query_stack.back()->AddChild(word_node);
+ } else { // Punctuation.
+ if (IsQueryQuote(query[iter.prev()])) {
+ if (!in_quotes) {
+ QueryNodeList* quotes_node = new QueryNodePhrase;
+ query_stack.back()->AddChild(quotes_node);
+ query_stack.push_back(quotes_node);
+ in_quotes = true;
+ } else {
+ query_stack.pop_back(); // stop adding to the quoted phrase
+ in_quotes = false;
+ }
+ }
+ }
+ }
+
+ root->RemoveEmptySubnodes();
+ return true;
+}
+
+void QueryParser::ExtractWords(const std::wstring& text,
+ std::vector<std::wstring>* words) {
+ WordIterator iter(text, WordIterator::BREAK_WORD);
+ // TODO(evanm): support a locale here
+ if (!iter.Init())
+ return;
+
+ while (iter.Advance()) {
+ // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
+ // is not necessarily a word, but could also be a sequence of punctuation
+ // or whitespace.
+ if (iter.IsWord()) {
+ std::wstring word = iter.GetWord();
+ if (!word.empty())
+ words->push_back(word);
+ }
+ }
+}
+
+void QueryNodeList::RemoveEmptySubnodes() {
+ for (size_t i = 0; i < children_.size(); ++i) {
+ if (children_[i]->IsWord())
+ continue;
+
+ QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
+ list_node->RemoveEmptySubnodes();
+ if (list_node->children()->empty()) {
+ children_.erase(children_.begin() + i);
+ --i;
+ delete list_node;
+ }
+ }
+}