summaryrefslogtreecommitdiffstats
path: root/components/url_matcher/substring_set_matcher.h
diff options
context:
space:
mode:
authorjoaodasilva@chromium.org <joaodasilva@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-12-13 20:36:53 +0000
committerjoaodasilva@chromium.org <joaodasilva@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-12-13 20:36:53 +0000
commit716c016d95025a8f4d42baab6639b9dc90498f2d (patch)
tree9efb703e070ecbfb1b73bfac9b350a3b81af14f6 /components/url_matcher/substring_set_matcher.h
parent32c90a98f03fa68da4ba3d97a8e56ca70e92a07d (diff)
downloadchromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.zip
chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.tar.gz
chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.tar.bz2
Move extensions/common/matcher into components/url_matcher.
This allows using that code in builds that don't include extensions without having to introduce layering exceptions. This is meant for inclusion on the iOS build. BUG=271392 TBR=brettw@chromium.org Review URL: https://codereview.chromium.org/113903002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@240736 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'components/url_matcher/substring_set_matcher.h')
-rw-r--r--components/url_matcher/substring_set_matcher.h141
1 files changed, 141 insertions, 0 deletions
diff --git a/components/url_matcher/substring_set_matcher.h b/components/url_matcher/substring_set_matcher.h
new file mode 100644
index 0000000..8a05ef3
--- /dev/null
+++ b/components/url_matcher/substring_set_matcher.h
@@ -0,0 +1,141 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
+#define COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
+
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "base/basictypes.h"
+#include "components/url_matcher/string_pattern.h"
+#include "components/url_matcher/url_matcher_export.h"
+
+namespace url_matcher {
+
+// Class that store a set of string patterns and can find for a string S,
+// which string patterns occur in S.
+class URL_MATCHER_EXPORT SubstringSetMatcher {
+ public:
+ SubstringSetMatcher();
+ ~SubstringSetMatcher();
+
+ // Registers all |patterns|. The ownership remains with the caller.
+ // The same pattern cannot be registered twice and each pattern needs to have
+ // a unique ID.
+ // Ownership of the patterns remains with the caller.
+ void RegisterPatterns(const std::vector<const StringPattern*>& patterns);
+
+ // Unregisters the passed |patterns|.
+ void UnregisterPatterns(const std::vector<const StringPattern*>& patterns);
+
+ // Analogous to RegisterPatterns and UnregisterPatterns but executes both
+ // operations in one step, which is cheaper in the execution.
+ void RegisterAndUnregisterPatterns(
+ const std::vector<const StringPattern*>& to_register,
+ const std::vector<const StringPattern*>& to_unregister);
+
+ // Matches |text| against all registered StringPatterns. Stores the IDs
+ // of matching patterns in |matches|. |matches| is not cleared before adding
+ // to it.
+ bool Match(const std::string& text,
+ std::set<StringPattern::ID>* matches) const;
+
+ // Returns true if this object retains no allocated data. Only for debugging.
+ bool IsEmpty() const;
+
+ private:
+ // A node of an Aho Corasick Tree. This is implemented according to
+ // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
+ //
+ // The algorithm is based on the idea of building a trie of all registered
+ // patterns. Each node of the tree is annotated with a set of pattern
+ // IDs that are used to report matches.
+ //
+ // The root of the trie represents an empty match. If we were looking whether
+ // any registered pattern matches a text at the beginning of the text (i.e.
+ // whether any pattern is a prefix of the text), we could just follow
+ // nodes in the trie according to the matching characters in the text.
+ // E.g., if text == "foobar", we would follow the trie from the root node
+ // to its child labeled 'f', from there to child 'o', etc. In this process we
+ // would report all pattern IDs associated with the trie nodes as matches.
+ //
+ // As we are not looking for all prefix matches but all substring matches,
+ // this algorithm would need to compare text.substr(0), text.substr(1), ...
+ // against the trie, which is in O(|text|^2).
+ //
+ // The Aho Corasick algorithm improves this runtime by using failure edges.
+ // In case we have found a partial match of length k in the text
+ // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
+ // a node at depth k, but cannot find a match in the trie for character
+ // text[i + k] at depth k + 1, we follow a failure edge. This edge
+ // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
+ // is a prefix of any registered pattern.
+ //
+ // If your brain thinks "Forget it, let's go shopping.", don't worry.
+ // Take a nap and read an introductory text on the Aho Corasick algorithm.
+ // It will make sense. Eventually.
+ class AhoCorasickNode {
+ public:
+ // Key: label of the edge, value: node index in |tree_| of parent class.
+ typedef std::map<char, uint32> Edges;
+ typedef std::set<StringPattern::ID> Matches;
+
+ static const uint32 kNoSuchEdge; // Represents an invalid node index.
+
+ AhoCorasickNode();
+ ~AhoCorasickNode();
+ AhoCorasickNode(const AhoCorasickNode& other);
+ AhoCorasickNode& operator=(const AhoCorasickNode& other);
+
+ uint32 GetEdge(char c) const;
+ void SetEdge(char c, uint32 node);
+ const Edges& edges() const { return edges_; }
+
+ uint32 failure() const { return failure_; }
+ void set_failure(uint32 failure) { failure_ = failure; }
+
+ void AddMatch(StringPattern::ID id);
+ void AddMatches(const Matches& matches);
+ const Matches& matches() const { return matches_; }
+
+ private:
+ // Outgoing edges of current node.
+ Edges edges_;
+
+ // Node index that failure edge leads to.
+ uint32 failure_;
+
+ // Identifiers of matches.
+ Matches matches_;
+ };
+
+ typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap;
+ typedef std::vector<const StringPattern*> SubstringPatternVector;
+
+ // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string.
+ void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns);
+
+ // Inserts a path for |pattern->pattern()| into the tree and adds
+ // |pattern->id()| to the set of matches. Ownership of |pattern| remains with
+ // the caller.
+ void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern);
+ void CreateFailureEdges();
+
+ // Set of all registered StringPatterns. Used to regenerate the
+ // Aho-Corasick tree in case patterns are registered or unregistered.
+ SubstringPatternMap patterns_;
+
+ // The nodes of a Aho-Corasick tree.
+ std::vector<AhoCorasickNode> tree_;
+
+ DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher);
+};
+
+} // namespace url_matcher
+
+#endif // COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_