diff options
author | joaodasilva@chromium.org <joaodasilva@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-12-13 20:36:53 +0000 |
---|---|---|
committer | joaodasilva@chromium.org <joaodasilva@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-12-13 20:36:53 +0000 |
commit | 716c016d95025a8f4d42baab6639b9dc90498f2d (patch) | |
tree | 9efb703e070ecbfb1b73bfac9b350a3b81af14f6 /components/url_matcher/substring_set_matcher.h | |
parent | 32c90a98f03fa68da4ba3d97a8e56ca70e92a07d (diff) | |
download | chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.zip chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.tar.gz chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.tar.bz2 |
Move extensions/common/matcher into components/url_matcher.
This allows using that code in builds that don't include extensions without
having to introduce layering exceptions. This is meant for inclusion on the
iOS build.
BUG=271392
TBR=brettw@chromium.org
Review URL: https://codereview.chromium.org/113903002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@240736 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'components/url_matcher/substring_set_matcher.h')
-rw-r--r-- | components/url_matcher/substring_set_matcher.h | 141 |
1 files changed, 141 insertions, 0 deletions
diff --git a/components/url_matcher/substring_set_matcher.h b/components/url_matcher/substring_set_matcher.h new file mode 100644 index 0000000..8a05ef3 --- /dev/null +++ b/components/url_matcher/substring_set_matcher.h @@ -0,0 +1,141 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_ +#define COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_ + +#include <limits> +#include <map> +#include <set> +#include <string> +#include <vector> + +#include "base/basictypes.h" +#include "components/url_matcher/string_pattern.h" +#include "components/url_matcher/url_matcher_export.h" + +namespace url_matcher { + +// Class that store a set of string patterns and can find for a string S, +// which string patterns occur in S. +class URL_MATCHER_EXPORT SubstringSetMatcher { + public: + SubstringSetMatcher(); + ~SubstringSetMatcher(); + + // Registers all |patterns|. The ownership remains with the caller. + // The same pattern cannot be registered twice and each pattern needs to have + // a unique ID. + // Ownership of the patterns remains with the caller. + void RegisterPatterns(const std::vector<const StringPattern*>& patterns); + + // Unregisters the passed |patterns|. + void UnregisterPatterns(const std::vector<const StringPattern*>& patterns); + + // Analogous to RegisterPatterns and UnregisterPatterns but executes both + // operations in one step, which is cheaper in the execution. + void RegisterAndUnregisterPatterns( + const std::vector<const StringPattern*>& to_register, + const std::vector<const StringPattern*>& to_unregister); + + // Matches |text| against all registered StringPatterns. Stores the IDs + // of matching patterns in |matches|. |matches| is not cleared before adding + // to it. + bool Match(const std::string& text, + std::set<StringPattern::ID>* matches) const; + + // Returns true if this object retains no allocated data. Only for debugging. + bool IsEmpty() const; + + private: + // A node of an Aho Corasick Tree. This is implemented according to + // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf + // + // The algorithm is based on the idea of building a trie of all registered + // patterns. Each node of the tree is annotated with a set of pattern + // IDs that are used to report matches. + // + // The root of the trie represents an empty match. If we were looking whether + // any registered pattern matches a text at the beginning of the text (i.e. + // whether any pattern is a prefix of the text), we could just follow + // nodes in the trie according to the matching characters in the text. + // E.g., if text == "foobar", we would follow the trie from the root node + // to its child labeled 'f', from there to child 'o', etc. In this process we + // would report all pattern IDs associated with the trie nodes as matches. + // + // As we are not looking for all prefix matches but all substring matches, + // this algorithm would need to compare text.substr(0), text.substr(1), ... + // against the trie, which is in O(|text|^2). + // + // The Aho Corasick algorithm improves this runtime by using failure edges. + // In case we have found a partial match of length k in the text + // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at + // a node at depth k, but cannot find a match in the trie for character + // text[i + k] at depth k + 1, we follow a failure edge. This edge + // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that + // is a prefix of any registered pattern. + // + // If your brain thinks "Forget it, let's go shopping.", don't worry. + // Take a nap and read an introductory text on the Aho Corasick algorithm. + // It will make sense. Eventually. + class AhoCorasickNode { + public: + // Key: label of the edge, value: node index in |tree_| of parent class. + typedef std::map<char, uint32> Edges; + typedef std::set<StringPattern::ID> Matches; + + static const uint32 kNoSuchEdge; // Represents an invalid node index. + + AhoCorasickNode(); + ~AhoCorasickNode(); + AhoCorasickNode(const AhoCorasickNode& other); + AhoCorasickNode& operator=(const AhoCorasickNode& other); + + uint32 GetEdge(char c) const; + void SetEdge(char c, uint32 node); + const Edges& edges() const { return edges_; } + + uint32 failure() const { return failure_; } + void set_failure(uint32 failure) { failure_ = failure; } + + void AddMatch(StringPattern::ID id); + void AddMatches(const Matches& matches); + const Matches& matches() const { return matches_; } + + private: + // Outgoing edges of current node. + Edges edges_; + + // Node index that failure edge leads to. + uint32 failure_; + + // Identifiers of matches. + Matches matches_; + }; + + typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap; + typedef std::vector<const StringPattern*> SubstringPatternVector; + + // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string. + void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns); + + // Inserts a path for |pattern->pattern()| into the tree and adds + // |pattern->id()| to the set of matches. Ownership of |pattern| remains with + // the caller. + void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern); + void CreateFailureEdges(); + + // Set of all registered StringPatterns. Used to regenerate the + // Aho-Corasick tree in case patterns are registered or unregistered. + SubstringPatternMap patterns_; + + // The nodes of a Aho-Corasick tree. + std::vector<AhoCorasickNode> tree_; + + DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher); +}; + +} // namespace url_matcher + +#endif // COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_ |