diff options
author | joaodasilva@chromium.org <joaodasilva@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-12-13 20:36:53 +0000 |
---|---|---|
committer | joaodasilva@chromium.org <joaodasilva@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-12-13 20:36:53 +0000 |
commit | 716c016d95025a8f4d42baab6639b9dc90498f2d (patch) | |
tree | 9efb703e070ecbfb1b73bfac9b350a3b81af14f6 /components/url_matcher/url_matcher.cc | |
parent | 32c90a98f03fa68da4ba3d97a8e56ca70e92a07d (diff) | |
download | chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.zip chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.tar.gz chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.tar.bz2 |
Move extensions/common/matcher into components/url_matcher.
This allows using that code in builds that don't include extensions without
having to introduce layering exceptions. This is meant for inclusion on the
iOS build.
BUG=271392
TBR=brettw@chromium.org
Review URL: https://codereview.chromium.org/113903002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@240736 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'components/url_matcher/url_matcher.cc')
-rw-r--r-- | components/url_matcher/url_matcher.cc | 880 |
1 files changed, 880 insertions, 0 deletions
diff --git a/components/url_matcher/url_matcher.cc b/components/url_matcher/url_matcher.cc new file mode 100644 index 0000000..de8632d --- /dev/null +++ b/components/url_matcher/url_matcher.cc @@ -0,0 +1,880 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/url_matcher/url_matcher.h" + +#include <algorithm> +#include <iterator> + +#include "base/logging.h" +#include "url/gurl.h" +#include "url/url_canon.h" + +namespace url_matcher { + +// This set of classes implement a mapping of URL Component Patterns, such as +// host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns +// for use in substring comparisons. +// +// The idea of this mapping is to reduce the problem of comparing many +// URL Component Patterns against one URL to the problem of searching many +// substrings in one string: +// +// ---------------------- ----------------- +// | URL Query operator | ----translate----> | StringPattern | +// ---------------------- ----------------- +// ^ +// | +// compare +// | +// v +// ---------------------- ----------------- +// | URL to compare | | | +// | to all URL Query | ----translate----> | String | +// | operators | | | +// ---------------------- ----------------- +// +// The reason for this problem reduction is that there are efficient algorithms +// for searching many substrings in one string (see Aho-Corasick algorithm). +// +// Additionally, some of the same pieces are reused to implement regular +// expression comparisons. The FilteredRE2 implementation for matching many +// regular expressions against one string uses prefiltering, in which a set +// of substrings (derived from the regexes) are first searched for, to reduce +// the number of regular expressions to test; the prefiltering step also +// uses Aho-Corasick. +// +// Case 1: {host,path,query}_{prefix,suffix,equals} searches. +// ========================================================== +// +// For searches in this class, we normalize URLs as follows: +// +// Step 1: +// Remove scheme, port and segment from URL: +// -> http://www.example.com:8080/index.html?search=foo#first_match becomes +// www.example.com/index.html?search=foo +// +// We remove the scheme and port number because they can be checked later +// in a secondary filter step. We remove the segment (the #... part) because +// this is not guaranteed to be ASCII-7 encoded. +// +// Step 2: +// Translate URL to String and add the following position markers: +// - BU = Beginning of URL +// - ED = End of Domain +// - EP = End of Path +// - EU = End of URL +// Furthermore, the hostname is canonicalized to start with a ".". +// +// Position markers are represented as characters >127, which are therefore +// guaranteed not to be part of the ASCII-7 encoded URL character set. +// +// -> www.example.com/index.html?search=foo becomes +// BU .www.example.com ED /index.html EP ?search=foo EU +// +// -> www.example.com/index.html becomes +// BU .www.example.com ED /index.html EP EU +// +// Step 3: +// Translate URL Component Patterns as follows: +// +// host_prefix(prefix) = BU add_missing_dot_prefix(prefix) +// -> host_prefix("www.example") = BU .www.example +// +// host_suffix(suffix) = suffix ED +// -> host_suffix("example.com") = example.com ED +// -> host_suffix(".example.com") = .example.com ED +// +// host_equals(domain) = BU add_missing_dot_prefix(domain) ED +// -> host_equals("www.example.com") = BU .www.example.com ED +// +// Similarly for path query parameters ({path, query}_{prefix, suffix, equals}). +// +// With this, we can search the StringPatterns in the normalized URL. +// +// +// Case 2: url_{prefix,suffix,equals,contains} searches. +// ===================================================== +// +// Step 1: as above, except that +// - the scheme is not removed +// - the port is not removed if it is specified and does not match the default +// port for the given scheme. +// +// Step 2: +// Translate URL to String and add the following position markers: +// - BU = Beginning of URL +// - EU = End of URL +// +// -> http://www.example.com:8080/index.html?search=foo#first_match becomes +// BU http://www.example.com:8080/index.html?search=foo EU +// -> http://www.example.com:80/index.html?search=foo#first_match becomes +// BU http://www.example.com/index.html?search=foo EU +// +// url_prefix(prefix) = BU prefix +// -> url_prefix("http://www.example") = BU http://www.example +// +// url_contains(substring) = substring +// -> url_contains("index") = index +// +// +// Case 3: {host,path,query}_contains searches. +// ============================================ +// +// These kinds of searches are not supported directly but can be derived +// by a combination of a url_contains() query followed by an explicit test: +// +// host_contains(str) = url_contains(str) followed by test whether str occurs +// in host component of original URL. +// -> host_contains("example.co") = example.co +// followed by gurl.host().find("example.co"); +// +// [similarly for path_contains and query_contains]. +// +// +// Regular expression matching (url_matches searches) +// ================================================== +// +// This class also supports matching regular expressions (RE2 syntax) +// against full URLs, which are transformed as in case 2. + +namespace { + +bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) { + return criterion == URLMatcherCondition::URL_MATCHES; +} + +bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) { + return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES; +} + +} // namespace + +// +// URLMatcherCondition +// + +URLMatcherCondition::URLMatcherCondition() + : criterion_(HOST_PREFIX), + string_pattern_(NULL) {} + +URLMatcherCondition::~URLMatcherCondition() {} + +URLMatcherCondition::URLMatcherCondition( + Criterion criterion, + const StringPattern* string_pattern) + : criterion_(criterion), + string_pattern_(string_pattern) {} + +URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs) + : criterion_(rhs.criterion_), + string_pattern_(rhs.string_pattern_) {} + +URLMatcherCondition& URLMatcherCondition::operator=( + const URLMatcherCondition& rhs) { + criterion_ = rhs.criterion_; + string_pattern_ = rhs.string_pattern_; + return *this; +} + +bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const { + if (criterion_ < rhs.criterion_) return true; + if (criterion_ > rhs.criterion_) return false; + if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) + return *string_pattern_ < *rhs.string_pattern_; + if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true; + // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, + // or both are NULL. + return false; +} + +bool URLMatcherCondition::IsFullURLCondition() const { + // For these criteria the SubstringMatcher needs to be executed on the + // GURL that is canonicalized with + // URLMatcherConditionFactory::CanonicalizeURLForFullSearches. + switch (criterion_) { + case HOST_CONTAINS: + case PATH_CONTAINS: + case QUERY_CONTAINS: + case URL_PREFIX: + case URL_SUFFIX: + case URL_CONTAINS: + case URL_EQUALS: + return true; + default: + break; + } + return false; +} + +bool URLMatcherCondition::IsRegexCondition() const { + return IsRegexCriterion(criterion_); +} + +bool URLMatcherCondition::IsOriginAndPathRegexCondition() const { + return IsOriginAndPathRegexCriterion(criterion_); +} + +bool URLMatcherCondition::IsMatch( + const std::set<StringPattern::ID>& matching_patterns, + const GURL& url) const { + DCHECK(string_pattern_); + if (!ContainsKey(matching_patterns, string_pattern_->id())) + return false; + // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on + // a substring match on the raw URL. In case of a match, we need to verify + // that the match was found in the correct component of the URL. + switch (criterion_) { + case HOST_CONTAINS: + return url.host().find(string_pattern_->pattern()) != + std::string::npos; + case PATH_CONTAINS: + return url.path().find(string_pattern_->pattern()) != + std::string::npos; + case QUERY_CONTAINS: + return url.query().find(string_pattern_->pattern()) != + std::string::npos; + default: + break; + } + return true; +} + +// +// URLMatcherConditionFactory +// + +namespace { +// These are symbols that are not contained in 7-bit ASCII used in GURLs. +const char kBeginningOfURL[] = {static_cast<char>(-1), 0}; +const char kEndOfDomain[] = {static_cast<char>(-2), 0}; +const char kEndOfPath[] = {static_cast<char>(-3), 0}; +const char kEndOfURL[] = {static_cast<char>(-4), 0}; +} // namespace + +URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {} + +URLMatcherConditionFactory::~URLMatcherConditionFactory() { + STLDeleteElements(&substring_pattern_singletons_); + STLDeleteElements(®ex_pattern_singletons_); + STLDeleteElements(&origin_and_path_regex_pattern_singletons_); +} + +std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches( + const GURL& url) const { + return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain + + url.path() + kEndOfPath + + (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL; +} + +URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition( + const std::string& prefix) { + return CreateCondition(URLMatcherCondition::HOST_PREFIX, + kBeginningOfURL + CanonicalizeHostname(prefix)); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition( + const std::string& suffix) { + return CreateCondition(URLMatcherCondition::HOST_SUFFIX, + suffix + kEndOfDomain); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition( + const std::string& str) { + return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition( + const std::string& str) { + return CreateCondition(URLMatcherCondition::HOST_EQUALS, + kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain); +} + +URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition( + const std::string& prefix) { + return CreateCondition(URLMatcherCondition::PATH_PREFIX, + kEndOfDomain + prefix); +} + +URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition( + const std::string& suffix) { + return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath); +} + +URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition( + const std::string& str) { + return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str); +} + +URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition( + const std::string& str) { + return CreateCondition(URLMatcherCondition::PATH_EQUALS, + kEndOfDomain + str + kEndOfPath); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition( + const std::string& prefix) { + std::string pattern; + if (!prefix.empty() && prefix[0] == '?') + pattern = kEndOfPath + prefix; + else + pattern = kEndOfPath + ('?' + prefix); + + return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition( + const std::string& suffix) { + if (!suffix.empty() && suffix[0] == '?') { + return CreateQueryEqualsCondition(suffix); + } else { + return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, + suffix + kEndOfURL); + } +} + +URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition( + const std::string& str) { + if (!str.empty() && str[0] == '?') + return CreateQueryPrefixCondition(str); + else + return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition( + const std::string& str) { + std::string pattern; + if (!str.empty() && str[0] == '?') + pattern = kEndOfPath + str + kEndOfURL; + else + pattern = kEndOfPath + ('?' + str) + kEndOfURL; + + return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern); +} + +URLMatcherCondition + URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition( + const std::string& host_suffix, + const std::string& path_prefix) { + return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX, + host_suffix + kEndOfDomain + path_prefix); +} + +URLMatcherCondition +URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition( + const std::string& host, + const std::string& path_prefix) { + return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX, + kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain + + path_prefix); +} + +std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches( + const GURL& url) const { + GURL::Replacements replacements; + replacements.ClearPassword(); + replacements.ClearUsername(); + replacements.ClearRef(); + // Clear port if it is implicit from scheme. + if (url.has_port()) { + const std::string& port = url.scheme(); + if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) == + url.EffectiveIntPort()) { + replacements.ClearPort(); + } + } + return kBeginningOfURL + url.ReplaceComponents(replacements).spec() + + kEndOfURL; +} + +static std::string CanonicalizeURLForRegexSearchesHelper( + const GURL& url, + bool clear_query) { + GURL::Replacements replacements; + replacements.ClearPassword(); + replacements.ClearUsername(); + replacements.ClearRef(); + if (clear_query) + replacements.ClearQuery(); + // Clear port if it is implicit from scheme. + if (url.has_port()) { + const std::string& port = url.scheme(); + if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) == + url.EffectiveIntPort()) { + replacements.ClearPort(); + } + } + return url.ReplaceComponents(replacements).spec(); +} + +std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches( + const GURL& url) const { + return CanonicalizeURLForRegexSearchesHelper(url, false); +} + +std::string +URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches( + const GURL& url) const { + return CanonicalizeURLForRegexSearchesHelper(url, true); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition( + const std::string& prefix) { + return CreateCondition(URLMatcherCondition::URL_PREFIX, + kBeginningOfURL + prefix); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition( + const std::string& suffix) { + return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition( + const std::string& str) { + return CreateCondition(URLMatcherCondition::URL_CONTAINS, str); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition( + const std::string& str) { + return CreateCondition(URLMatcherCondition::URL_EQUALS, + kBeginningOfURL + str + kEndOfURL); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition( + const std::string& regex) { + return CreateCondition(URLMatcherCondition::URL_MATCHES, regex); +} + +URLMatcherCondition +URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition( + const std::string& regex) { + return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex); +} + +void URLMatcherConditionFactory::ForgetUnusedPatterns( + const std::set<StringPattern::ID>& used_patterns) { + PatternSingletons::iterator i = substring_pattern_singletons_.begin(); + while (i != substring_pattern_singletons_.end()) { + if (ContainsKey(used_patterns, (*i)->id())) { + ++i; + } else { + delete *i; + substring_pattern_singletons_.erase(i++); + } + } + i = regex_pattern_singletons_.begin(); + while (i != regex_pattern_singletons_.end()) { + if (ContainsKey(used_patterns, (*i)->id())) { + ++i; + } else { + delete *i; + regex_pattern_singletons_.erase(i++); + } + } + i = origin_and_path_regex_pattern_singletons_.begin(); + while (i != origin_and_path_regex_pattern_singletons_.end()) { + if (ContainsKey(used_patterns, (*i)->id())) { + ++i; + } else { + delete *i; + origin_and_path_regex_pattern_singletons_.erase(i++); + } + } +} + +bool URLMatcherConditionFactory::IsEmpty() const { + return substring_pattern_singletons_.empty() && + regex_pattern_singletons_.empty() && + origin_and_path_regex_pattern_singletons_.empty(); +} + +URLMatcherCondition URLMatcherConditionFactory::CreateCondition( + URLMatcherCondition::Criterion criterion, + const std::string& pattern) { + StringPattern search_pattern(pattern, 0); + PatternSingletons* pattern_singletons = NULL; + if (IsRegexCriterion(criterion)) + pattern_singletons = ®ex_pattern_singletons_; + else if (IsOriginAndPathRegexCriterion(criterion)) + pattern_singletons = &origin_and_path_regex_pattern_singletons_; + else + pattern_singletons = &substring_pattern_singletons_; + + PatternSingletons::const_iterator iter = + pattern_singletons->find(&search_pattern); + + if (iter != pattern_singletons->end()) { + return URLMatcherCondition(criterion, *iter); + } else { + StringPattern* new_pattern = + new StringPattern(pattern, id_counter_++); + pattern_singletons->insert(new_pattern); + return URLMatcherCondition(criterion, new_pattern); + } +} + +std::string URLMatcherConditionFactory::CanonicalizeHostname( + const std::string& hostname) const { + if (!hostname.empty() && hostname[0] == '.') + return hostname; + else + return "." + hostname; +} + +bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()( + StringPattern* lhs, + StringPattern* rhs) const { + if (lhs == NULL && rhs != NULL) return true; + if (lhs != NULL && rhs != NULL) + return lhs->pattern() < rhs->pattern(); + // Either both are NULL or only rhs is NULL. + return false; +} + +// +// URLMatcherSchemeFilter +// + +URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter) + : filters_(1) { + filters_.push_back(filter); +} + +URLMatcherSchemeFilter::URLMatcherSchemeFilter( + const std::vector<std::string>& filters) + : filters_(filters) {} + +URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {} + +bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const { + return std::find(filters_.begin(), filters_.end(), url.scheme()) != + filters_.end(); +} + +// +// URLMatcherPortFilter +// + +URLMatcherPortFilter::URLMatcherPortFilter( + const std::vector<URLMatcherPortFilter::Range>& ranges) + : ranges_(ranges) {} + +URLMatcherPortFilter::~URLMatcherPortFilter() {} + +bool URLMatcherPortFilter::IsMatch(const GURL& url) const { + int port = url.EffectiveIntPort(); + for (std::vector<Range>::const_iterator i = ranges_.begin(); + i != ranges_.end(); ++i) { + if (i->first <= port && port <= i->second) + return true; + } + return false; +} + +// static +URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from, + int to) { + return Range(from, to); +} + +// static +URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) { + return Range(port, port); +} + +// +// URLMatcherConditionSet +// + +URLMatcherConditionSet::~URLMatcherConditionSet() {} + +URLMatcherConditionSet::URLMatcherConditionSet( + ID id, + const Conditions& conditions) + : id_(id), + conditions_(conditions) {} + +URLMatcherConditionSet::URLMatcherConditionSet( + ID id, + const Conditions& conditions, + scoped_ptr<URLMatcherSchemeFilter> scheme_filter, + scoped_ptr<URLMatcherPortFilter> port_filter) + : id_(id), + conditions_(conditions), + scheme_filter_(scheme_filter.Pass()), + port_filter_(port_filter.Pass()) {} + +bool URLMatcherConditionSet::IsMatch( + const std::set<StringPattern::ID>& matching_patterns, + const GURL& url) const { + for (Conditions::const_iterator i = conditions_.begin(); + i != conditions_.end(); ++i) { + if (!i->IsMatch(matching_patterns, url)) + return false; + } + if (scheme_filter_.get() && !scheme_filter_->IsMatch(url)) + return false; + if (port_filter_.get() && !port_filter_->IsMatch(url)) + return false; + return true; +} + +// +// URLMatcher +// + +URLMatcher::URLMatcher() {} + +URLMatcher::~URLMatcher() {} + +void URLMatcher::AddConditionSets( + const URLMatcherConditionSet::Vector& condition_sets) { + for (URLMatcherConditionSet::Vector::const_iterator i = + condition_sets.begin(); i != condition_sets.end(); ++i) { + DCHECK(url_matcher_condition_sets_.find((*i)->id()) == + url_matcher_condition_sets_.end()); + url_matcher_condition_sets_[(*i)->id()] = *i; + } + UpdateInternalDatastructures(); +} + +void URLMatcher::RemoveConditionSets( + const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) { + for (std::vector<URLMatcherConditionSet::ID>::const_iterator i = + condition_set_ids.begin(); i != condition_set_ids.end(); ++i) { + DCHECK(url_matcher_condition_sets_.find(*i) != + url_matcher_condition_sets_.end()); + url_matcher_condition_sets_.erase(*i); + } + UpdateInternalDatastructures(); +} + +void URLMatcher::ClearUnusedConditionSets() { + UpdateConditionFactory(); +} + +std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL( + const GURL& url) const { + // Find all IDs of StringPatterns that match |url|. + // See URLMatcherConditionFactory for the canonicalization of URLs and the + // distinction between full url searches and url component searches. + std::set<StringPattern::ID> matches; + if (!full_url_matcher_.IsEmpty()) { + full_url_matcher_.Match( + condition_factory_.CanonicalizeURLForFullSearches(url), &matches); + } + if (!url_component_matcher_.IsEmpty()) { + url_component_matcher_.Match( + condition_factory_.CanonicalizeURLForComponentSearches(url), &matches); + } + if (!regex_set_matcher_.IsEmpty()) { + regex_set_matcher_.Match( + condition_factory_.CanonicalizeURLForRegexSearches(url), &matches); + } + if (!origin_and_path_regex_set_matcher_.IsEmpty()) { + origin_and_path_regex_set_matcher_.Match( + condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url), + &matches); + } + + // Calculate all URLMatcherConditionSets for which all URLMatcherConditions + // were fulfilled. + std::set<URLMatcherConditionSet::ID> result; + for (std::set<StringPattern::ID>::const_iterator i = matches.begin(); + i != matches.end(); ++i) { + // For each URLMatcherConditionSet there is exactly one condition + // registered in substring_match_triggers_. This means that the following + // logic tests each URLMatcherConditionSet exactly once if it can be + // completely fulfilled. + StringPatternTriggers::const_iterator triggered_condition_sets_iter = + substring_match_triggers_.find(*i); + if (triggered_condition_sets_iter == substring_match_triggers_.end()) + continue; // Not all substring matches are triggers for a condition set. + const std::set<URLMatcherConditionSet::ID>& condition_sets = + triggered_condition_sets_iter->second; + for (std::set<URLMatcherConditionSet::ID>::const_iterator j = + condition_sets.begin(); j != condition_sets.end(); ++j) { + URLMatcherConditionSets::const_iterator condition_set_iter = + url_matcher_condition_sets_.find(*j); + DCHECK(condition_set_iter != url_matcher_condition_sets_.end()); + if (condition_set_iter->second->IsMatch(matches, url)) + result.insert(*j); + } + } + + return result; +} + +bool URLMatcher::IsEmpty() const { + return condition_factory_.IsEmpty() && + url_matcher_condition_sets_.empty() && + substring_match_triggers_.empty() && + full_url_matcher_.IsEmpty() && + url_component_matcher_.IsEmpty() && + regex_set_matcher_.IsEmpty() && + origin_and_path_regex_set_matcher_.IsEmpty() && + registered_full_url_patterns_.empty() && + registered_url_component_patterns_.empty(); +} + +void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) { + // The purpose of |full_url_conditions| is just that we need to execute + // the same logic once for Full URL searches and once for URL Component + // searches (see URLMatcherConditionFactory). + + // Determine which patterns need to be registered when this function + // terminates. + std::set<const StringPattern*> new_patterns; + for (URLMatcherConditionSets::const_iterator condition_set_iter = + url_matcher_condition_sets_.begin(); + condition_set_iter != url_matcher_condition_sets_.end(); + ++condition_set_iter) { + const URLMatcherConditionSet::Conditions& conditions = + condition_set_iter->second->conditions(); + for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = + conditions.begin(); condition_iter != conditions.end(); + ++condition_iter) { + // If we are called to process Full URL searches, ignore others, and + // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.) + if (!condition_iter->IsRegexCondition() && + !condition_iter->IsOriginAndPathRegexCondition() && + full_url_conditions == condition_iter->IsFullURLCondition()) + new_patterns.insert(condition_iter->string_pattern()); + } + } + + // This is the set of patterns that were registered before this function + // is called. + std::set<const StringPattern*>& registered_patterns = + full_url_conditions ? registered_full_url_patterns_ + : registered_url_component_patterns_; + + // Add all patterns that are in new_patterns but not in registered_patterns. + std::vector<const StringPattern*> patterns_to_register = + base::STLSetDifference<std::vector<const StringPattern*> >( + new_patterns, registered_patterns); + + // Remove all patterns that are in registered_patterns but not in + // new_patterns. + std::vector<const StringPattern*> patterns_to_unregister = + base::STLSetDifference<std::vector<const StringPattern*> >( + registered_patterns, new_patterns); + + // Update the SubstringSetMatcher. + SubstringSetMatcher& url_matcher = + full_url_conditions ? full_url_matcher_ : url_component_matcher_; + url_matcher.RegisterAndUnregisterPatterns(patterns_to_register, + patterns_to_unregister); + + // Update the set of registered_patterns for the next time this function + // is being called. + registered_patterns.swap(new_patterns); +} + +void URLMatcher::UpdateRegexSetMatcher() { + std::vector<const StringPattern*> new_patterns; + std::vector<const StringPattern*> new_origin_and_path_patterns; + + for (URLMatcherConditionSets::const_iterator condition_set_iter = + url_matcher_condition_sets_.begin(); + condition_set_iter != url_matcher_condition_sets_.end(); + ++condition_set_iter) { + const URLMatcherConditionSet::Conditions& conditions = + condition_set_iter->second->conditions(); + for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = + conditions.begin(); condition_iter != conditions.end(); + ++condition_iter) { + if (condition_iter->IsRegexCondition()) { + new_patterns.push_back(condition_iter->string_pattern()); + } else if (condition_iter->IsOriginAndPathRegexCondition()) { + new_origin_and_path_patterns.push_back( + condition_iter->string_pattern()); + } + } + } + + // Start over from scratch. We can't really do better than this, since the + // FilteredRE2 backend doesn't support incremental updates. + regex_set_matcher_.ClearPatterns(); + regex_set_matcher_.AddPatterns(new_patterns); + origin_and_path_regex_set_matcher_.ClearPatterns(); + origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns); +} + +void URLMatcher::UpdateTriggers() { + // Count substring pattern frequencies. + std::map<StringPattern::ID, size_t> substring_pattern_frequencies; + for (URLMatcherConditionSets::const_iterator condition_set_iter = + url_matcher_condition_sets_.begin(); + condition_set_iter != url_matcher_condition_sets_.end(); + ++condition_set_iter) { + const URLMatcherConditionSet::Conditions& conditions = + condition_set_iter->second->conditions(); + for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = + conditions.begin(); condition_iter != conditions.end(); + ++condition_iter) { + const StringPattern* pattern = condition_iter->string_pattern(); + substring_pattern_frequencies[pattern->id()]++; + } + } + + // Update trigger conditions: Determine for each URLMatcherConditionSet which + // URLMatcherCondition contains a StringPattern that occurs least + // frequently in this URLMatcher. We assume that this condition is very + // specific and occurs rarely in URLs. If a match occurs for this + // URLMatcherCondition, we want to test all other URLMatcherCondition in the + // respective URLMatcherConditionSet as well to see whether the entire + // URLMatcherConditionSet is considered matching. + substring_match_triggers_.clear(); + for (URLMatcherConditionSets::const_iterator condition_set_iter = + url_matcher_condition_sets_.begin(); + condition_set_iter != url_matcher_condition_sets_.end(); + ++condition_set_iter) { + const URLMatcherConditionSet::Conditions& conditions = + condition_set_iter->second->conditions(); + if (conditions.empty()) + continue; + URLMatcherConditionSet::Conditions::const_iterator condition_iter = + conditions.begin(); + StringPattern::ID trigger = condition_iter->string_pattern()->id(); + // We skip the first element in the following loop. + ++condition_iter; + for (; condition_iter != conditions.end(); ++condition_iter) { + StringPattern::ID current_id = + condition_iter->string_pattern()->id(); + if (substring_pattern_frequencies[trigger] > + substring_pattern_frequencies[current_id]) { + trigger = current_id; + } + } + substring_match_triggers_[trigger].insert(condition_set_iter->second->id()); + } +} + +void URLMatcher::UpdateConditionFactory() { + std::set<StringPattern::ID> used_patterns; + for (URLMatcherConditionSets::const_iterator condition_set_iter = + url_matcher_condition_sets_.begin(); + condition_set_iter != url_matcher_condition_sets_.end(); + ++condition_set_iter) { + const URLMatcherConditionSet::Conditions& conditions = + condition_set_iter->second->conditions(); + for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = + conditions.begin(); condition_iter != conditions.end(); + ++condition_iter) { + used_patterns.insert(condition_iter->string_pattern()->id()); + } + } + condition_factory_.ForgetUnusedPatterns(used_patterns); +} + +void URLMatcher::UpdateInternalDatastructures() { + UpdateSubstringSetMatcher(false); + UpdateSubstringSetMatcher(true); + UpdateRegexSetMatcher(); + UpdateTriggers(); + UpdateConditionFactory(); +} + +} // namespace url_matcher |