Move extensions/common/matcher into components/url_matcher.

This allows using that code in builds that don't include extensions without having to introduce layering exceptions. This is meant for inclusion on the iOS build. BUG=271392 TBR=brettw@chromium.org Review URL: https://codereview.chromium.org/113903002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@240736 0039d316-1c4b-4281-b951-d872f2087c98
author: joaodasilva@chromium.org <joaodasilva@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-12-13 20:36:53 +0000
committer: joaodasilva@chromium.org <joaodasilva@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-12-13 20:36:53 +0000
commit: 716c016d95025a8f4d42baab6639b9dc90498f2d (patch)
tree: 9efb703e070ecbfb1b73bfac9b350a3b81af14f6 /components/url_matcher/url_matcher.cc
parent: 32c90a98f03fa68da4ba3d97a8e56ca70e92a07d (diff)
download: chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.zip
chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.tar.gz
chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.tar.bz2
1 files changed, 880 insertions, 0 deletions
diff --git a/components/url_matcher/url_matcher.cc b/components/url_matcher/url_matcher.cc
new file mode 100644
index 0000000..de8632d
--- /dev/null
+++ b/components/url_matcher/url_matcher.cc
@@ -0,0 +1,880 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/url_matcher/url_matcher.h"
+
+#include <algorithm>
+#include <iterator>
+
+#include "base/logging.h"
+#include "url/gurl.h"
+#include "url/url_canon.h"
+
+namespace url_matcher {
+
+// This set of classes implement a mapping of URL Component Patterns, such as
+// host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
+// for use in substring comparisons.
+//
+// The idea of this mapping is to reduce the problem of comparing many
+// URL Component Patterns against one URL to the problem of searching many
+// substrings in one string:
+//
+// ----------------------                    -----------------
+// | URL Query operator | ----translate----> | StringPattern |
+// ----------------------                    -----------------
+//                                                   ^
+//                                                   |
+//                                                compare
+//                                                   |
+//                                                   v
+// ----------------------                    -----------------
+// | URL to compare     |                    |               |
+// | to all URL Query   | ----translate----> | String        |
+// | operators          |                    |               |
+// ----------------------                    -----------------
+//
+// The reason for this problem reduction is that there are efficient algorithms
+// for searching many substrings in one string (see Aho-Corasick algorithm).
+//
+// Additionally, some of the same pieces are reused to implement regular
+// expression comparisons. The FilteredRE2 implementation for matching many
+// regular expressions against one string uses prefiltering, in which a set
+// of substrings (derived from the regexes) are first searched for, to reduce
+// the number of regular expressions to test; the prefiltering step also
+// uses Aho-Corasick.
+//
+// Case 1: {host,path,query}_{prefix,suffix,equals} searches.
+// ==========================================================
+//
+// For searches in this class, we normalize URLs as follows:
+//
+// Step 1:
+// Remove scheme, port and segment from URL:
+// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
+//    www.example.com/index.html?search=foo
+//
+// We remove the scheme and port number because they can be checked later
+// in a secondary filter step. We remove the segment (the #... part) because
+// this is not guaranteed to be ASCII-7 encoded.
+//
+// Step 2:
+// Translate URL to String and add the following position markers:
+// - BU = Beginning of URL
+// - ED = End of Domain
+// - EP = End of Path
+// - EU = End of URL
+// Furthermore, the hostname is canonicalized to start with a ".".
+//
+// Position markers are represented as characters >127, which are therefore
+// guaranteed not to be part of the ASCII-7 encoded URL character set.
+//
+// -> www.example.com/index.html?search=foo becomes
+// BU .www.example.com ED /index.html EP ?search=foo EU
+//
+// -> www.example.com/index.html becomes
+// BU .www.example.com ED /index.html EP EU
+//
+// Step 3:
+// Translate URL Component Patterns as follows:
+//
+// host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
+// -> host_prefix("www.example") = BU .www.example
+//
+// host_suffix(suffix) = suffix ED
+// -> host_suffix("example.com") = example.com ED
+// -> host_suffix(".example.com") = .example.com ED
+//
+// host_equals(domain) = BU add_missing_dot_prefix(domain) ED
+// -> host_equals("www.example.com") = BU .www.example.com ED
+//
+// Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
+//
+// With this, we can search the StringPatterns in the normalized URL.
+//
+//
+// Case 2: url_{prefix,suffix,equals,contains} searches.
+// =====================================================
+//
+// Step 1: as above, except that
+// - the scheme is not removed
+// - the port is not removed if it is specified and does not match the default
+//   port for the given scheme.
+//
+// Step 2:
+// Translate URL to String and add the following position markers:
+// - BU = Beginning of URL
+// - EU = End of URL
+//
+// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
+// BU http://www.example.com:8080/index.html?search=foo EU
+// -> http://www.example.com:80/index.html?search=foo#first_match becomes
+// BU http://www.example.com/index.html?search=foo EU
+//
+// url_prefix(prefix) = BU prefix
+// -> url_prefix("http://www.example") = BU http://www.example
+//
+// url_contains(substring) = substring
+// -> url_contains("index") = index
+//
+//
+// Case 3: {host,path,query}_contains searches.
+// ============================================
+//
+// These kinds of searches are not supported directly but can be derived
+// by a combination of a url_contains() query followed by an explicit test:
+//
+// host_contains(str) = url_contains(str) followed by test whether str occurs
+//   in host component of original URL.
+// -> host_contains("example.co") = example.co
+//    followed by gurl.host().find("example.co");
+//
+// [similarly for path_contains and query_contains].
+//
+//
+// Regular expression matching (url_matches searches)
+// ==================================================
+//
+// This class also supports matching regular expressions (RE2 syntax)
+// against full URLs, which are transformed as in case 2.
+
+namespace {
+
+bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
+  return criterion == URLMatcherCondition::URL_MATCHES;
+}
+
+bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
+  return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
+}
+
+}  // namespace
+
+//
+// URLMatcherCondition
+//
+
+URLMatcherCondition::URLMatcherCondition()
+    : criterion_(HOST_PREFIX),
+      string_pattern_(NULL) {}
+
+URLMatcherCondition::~URLMatcherCondition() {}
+
+URLMatcherCondition::URLMatcherCondition(
+    Criterion criterion,
+    const StringPattern* string_pattern)
+    : criterion_(criterion),
+      string_pattern_(string_pattern) {}
+
+URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
+    : criterion_(rhs.criterion_),
+      string_pattern_(rhs.string_pattern_) {}
+
+URLMatcherCondition& URLMatcherCondition::operator=(
+    const URLMatcherCondition& rhs) {
+  criterion_ = rhs.criterion_;
+  string_pattern_ = rhs.string_pattern_;
+  return *this;
+}
+
+bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
+  if (criterion_ < rhs.criterion_) return true;
+  if (criterion_ > rhs.criterion_) return false;
+  if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
+    return *string_pattern_ < *rhs.string_pattern_;
+  if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
+  // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
+  // or both are NULL.
+  return false;
+}
+
+bool URLMatcherCondition::IsFullURLCondition() const {
+  // For these criteria the SubstringMatcher needs to be executed on the
+  // GURL that is canonicalized with
+  // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
+  switch (criterion_) {
+    case HOST_CONTAINS:
+    case PATH_CONTAINS:
+    case QUERY_CONTAINS:
+    case URL_PREFIX:
+    case URL_SUFFIX:
+    case URL_CONTAINS:
+    case URL_EQUALS:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+bool URLMatcherCondition::IsRegexCondition() const {
+  return IsRegexCriterion(criterion_);
+}
+
+bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
+  return IsOriginAndPathRegexCriterion(criterion_);
+}
+
+bool URLMatcherCondition::IsMatch(
+    const std::set<StringPattern::ID>& matching_patterns,
+    const GURL& url) const {
+  DCHECK(string_pattern_);
+  if (!ContainsKey(matching_patterns, string_pattern_->id()))
+    return false;
+  // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
+  // a substring match on the raw URL. In case of a match, we need to verify
+  // that the match was found in the correct component of the URL.
+  switch (criterion_) {
+    case HOST_CONTAINS:
+      return url.host().find(string_pattern_->pattern()) !=
+          std::string::npos;
+    case PATH_CONTAINS:
+      return url.path().find(string_pattern_->pattern()) !=
+          std::string::npos;
+    case QUERY_CONTAINS:
+      return url.query().find(string_pattern_->pattern()) !=
+          std::string::npos;
+    default:
+      break;
+  }
+  return true;
+}
+
+//
+// URLMatcherConditionFactory
+//
+
+namespace {
+// These are symbols that are not contained in 7-bit ASCII used in GURLs.
+const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
+const char kEndOfDomain[] = {static_cast<char>(-2), 0};
+const char kEndOfPath[] = {static_cast<char>(-3), 0};
+const char kEndOfURL[] = {static_cast<char>(-4), 0};
+}  // namespace
+
+URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
+
+URLMatcherConditionFactory::~URLMatcherConditionFactory() {
+  STLDeleteElements(&substring_pattern_singletons_);
+  STLDeleteElements(&regex_pattern_singletons_);
+  STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
+}
+
+std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
+    const GURL& url) const {
+  return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
+         url.path() + kEndOfPath +
+         (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL;
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
+    const std::string& prefix) {
+  return CreateCondition(URLMatcherCondition::HOST_PREFIX,
+      kBeginningOfURL + CanonicalizeHostname(prefix));
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
+    const std::string& suffix) {
+  return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
+      suffix + kEndOfDomain);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
+    const std::string& str) {
+  return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
+    const std::string& str) {
+  return CreateCondition(URLMatcherCondition::HOST_EQUALS,
+      kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
+    const std::string& prefix) {
+  return CreateCondition(URLMatcherCondition::PATH_PREFIX,
+      kEndOfDomain + prefix);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
+    const std::string& suffix) {
+  return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
+    const std::string& str) {
+  return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
+    const std::string& str) {
+  return CreateCondition(URLMatcherCondition::PATH_EQUALS,
+      kEndOfDomain + str + kEndOfPath);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
+    const std::string& prefix) {
+  std::string pattern;
+  if (!prefix.empty() && prefix[0] == '?')
+    pattern = kEndOfPath + prefix;
+  else
+    pattern = kEndOfPath + ('?' + prefix);
+
+  return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
+    const std::string& suffix) {
+  if (!suffix.empty() && suffix[0] == '?') {
+    return CreateQueryEqualsCondition(suffix);
+  } else {
+    return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
+                           suffix + kEndOfURL);
+  }
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
+    const std::string& str) {
+  if (!str.empty() && str[0] == '?')
+    return CreateQueryPrefixCondition(str);
+  else
+    return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
+    const std::string& str) {
+  std::string pattern;
+  if (!str.empty() && str[0] == '?')
+    pattern = kEndOfPath + str + kEndOfURL;
+  else
+    pattern = kEndOfPath + ('?' + str) + kEndOfURL;
+
+  return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
+}
+
+URLMatcherCondition
+    URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
+    const std::string& host_suffix,
+    const std::string& path_prefix) {
+  return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
+      host_suffix + kEndOfDomain + path_prefix);
+}
+
+URLMatcherCondition
+URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
+    const std::string& host,
+    const std::string& path_prefix) {
+  return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
+      kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
+      path_prefix);
+}
+
+std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
+    const GURL& url) const {
+  GURL::Replacements replacements;
+  replacements.ClearPassword();
+  replacements.ClearUsername();
+  replacements.ClearRef();
+  // Clear port if it is implicit from scheme.
+  if (url.has_port()) {
+    const std::string& port = url.scheme();
+    if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
+            url.EffectiveIntPort()) {
+      replacements.ClearPort();
+    }
+  }
+  return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
+      kEndOfURL;
+}
+
+static std::string CanonicalizeURLForRegexSearchesHelper(
+    const GURL& url,
+    bool clear_query) {
+  GURL::Replacements replacements;
+  replacements.ClearPassword();
+  replacements.ClearUsername();
+  replacements.ClearRef();
+  if (clear_query)
+    replacements.ClearQuery();
+  // Clear port if it is implicit from scheme.
+  if (url.has_port()) {
+    const std::string& port = url.scheme();
+    if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
+            url.EffectiveIntPort()) {
+      replacements.ClearPort();
+    }
+  }
+  return url.ReplaceComponents(replacements).spec();
+}
+
+std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
+    const GURL& url) const {
+  return CanonicalizeURLForRegexSearchesHelper(url, false);
+}
+
+std::string
+URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
+    const GURL& url) const {
+  return CanonicalizeURLForRegexSearchesHelper(url, true);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
+    const std::string& prefix) {
+  return CreateCondition(URLMatcherCondition::URL_PREFIX,
+      kBeginningOfURL + prefix);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
+    const std::string& suffix) {
+  return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
+    const std::string& str) {
+  return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
+    const std::string& str) {
+  return CreateCondition(URLMatcherCondition::URL_EQUALS,
+      kBeginningOfURL + str + kEndOfURL);
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
+    const std::string& regex) {
+  return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
+}
+
+URLMatcherCondition
+URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
+    const std::string& regex) {
+  return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
+}
+
+void URLMatcherConditionFactory::ForgetUnusedPatterns(
+      const std::set<StringPattern::ID>& used_patterns) {
+  PatternSingletons::iterator i = substring_pattern_singletons_.begin();
+  while (i != substring_pattern_singletons_.end()) {
+    if (ContainsKey(used_patterns, (*i)->id())) {
+      ++i;
+    } else {
+      delete *i;
+      substring_pattern_singletons_.erase(i++);
+    }
+  }
+  i = regex_pattern_singletons_.begin();
+  while (i != regex_pattern_singletons_.end()) {
+    if (ContainsKey(used_patterns, (*i)->id())) {
+      ++i;
+    } else {
+      delete *i;
+      regex_pattern_singletons_.erase(i++);
+    }
+  }
+  i = origin_and_path_regex_pattern_singletons_.begin();
+  while (i != origin_and_path_regex_pattern_singletons_.end()) {
+    if (ContainsKey(used_patterns, (*i)->id())) {
+      ++i;
+    } else {
+      delete *i;
+      origin_and_path_regex_pattern_singletons_.erase(i++);
+    }
+  }
+}
+
+bool URLMatcherConditionFactory::IsEmpty() const {
+  return substring_pattern_singletons_.empty() &&
+      regex_pattern_singletons_.empty() &&
+      origin_and_path_regex_pattern_singletons_.empty();
+}
+
+URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
+    URLMatcherCondition::Criterion criterion,
+    const std::string& pattern) {
+  StringPattern search_pattern(pattern, 0);
+  PatternSingletons* pattern_singletons = NULL;
+  if (IsRegexCriterion(criterion))
+    pattern_singletons = &regex_pattern_singletons_;
+  else if (IsOriginAndPathRegexCriterion(criterion))
+    pattern_singletons = &origin_and_path_regex_pattern_singletons_;
+  else
+    pattern_singletons = &substring_pattern_singletons_;
+
+  PatternSingletons::const_iterator iter =
+      pattern_singletons->find(&search_pattern);
+
+  if (iter != pattern_singletons->end()) {
+    return URLMatcherCondition(criterion, *iter);
+  } else {
+    StringPattern* new_pattern =
+        new StringPattern(pattern, id_counter_++);
+    pattern_singletons->insert(new_pattern);
+    return URLMatcherCondition(criterion, new_pattern);
+  }
+}
+
+std::string URLMatcherConditionFactory::CanonicalizeHostname(
+    const std::string& hostname) const {
+  if (!hostname.empty() && hostname[0] == '.')
+    return hostname;
+  else
+    return "." + hostname;
+}
+
+bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
+    StringPattern* lhs,
+    StringPattern* rhs) const {
+  if (lhs == NULL && rhs != NULL) return true;
+  if (lhs != NULL && rhs != NULL)
+    return lhs->pattern() < rhs->pattern();
+  // Either both are NULL or only rhs is NULL.
+  return false;
+}
+
+//
+// URLMatcherSchemeFilter
+//
+
+URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
+    : filters_(1) {
+  filters_.push_back(filter);
+}
+
+URLMatcherSchemeFilter::URLMatcherSchemeFilter(
+    const std::vector<std::string>& filters)
+    : filters_(filters) {}
+
+URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
+
+bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
+  return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
+      filters_.end();
+}
+
+//
+// URLMatcherPortFilter
+//
+
+URLMatcherPortFilter::URLMatcherPortFilter(
+    const std::vector<URLMatcherPortFilter::Range>& ranges)
+    : ranges_(ranges) {}
+
+URLMatcherPortFilter::~URLMatcherPortFilter() {}
+
+bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
+  int port = url.EffectiveIntPort();
+  for (std::vector<Range>::const_iterator i = ranges_.begin();
+       i != ranges_.end(); ++i) {
+    if (i->first <= port && port <= i->second)
+      return true;
+  }
+  return false;
+}
+
+// static
+URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
+                                                              int to) {
+  return Range(from, to);
+}
+
+// static
+URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
+  return Range(port, port);
+}
+
+//
+// URLMatcherConditionSet
+//
+
+URLMatcherConditionSet::~URLMatcherConditionSet() {}
+
+URLMatcherConditionSet::URLMatcherConditionSet(
+    ID id,
+    const Conditions& conditions)
+    : id_(id),
+      conditions_(conditions) {}
+
+URLMatcherConditionSet::URLMatcherConditionSet(
+    ID id,
+    const Conditions& conditions,
+    scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
+    scoped_ptr<URLMatcherPortFilter> port_filter)
+    : id_(id),
+      conditions_(conditions),
+      scheme_filter_(scheme_filter.Pass()),
+      port_filter_(port_filter.Pass()) {}
+
+bool URLMatcherConditionSet::IsMatch(
+    const std::set<StringPattern::ID>& matching_patterns,
+    const GURL& url) const {
+  for (Conditions::const_iterator i = conditions_.begin();
+       i != conditions_.end(); ++i) {
+    if (!i->IsMatch(matching_patterns, url))
+      return false;
+  }
+  if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
+    return false;
+  if (port_filter_.get() && !port_filter_->IsMatch(url))
+    return false;
+  return true;
+}
+
+//
+// URLMatcher
+//
+
+URLMatcher::URLMatcher() {}
+
+URLMatcher::~URLMatcher() {}
+
+void URLMatcher::AddConditionSets(
+    const URLMatcherConditionSet::Vector& condition_sets) {
+  for (URLMatcherConditionSet::Vector::const_iterator i =
+       condition_sets.begin(); i != condition_sets.end(); ++i) {
+    DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
+        url_matcher_condition_sets_.end());
+    url_matcher_condition_sets_[(*i)->id()] = *i;
+  }
+  UpdateInternalDatastructures();
+}
+
+void URLMatcher::RemoveConditionSets(
+    const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
+  for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
+       condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
+    DCHECK(url_matcher_condition_sets_.find(*i) !=
+        url_matcher_condition_sets_.end());
+    url_matcher_condition_sets_.erase(*i);
+  }
+  UpdateInternalDatastructures();
+}
+
+void URLMatcher::ClearUnusedConditionSets() {
+  UpdateConditionFactory();
+}
+
+std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
+    const GURL& url) const {
+  // Find all IDs of StringPatterns that match |url|.
+  // See URLMatcherConditionFactory for the canonicalization of URLs and the
+  // distinction between full url searches and url component searches.
+  std::set<StringPattern::ID> matches;
+  if (!full_url_matcher_.IsEmpty()) {
+    full_url_matcher_.Match(
+        condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
+  }
+  if (!url_component_matcher_.IsEmpty()) {
+    url_component_matcher_.Match(
+        condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
+  }
+  if (!regex_set_matcher_.IsEmpty()) {
+    regex_set_matcher_.Match(
+        condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
+  }
+  if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
+    origin_and_path_regex_set_matcher_.Match(
+        condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
+        &matches);
+  }
+
+  // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
+  // were fulfilled.
+  std::set<URLMatcherConditionSet::ID> result;
+  for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
+       i != matches.end(); ++i) {
+    // For each URLMatcherConditionSet there is exactly one condition
+    // registered in substring_match_triggers_. This means that the following
+    // logic tests each URLMatcherConditionSet exactly once if it can be
+    // completely fulfilled.
+    StringPatternTriggers::const_iterator triggered_condition_sets_iter =
+        substring_match_triggers_.find(*i);
+    if (triggered_condition_sets_iter == substring_match_triggers_.end())
+      continue;  // Not all substring matches are triggers for a condition set.
+    const std::set<URLMatcherConditionSet::ID>& condition_sets =
+        triggered_condition_sets_iter->second;
+    for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
+         condition_sets.begin(); j != condition_sets.end(); ++j) {
+      URLMatcherConditionSets::const_iterator condition_set_iter =
+          url_matcher_condition_sets_.find(*j);
+      DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
+      if (condition_set_iter->second->IsMatch(matches, url))
+        result.insert(*j);
+    }
+  }
+
+  return result;
+}
+
+bool URLMatcher::IsEmpty() const {
+  return condition_factory_.IsEmpty() &&
+      url_matcher_condition_sets_.empty() &&
+      substring_match_triggers_.empty() &&
+      full_url_matcher_.IsEmpty() &&
+      url_component_matcher_.IsEmpty() &&
+      regex_set_matcher_.IsEmpty() &&
+      origin_and_path_regex_set_matcher_.IsEmpty() &&
+      registered_full_url_patterns_.empty() &&
+      registered_url_component_patterns_.empty();
+}
+
+void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
+  // The purpose of |full_url_conditions| is just that we need to execute
+  // the same logic once for Full URL searches and once for URL Component
+  // searches (see URLMatcherConditionFactory).
+
+  // Determine which patterns need to be registered when this function
+  // terminates.
+  std::set<const StringPattern*> new_patterns;
+  for (URLMatcherConditionSets::const_iterator condition_set_iter =
+      url_matcher_condition_sets_.begin();
+      condition_set_iter != url_matcher_condition_sets_.end();
+      ++condition_set_iter) {
+    const URLMatcherConditionSet::Conditions& conditions =
+        condition_set_iter->second->conditions();
+    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
+         conditions.begin(); condition_iter != conditions.end();
+         ++condition_iter) {
+      // If we are called to process Full URL searches, ignore others, and
+      // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
+      if (!condition_iter->IsRegexCondition() &&
+          !condition_iter->IsOriginAndPathRegexCondition() &&
+          full_url_conditions == condition_iter->IsFullURLCondition())
+        new_patterns.insert(condition_iter->string_pattern());
+    }
+  }
+
+  // This is the set of patterns that were registered before this function
+  // is called.
+  std::set<const StringPattern*>& registered_patterns =
+      full_url_conditions ? registered_full_url_patterns_
+                          : registered_url_component_patterns_;
+
+  // Add all patterns that are in new_patterns but not in registered_patterns.
+  std::vector<const StringPattern*> patterns_to_register =
+      base::STLSetDifference<std::vector<const StringPattern*> >(
+          new_patterns, registered_patterns);
+
+  // Remove all patterns that are in registered_patterns but not in
+  // new_patterns.
+  std::vector<const StringPattern*> patterns_to_unregister =
+      base::STLSetDifference<std::vector<const StringPattern*> >(
+           registered_patterns, new_patterns);
+
+  // Update the SubstringSetMatcher.
+  SubstringSetMatcher& url_matcher =
+      full_url_conditions ? full_url_matcher_ : url_component_matcher_;
+  url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
+                                            patterns_to_unregister);
+
+  // Update the set of registered_patterns for the next time this function
+  // is being called.
+  registered_patterns.swap(new_patterns);
+}
+
+void URLMatcher::UpdateRegexSetMatcher() {
+  std::vector<const StringPattern*> new_patterns;
+  std::vector<const StringPattern*> new_origin_and_path_patterns;
+
+  for (URLMatcherConditionSets::const_iterator condition_set_iter =
+      url_matcher_condition_sets_.begin();
+      condition_set_iter != url_matcher_condition_sets_.end();
+      ++condition_set_iter) {
+    const URLMatcherConditionSet::Conditions& conditions =
+        condition_set_iter->second->conditions();
+    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
+         conditions.begin(); condition_iter != conditions.end();
+         ++condition_iter) {
+      if (condition_iter->IsRegexCondition()) {
+        new_patterns.push_back(condition_iter->string_pattern());
+      } else if (condition_iter->IsOriginAndPathRegexCondition()) {
+        new_origin_and_path_patterns.push_back(
+            condition_iter->string_pattern());
+      }
+    }
+  }
+
+  // Start over from scratch. We can't really do better than this, since the
+  // FilteredRE2 backend doesn't support incremental updates.
+  regex_set_matcher_.ClearPatterns();
+  regex_set_matcher_.AddPatterns(new_patterns);
+  origin_and_path_regex_set_matcher_.ClearPatterns();
+  origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
+}
+
+void URLMatcher::UpdateTriggers() {
+  // Count substring pattern frequencies.
+  std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
+  for (URLMatcherConditionSets::const_iterator condition_set_iter =
+      url_matcher_condition_sets_.begin();
+      condition_set_iter != url_matcher_condition_sets_.end();
+      ++condition_set_iter) {
+    const URLMatcherConditionSet::Conditions& conditions =
+        condition_set_iter->second->conditions();
+    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
+         conditions.begin(); condition_iter != conditions.end();
+         ++condition_iter) {
+      const StringPattern* pattern = condition_iter->string_pattern();
+      substring_pattern_frequencies[pattern->id()]++;
+    }
+  }
+
+  // Update trigger conditions: Determine for each URLMatcherConditionSet which
+  // URLMatcherCondition contains a StringPattern that occurs least
+  // frequently in this URLMatcher. We assume that this condition is very
+  // specific and occurs rarely in URLs. If a match occurs for this
+  // URLMatcherCondition, we want to test all other URLMatcherCondition in the
+  // respective URLMatcherConditionSet as well to see whether the entire
+  // URLMatcherConditionSet is considered matching.
+  substring_match_triggers_.clear();
+  for (URLMatcherConditionSets::const_iterator condition_set_iter =
+      url_matcher_condition_sets_.begin();
+      condition_set_iter != url_matcher_condition_sets_.end();
+      ++condition_set_iter) {
+    const URLMatcherConditionSet::Conditions& conditions =
+        condition_set_iter->second->conditions();
+    if (conditions.empty())
+      continue;
+    URLMatcherConditionSet::Conditions::const_iterator condition_iter =
+        conditions.begin();
+    StringPattern::ID trigger = condition_iter->string_pattern()->id();
+    // We skip the first element in the following loop.
+    ++condition_iter;
+    for (; condition_iter != conditions.end(); ++condition_iter) {
+      StringPattern::ID current_id =
+          condition_iter->string_pattern()->id();
+      if (substring_pattern_frequencies[trigger] >
+          substring_pattern_frequencies[current_id]) {
+        trigger = current_id;
+      }
+    }
+    substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
+  }
+}
+
+void URLMatcher::UpdateConditionFactory() {
+  std::set<StringPattern::ID> used_patterns;
+  for (URLMatcherConditionSets::const_iterator condition_set_iter =
+      url_matcher_condition_sets_.begin();
+      condition_set_iter != url_matcher_condition_sets_.end();
+      ++condition_set_iter) {
+    const URLMatcherConditionSet::Conditions& conditions =
+        condition_set_iter->second->conditions();
+    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
+         conditions.begin(); condition_iter != conditions.end();
+         ++condition_iter) {
+      used_patterns.insert(condition_iter->string_pattern()->id());
+    }
+  }
+  condition_factory_.ForgetUnusedPatterns(used_patterns);
+}
+
+void URLMatcher::UpdateInternalDatastructures() {
+  UpdateSubstringSetMatcher(false);
+  UpdateSubstringSetMatcher(true);
+  UpdateRegexSetMatcher();
+  UpdateTriggers();
+  UpdateConditionFactory();
+}
+
+}  // namespace url_matcher
author	joaodasilva@chromium.org <joaodasilva@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-12-13 20:36:53 +0000
committer	joaodasilva@chromium.org <joaodasilva@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-12-13 20:36:53 +0000
commit	716c016d95025a8f4d42baab6639b9dc90498f2d (patch)
tree	9efb703e070ecbfb1b73bfac9b350a3b81af14f6 /components/url_matcher/url_matcher.cc
parent	32c90a98f03fa68da4ba3d97a8e56ca70e92a07d (diff)
download	chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.zip chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.tar.gz chromium_src-716c016d95025a8f4d42baab6639b9dc90498f2d.tar.bz2