summaryrefslogtreecommitdiffstats
path: root/components/url_matcher/regex_set_matcher.h
blob: e4eb1dc052f1e0bb14be7371f8992a4d7abc02da (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_
#define COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_

#include <map>
#include <set>
#include <string>
#include <vector>

#include "base/memory/scoped_ptr.h"
#include "components/url_matcher/string_pattern.h"
#include "components/url_matcher/substring_set_matcher.h"
#include "components/url_matcher/url_matcher_export.h"

namespace re2 {
class FilteredRE2;
}

namespace url_matcher {

// Efficiently matches URLs against a collection of regular expressions,
// using FilteredRE2 to reduce the number of regexes that must be matched
// by pre-filtering with substring matching. See:
// http://swtch.com/~rsc/regexp/regexp3.html#analysis
class URL_MATCHER_EXPORT RegexSetMatcher {
 public:
  RegexSetMatcher();
  virtual ~RegexSetMatcher();

  // Adds the regex patterns in |regex_list| to the matcher. Also rebuilds
  // the FilteredRE2 matcher; thus, for efficiency, prefer adding multiple
  // patterns at once.
  // Ownership of the patterns remains with the caller.
  void AddPatterns(const std::vector<const StringPattern*>& regex_list);

  // Removes all regex patterns.
  void ClearPatterns();

  // Appends the IDs of regular expressions in our set that match the |text|
  // to |matches|.
  bool Match(const std::string& text,
             std::set<StringPattern::ID>* matches) const;

  bool IsEmpty() const;

 private:
  typedef int RE2ID;
  typedef std::map<StringPattern::ID, const StringPattern*> RegexMap;
  typedef std::vector<StringPattern::ID> RE2IDMap;

  // Use Aho-Corasick SubstringSetMatcher to find which literal patterns
  // match the |text|.
  std::vector<RE2ID> FindSubstringMatches(const std::string& text) const;

  // Rebuild FilteredRE2 from scratch. Needs to be called whenever
  // our set of regexes changes.
  // TODO(yoz): investigate if it could be done incrementally;
  // apparently not supported by FilteredRE2.
  void RebuildMatcher();

  // Clean up StringPatterns in |substring_patterns_|.
  void DeleteSubstringPatterns();

  // Mapping of regex StringPattern::IDs to regexes.
  RegexMap regexes_;
  // Mapping of RE2IDs from FilteredRE2 (which are assigned in order)
  // to regex StringPattern::IDs.
  RE2IDMap re2_id_map_;

  scoped_ptr<re2::FilteredRE2> filtered_re2_;
  scoped_ptr<SubstringSetMatcher> substring_matcher_;

  // The substring patterns from FilteredRE2, which are used in
  // |substring_matcher_| but whose lifetime is managed here.
  std::vector<const StringPattern*> substring_patterns_;
};

}  // namespace url_matcher

#endif  // COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_