summaryrefslogtreecommitdiffstats
path: root/components/url_matcher/substring_set_matcher.h
blob: 8a05ef30f26e4c88975644b8b0cb8ba3affcbdc2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
#define COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_

#include <limits>
#include <map>
#include <set>
#include <string>
#include <vector>

#include "base/basictypes.h"
#include "components/url_matcher/string_pattern.h"
#include "components/url_matcher/url_matcher_export.h"

namespace url_matcher {

// Class that store a set of string patterns and can find for a string S,
// which string patterns occur in S.
class URL_MATCHER_EXPORT SubstringSetMatcher {
 public:
  SubstringSetMatcher();
  ~SubstringSetMatcher();

  // Registers all |patterns|. The ownership remains with the caller.
  // The same pattern cannot be registered twice and each pattern needs to have
  // a unique ID.
  // Ownership of the patterns remains with the caller.
  void RegisterPatterns(const std::vector<const StringPattern*>& patterns);

  // Unregisters the passed |patterns|.
  void UnregisterPatterns(const std::vector<const StringPattern*>& patterns);

  // Analogous to RegisterPatterns and UnregisterPatterns but executes both
  // operations in one step, which is cheaper in the execution.
  void RegisterAndUnregisterPatterns(
      const std::vector<const StringPattern*>& to_register,
      const std::vector<const StringPattern*>& to_unregister);

  // Matches |text| against all registered StringPatterns. Stores the IDs
  // of matching patterns in |matches|. |matches| is not cleared before adding
  // to it.
  bool Match(const std::string& text,
             std::set<StringPattern::ID>* matches) const;

  // Returns true if this object retains no allocated data. Only for debugging.
  bool IsEmpty() const;

 private:
  // A node of an Aho Corasick Tree. This is implemented according to
  // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
  //
  // The algorithm is based on the idea of building a trie of all registered
  // patterns. Each node of the tree is annotated with a set of pattern
  // IDs that are used to report matches.
  //
  // The root of the trie represents an empty match. If we were looking whether
  // any registered pattern matches a text at the beginning of the text (i.e.
  // whether any pattern is a prefix of the text), we could just follow
  // nodes in the trie according to the matching characters in the text.
  // E.g., if text == "foobar", we would follow the trie from the root node
  // to its child labeled 'f', from there to child 'o', etc. In this process we
  // would report all pattern IDs associated with the trie nodes as matches.
  //
  // As we are not looking for all prefix matches but all substring matches,
  // this algorithm would need to compare text.substr(0), text.substr(1), ...
  // against the trie, which is in O(|text|^2).
  //
  // The Aho Corasick algorithm improves this runtime by using failure edges.
  // In case we have found a partial match of length k in the text
  // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
  // a node at depth k, but cannot find a match in the trie for character
  // text[i + k] at depth k + 1, we follow a failure edge. This edge
  // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
  // is a prefix of any registered pattern.
  //
  // If your brain thinks "Forget it, let's go shopping.", don't worry.
  // Take a nap and read an introductory text on the Aho Corasick algorithm.
  // It will make sense. Eventually.
  class AhoCorasickNode {
   public:
    // Key: label of the edge, value: node index in |tree_| of parent class.
    typedef std::map<char, uint32> Edges;
    typedef std::set<StringPattern::ID> Matches;

    static const uint32 kNoSuchEdge;  // Represents an invalid node index.

    AhoCorasickNode();
    ~AhoCorasickNode();
    AhoCorasickNode(const AhoCorasickNode& other);
    AhoCorasickNode& operator=(const AhoCorasickNode& other);

    uint32 GetEdge(char c) const;
    void SetEdge(char c, uint32 node);
    const Edges& edges() const { return edges_; }

    uint32 failure() const { return failure_; }
    void set_failure(uint32 failure) { failure_ = failure; }

    void AddMatch(StringPattern::ID id);
    void AddMatches(const Matches& matches);
    const Matches& matches() const { return matches_; }

   private:
    // Outgoing edges of current node.
    Edges edges_;

    // Node index that failure edge leads to.
    uint32 failure_;

    // Identifiers of matches.
    Matches matches_;
  };

  typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap;
  typedef std::vector<const StringPattern*> SubstringPatternVector;

  // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string.
  void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns);

  // Inserts a path for |pattern->pattern()| into the tree and adds
  // |pattern->id()| to the set of matches. Ownership of |pattern| remains with
  // the caller.
  void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern);
  void CreateFailureEdges();

  // Set of all registered StringPatterns. Used to regenerate the
  // Aho-Corasick tree in case patterns are registered or unregistered.
  SubstringPatternMap patterns_;

  // The nodes of a Aho-Corasick tree.
  std::vector<AhoCorasickNode> tree_;

  DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher);
};

}  // namespace url_matcher

#endif  // COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_