summaryrefslogtreecommitdiffstats
path: root/tools/gn/pattern.cc
blob: e4ac8df160b450ee0247ec0d4e3b2044a798b6a4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "tools/gn/pattern.h"

#include "tools/gn/value.h"

const char kPattern_Help[] =
    "Patterns\n"
    "  Patterns are VERY limited regular expressions that are used in\n"
    "  several places.\n"
    "\n"
    "  Patterns must match the entire input string to be counted as a match.\n"
    "  In regular expression parlance, there is an implicit \"^...$\"\n"
    "  surrounding your input. If you want to match a substring, you need to\n"
    "  use wildcards at the beginning and end.\n"
    "\n"
    "  There are only two special tokens understood by the pattern matcher.\n"
    "  Everything else is a literal.\n"
    "\n"
    "   * Matches zero or more of any character. It does not depend on the\n"
    "     preceding character (in regular expression parlance it is\n"
    "     equivalent to \".*\").\n"
    "\n"
    "  \\b Matches a path boundary. This will match the beginning or end of\n"
    "     a string, or a slash.\n"
    "\n"
    "Examples\n"
    "\n"
    "  \"*asdf*\"\n"
    "      Matches a string containing \"asdf\" anywhere.\n"
    "\n"
    "  \"asdf\"\n"
    "      Matches only the exact string \"asdf\".\n"
    "\n"
    "  \"*.cc\"\n"
    "      Matches strings ending in the literal \".cc\".\n"
    "\n"
    "  \"\\bwin/*\"\n"
    "      Matches \"win/foo\" and \"foo/win/bar.cc\" but not \"iwin/foo\".\n";

namespace {

void ParsePattern(const std::string& s, std::vector<Pattern::Subrange>* out) {
  // Set when the last subrange is a literal so we can just append when we
  // find another literal.
  Pattern::Subrange* last_literal = NULL;

  for (size_t i = 0; i < s.size(); i++) {
    if (s[i] == '*') {
      // Don't allow two **.
      if (out->size() == 0 ||
          (*out)[out->size() - 1].type != Pattern::Subrange::ANYTHING)
        out->push_back(Pattern::Subrange(Pattern::Subrange::ANYTHING));
      last_literal = NULL;
    } else if (s[i] == '\\') {
      if (i < s.size() - 1 && s[i + 1] == 'b') {
        // "\b" means path boundary.
        i++;
        out->push_back(Pattern::Subrange(Pattern::Subrange::PATH_BOUNDARY));
        last_literal = NULL;
      } else {
        // Backslash + anything else means that literal char.
        if (!last_literal) {
          out->push_back(Pattern::Subrange(Pattern::Subrange::LITERAL));
          last_literal = &(*out)[out->size() - 1];
        }
        if (i < s.size() - 1) {
          i++;
          last_literal->literal.push_back(s[i]);
        } else {
          // Single backslash at end, use literal backslash.
          last_literal->literal.push_back('\\');
        }
      }
    } else {
      if (!last_literal) {
        out->push_back(Pattern::Subrange(Pattern::Subrange::LITERAL));
        last_literal = &(*out)[out->size() - 1];
      }
      last_literal->literal.push_back(s[i]);
    }
  }
}

}  // namespace

Pattern::Pattern(const std::string& s) {
  ParsePattern(s, &subranges_);
  is_suffix_ =
      (subranges_.size() == 2 &&
       subranges_[0].type == Subrange::ANYTHING &&
       subranges_[1].type == Subrange::LITERAL);
}

Pattern::~Pattern() {
}

bool Pattern::MatchesString(const std::string& s) const {
  // Empty pattern matches only empty string.
  if (subranges_.empty())
    return s.empty();

  if (is_suffix_) {
    const std::string& suffix = subranges_[1].literal;
    if (suffix.size() > s.size())
      return false;  // Too short.
    return s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
  }

  return RecursiveMatch(s, 0, 0, true);
}

// We assume the number of ranges is small so recursive is always reasonable.
// Could be optimized to only be recursive for *.
bool Pattern::RecursiveMatch(const std::string& s,
                             size_t begin_char,
                             size_t subrange_index,
                             bool allow_implicit_path_boundary) const {
  if (subrange_index >= subranges_.size()) {
    // Hit the end of our subranges, the text should also be at the end for a
    // match.
    return begin_char == s.size();
  }

  const Subrange& sr = subranges_[subrange_index];
  switch (sr.type) {
    case Subrange::LITERAL: {
      if (s.size() - begin_char < sr.literal.size())
        return false;  // Not enough room.
      if (s.compare(begin_char, sr.literal.size(), sr.literal) != 0)
        return false;  // Literal doesn't match.

      // Recursively check the next one.
      return RecursiveMatch(s, begin_char + sr.literal.size(),
                            subrange_index + 1, true);
    }

    case Subrange::PATH_BOUNDARY: {
      // When we can accept an implicit path boundary, we have to check both
      // a match of the literal and the implicit one.
      if (allow_implicit_path_boundary &&
          (begin_char == 0 || begin_char == s.size())) {
        // At implicit path boundary, see if the rest of the pattern matches.
        if (RecursiveMatch(s, begin_char, subrange_index + 1, false))
          return true;
      }

      // Check for a literal "/".
      if (begin_char < s.size() && s[begin_char] == '/') {
        // At explicit boundary, see if the rest of the pattern matches.
        if (RecursiveMatch(s, begin_char + 1, subrange_index + 1, true))
          return true;
      }
      return false;
    }

    case Subrange::ANYTHING: {
      if (subrange_index == subranges_.size() - 1)
        return true;  // * at the end, consider it matching.

      size_t min_next_size = sr.MinSize();

      // We don't care about exactly what matched as long as there was a match,
      // so we can do this front-to-back. If we needed the match, we would
      // normally want "*" to be greedy so would work backwards.
      for (size_t i = begin_char; i < s.size() - min_next_size; i++) {
        // Note: this could probably be faster by detecting the type of the
        // next match in advance and checking for a match in this loop rather
        // than doing a full recursive call for each character.
        if (RecursiveMatch(s, i, subrange_index + 1, true))
          return true;
      }
      return false;
    }

    default:
      NOTREACHED();
  }

  return false;
}

PatternList::PatternList() {
}

PatternList::~PatternList() {
}

void PatternList::Append(const Pattern& pattern) {
  patterns_.push_back(pattern);
}

void PatternList::SetFromValue(const Value& v, Err* err) {
  patterns_.clear();

  if (v.type() != Value::LIST) {
    *err = Err(v.origin(), "This value must be a list.");
    return;
  }

  const std::vector<Value>& list = v.list_value();
  for (size_t i = 0; i < list.size(); i++) {
    if (!list[i].VerifyTypeIs(Value::STRING, err))
      return;
    patterns_.push_back(Pattern(list[i].string_value()));
  }
}

bool PatternList::MatchesString(const std::string& s) const {
  for (size_t i = 0; i < patterns_.size(); i++) {
    if (patterns_[i].MatchesString(s))
      return true;
  }
  return false;
}

bool PatternList::MatchesValue(const Value& v) const {
  if (v.type() == Value::STRING)
    return MatchesString(v.string_value());
  return false;
}