net/base/registry_controlled_domain.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306

//* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla TLD Service
 *
 * The Initial Developer of the Original Code is
 * Google Inc.
 * Portions created by the Initial Developer are Copyright (C) 2006
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Pamela Greene <pamg.bugs@gmail.com> (original author)
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

// NB: Modelled after Mozilla's code (originally written by Pamela Greene,
// later modified by others), but almost entirely rewritten for Chrome.

/*
  (Documentation based on the Mozilla documentation currently at
  http://wiki.mozilla.org/Gecko:Effective_TLD_Service, written by the same
  author.)

  The RegistryControlledDomainService examines the hostname of a GURL passed to
  it and determines the longest portion that is controlled by a registrar.
  Although technically the top-level domain (TLD) for a hostname is the last
  dot-portion of the name (such as .com or .org), many domains (such as co.uk)
  function as though they were TLDs, allocating any number of more specific,
  essentially unrelated names beneath them.  For example, .uk is a TLD, but
  nobody is allowed to register a domain directly under .uk; the "effective"
  TLDs are ac.uk, co.uk, and so on.  We wouldn't want to allow any site in
  *.co.uk to set a cookie for the entire co.uk domain, so it's important to be
  able to identify which higher-level domains function as effective TLDs and
  which can be registered.

  The service obtains its information about effective TLDs from a text resource
  that must be in the following format:

  * It should use plain ASCII.
  * It should contain one domain rule per line, terminated with \n, with nothing
    else on the line.  (The last rule in the file may omit the ending \n.)
  * Rules should have been normalized using the same canonicalization that GURL
    applies.  For ASCII, that means they're not case-sensitive, among other
    things; other normalizations are applied for other characters.
  * Each rule should list the entire TLD-like domain name, with any subdomain
    portions separated by dots (.) as usual.
  * Rules should neither begin nor end with a dot.
  * If a hostname matches more than one rule, the most specific rule (that is,
    the one with more dot-levels) will be used.
  * Other than in the case of wildcards (see below), rules do not implicitly
    include their subcomponents.  For example, "bar.baz.uk" does not imply
    "baz.uk", and if "bar.baz.uk" is the only rule in the list, "foo.bar.baz.uk"
    will match, but "baz.uk" and "qux.baz.uk" won't.
  * The wildcard character '*' will match any valid sequence of characters.
  * Wildcards may only appear as the entire most specific level of a rule.  That
    is, a wildcard must come at the beginning of a line and must be followed by
    a dot.  (You may not use a wildcard as the entire rule.)
  * A wildcard rule implies a rule for the entire non-wildcard portion.  For
    example, the rule "*.foo.bar" implies the rule "foo.bar" (but not the rule
    "bar").  This is typically important in the case of exceptions (see below).
  * The exception character '!' before a rule marks an exception to a wildcard
    rule.  If your rules are "*.tokyo.jp" and "!pref.tokyo.jp", then
    "a.b.tokyo.jp" has an effective TLD of "b.tokyo.jp", but "a.pref.tokyo.jp"
    has an effective TLD of "tokyo.jp" (the exception prevents the wildcard
    match, and we thus fall through to matching on the implied "tokyo.jp" rule
    from the wildcard).
  * If you use an exception rule without a corresponding wildcard rule, the
    behavior is undefined.

  Firefox has a very similar service, and it's their data file we use to
  construct our resource.  However, the data expected by this implementation
  differs from the Mozilla file in several important ways:
   (1) We require that all single-level TLDs (com, edu, etc.) be explicitly
       listed.  As of this writing, Mozilla's file includes the single-level
       TLDs too, but that might change.
   (2) Our data is expected be in pure ASCII: all UTF-8 or otherwise encoded
       items must already have been normalized.
   (3) We do not allow comments, rule notes, blank lines, or line endings other
       than LF.
  Rules are also expected to be syntactically valid.

  The utility application tld_cleanup.exe converts a Mozilla-style file into a
  Chrome one, making sure that single-level TLDs are explicitly listed, using
  GURL to normalize rules, and validating the rules.
*/

#ifndef NET_BASE_REGISTRY_CONTROLLED_DOMAIN_H_
#define NET_BASE_REGISTRY_CONTROLLED_DOMAIN_H_

#include <map>
#include <string>

#include "base/basictypes.h"

class GURL;

namespace net {

struct RegistryControlledDomainServiceSingletonTraits;

// This class is a singleton.
class RegistryControlledDomainService {
 public:
   ~RegistryControlledDomainService() { }

  // Returns the registered, organization-identifying host and all its registry
  // information, but no subdomains, from the given GURL.  Returns an empty
  // string if the GURL is invalid, has no host (e.g. a file: URL), has multiple
  // trailing dots, is an IP address, has only one subcomponent (i.e. no dots
  // other than leading/trailing ones), or is itself a recognized registry
  // identifier.  If no matching rule is found in the effective-TLD data (or in
  // the default data, if the resource failed to load), the last subcomponent of
  // the host is assumed to be the registry.
  //
  // Examples:
  //   http://www.google.com/file.html -> "google.com"  (com)
  //   http://..google.com/file.html   -> "google.com"  (com)
  //   http://google.com./file.html    -> "google.com." (com)
  //   http://a.b.co.uk/file.html      -> "b.co.uk"     (co.uk)
  //   file:///C:/bar.html             -> ""            (no host)
  //   http://foo.com../file.html      -> ""            (multiple trailing dots)
  //   http://192.168.0.1/file.html    -> ""            (IP address)
  //   http://bar/file.html            -> ""            (no subcomponents)
  //   http://co.uk/file.html          -> ""            (host is a registry)
  //   http://foo.bar/file.html        -> "foo.bar"     (no rule; assume bar)
  static std::string GetDomainAndRegistry(const GURL& gurl);

  // Like the GURL version, but takes a host (which is canonicalized internally)
  // instead of a full GURL.
  static std::string GetDomainAndRegistry(const std::string& host);
  static std::string GetDomainAndRegistry(const std::wstring& host);

  // This convenience function returns true if the two GURLs both have hosts
  // and one of the following is true:
  // * They each have a known domain and registry, and it is the same for both
  //   URLs.  Note that this means the trailing dot, if any, must match too.
  // * They don't have known domains/registries, but the hosts are identical.
  // Effectively, callers can use this function to check whether the input URLs
  // represent hosts "on the same site".
  static bool SameDomainOrHost(const GURL& gurl1, const GURL& gurl2);

  // Finds the length in bytes of the registrar portion of the host in the
  // given GURL.  Returns std::string::npos if the GURL is invalid or has no
  // host (e.g. a file: URL).  Returns 0 if the GURL has multiple trailing dots,
  // is an IP address, has no subcomponents, or is itself a recognized registry
  // identifier.  If no matching rule is found in the effective-TLD data (or in
  // the default data, if the resource failed to load), returns 0 if
  // |allow_unknown_registries| is false, or the length of the last subcomponent
  // if |allow_unknown_registries| is true.
  //
  // Examples:
  //   http://www.google.com/file.html -> 3                 (com)
  //   http://..google.com/file.html   -> 3                 (com)
  //   http://google.com./file.html    -> 4                 (com)
  //   http://a.b.co.uk/file.html      -> 5                 (co.uk)
  //   file:///C:/bar.html             -> std::string::npos (no host)
  //   http://foo.com../file.html      -> 0                 (multiple trailing
  //                                                         dots)
  //   http://192.168.0.1/file.html    -> 0                 (IP address)
  //   http://bar/file.html            -> 0                 (no subcomponents)
  //   http://co.uk/file.html          -> 0                 (host is a registry)
  //   http://foo.bar/file.html        -> 0 or 3, depending (no rule; assume
  //                                                         bar)
  static size_t GetRegistryLength(const GURL& gurl,
                                  bool allow_unknown_registries);

  // Like the GURL version, but takes a host (which is canonicalized internally)
  // instead of a full GURL.
  static size_t GetRegistryLength(const std::string& host,
                                  bool allow_unknown_registries);
  static size_t GetRegistryLength(const std::wstring& host,
                                  bool allow_unknown_registries);

 protected:
  // The entire protected API is only for unit testing.  I mean it.  Don't make
  // me come over there!
   RegistryControlledDomainService() { }

  // Set the RegistryControledDomainService instance to be used internally.
  // |instance| will supersede the singleton instance normally used.  If
  // |instance| is NULL, normal behavior is restored, and internal operations
  // will return to using the singleton.  This function always returns the
  // instance set by the most recent call to SetInstance.
  static RegistryControlledDomainService* SetInstance(
      RegistryControlledDomainService* instance);

  // Sets the domain_data_ of the current instance (creating one, if necessary),
  // then parses it.
  static void UseDomainData(const std::string& data);

 private:
  // To allow construction of the internal singleton instance.
  friend struct RegistryControlledDomainServiceSingletonTraits;

  // Using the StringSegment class, we can compare portions of strings without
  // needing to allocate or copy them.
  class StringSegment {
   public:
    StringSegment() : data_(0), begin_(0), len_(0) { }
    ~StringSegment() { }

    void Set(const char* data, size_t begin, size_t len) {
      data_ = data;
      begin_ = begin;
      len_ = len;
    }

    // Returns the character at the given offset from the start of the segment,
    // or '\0' if the offset lies outside the segment.
    char CharAt(size_t offset) const {
      return (offset < len_) ? data_[begin_ + offset] : '\0';
    }

    // Removes a maximum of |trimmed| number of characters, up to the length of
    // the segment, from the start of the StringSegment.
    void TrimFromStart(size_t trimmed) {
      if (trimmed > len_)
        trimmed = len_;
      begin_ += trimmed;
      len_ -= trimmed;
    }

    const char* data() const { return data_; }

    // This comparator is needed by std::map.  Note that since we don't care
    // about the exact sorting, we use a somewhat less intuitive, but efficient,
    // comparison.
    bool operator<(const StringSegment& other) const;

   private:
    const char* data_;
    size_t begin_;
    size_t len_;
  };

  // The full domain rule data, loaded from a resource or set by a unit test.
  std::string domain_data_;

  // An entry in the map of domain specifications, describing the properties
  // that apply to that domain rule.
  struct DomainEntry {
    DomainEntry() : exception(false), wildcard(false) { }
    bool exception;
    bool wildcard;
  };
  typedef std::map<StringSegment, DomainEntry> DomainMap;

  // A map from a StringSegment holding a domain name (rule) to its DomainEntry.
  // The StringSegments in the domain_map_ hold pointers to the domain_data_
  // data; that's cheaper than copying the string data itself.
  // TODO(pamg): Since all the domain_map_ entries have the same data_, it's
  // redundant.  Is it worth subclassing StringSegment to avoid that?
  DomainMap domain_map_;

  // Parses a list of effective-TLD rules, building the domain_map_.  Rules are
  // assumed to be syntactically valid.
  void ParseDomainData();

  // Returns the singleton instance, after attempting to initialize it.
  // NOTE that if the effective-TLD data resource can't be found, the instance
  // will be initialized and continue operation with an empty domain_map_.
  static RegistryControlledDomainService* GetInstance();

  // Loads and parses the effective-TLD data resource.
  void Init();

  // Adds one rule, assumed to be valid, to the domain_map_.
  // WARNING: As implied by the non-const status of the incoming rule, this
  // method may MODIFY that rule (in particular, change its start and length).
  // This is a performance optimization.
  void AddRule(StringSegment* rule);

  // Internal workings of the static public methods.  See above.
  static std::string GetDomainAndRegistryImpl(const std::string& host);
  size_t GetRegistryLengthImpl(const std::string& host,
                               bool allow_unknown_registries);

  DISALLOW_COPY_AND_ASSIGN(RegistryControlledDomainService);
};

}  // namespace net

#endif  // NET_BASE_REGISTRY_CONTROLLED_DOMAIN_H_