1 |
// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 |
// Use of this source code is governed by a BSD-style license that can be |
3 |
// found in the LICENSE file. |
4 |
|
5 |
// NB: Modelled after Mozilla's code (originally written by Pamela Greene, |
6 |
// later modified by others), but almost entirely rewritten for Chrome. |
7 |
// (netwerk/dns/src/nsEffectiveTLDService.cpp) |
8 |
/* ***** BEGIN LICENSE BLOCK ***** |
9 |
* Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
10 |
* |
11 |
* The contents of this file are subject to the Mozilla Public License Version |
12 |
* 1.1 (the "License"); you may not use this file except in compliance with |
13 |
* the License. You may obtain a copy of the License at |
14 |
* http://www.mozilla.org/MPL/ |
15 |
* |
16 |
* Software distributed under the License is distributed on an "AS IS" basis, |
17 |
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
18 |
* for the specific language governing rights and limitations under the |
19 |
* License. |
20 |
* |
21 |
* The Original Code is Mozilla Effective-TLD Service |
22 |
* |
23 |
* The Initial Developer of the Original Code is |
24 |
* Google Inc. |
25 |
* Portions created by the Initial Developer are Copyright (C) 2006 |
26 |
* the Initial Developer. All Rights Reserved. |
27 |
* |
28 |
* Contributor(s): |
29 |
* Pamela Greene <[email protected]> (original author) |
30 |
* Daniel Witte <[email protected]> |
31 |
* |
32 |
* Alternatively, the contents of this file may be used under the terms of |
33 |
* either the GNU General Public License Version 2 or later (the "GPL"), or |
34 |
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
35 |
* in which case the provisions of the GPL or the LGPL are applicable instead |
36 |
* of those above. If you wish to allow use of your version of this file only |
37 |
* under the terms of either the GPL or the LGPL, and not to allow others to |
38 |
* use your version of this file under the terms of the MPL, indicate your |
39 |
* decision by deleting the provisions above and replace them with the notice |
40 |
* and other provisions required by the GPL or the LGPL. If you do not delete |
41 |
* the provisions above, a recipient may use your version of this file under |
42 |
* the terms of any one of the MPL, the GPL or the LGPL. |
43 |
* |
44 |
* ***** END LICENSE BLOCK ***** */ |
45 |
|
46 |
#include "net/base/registry_controlled_domains/registry_controlled_domain.h" |
47 |
|
48 |
#include "base/logging.h" |
49 |
#include "base/strings/string_util.h" |
50 |
#include "base/strings/utf_string_conversions.h" |
51 |
#include "net/base/net_module.h" |
52 |
#include "net/base/net_util.h" |
53 |
#include "url/gurl.h" |
54 |
#include "url/url_parse.h" |
55 |
|
56 |
namespace net { |
57 |
namespace registry_controlled_domains { |
58 |
|
59 |
namespace { |
60 |
#include "net/base/registry_controlled_domains/effective_tld_names-inc.cc" |
61 |
|
62 |
// See make_dafsa.py for documentation of the generated dafsa byte array. |
63 |
|
64 |
const unsigned char* g_graph = kDafsa; |
65 |
size_t g_graph_length = sizeof(kDafsa); |
66 |
|
67 |
const int kNotFound = -1; |
68 |
const int kExceptionRule = 1; |
69 |
const int kWildcardRule = 2; |
70 |
const int kPrivateRule = 4; |
71 |
|
72 |
// Read next offset from pos. |
73 |
// Returns true if an offset could be read, false otherwise. |
74 |
bool GetNextOffset(const unsigned char** pos, const unsigned char* end, |
75 |
const unsigned char** offset) { |
76 |
if (*pos == end) |
77 |
return false; |
78 |
|
79 |
// When reading an offset the byte array must always contain at least |
80 |
// three more bytes to consume. First the offset to read, then a node |
81 |
// to skip over and finally a destination node. No object can be smaller |
82 |
// than one byte. |
83 |
CHECK_LT(*pos + 2, end); |
84 |
size_t bytes_consumed; |
85 |
switch (**pos & 0x60) { |
86 |
case 0x60: // Read three byte offset |
87 |
*offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2]; |
88 |
bytes_consumed = 3; |
89 |
break; |
90 |
case 0x40: // Read two byte offset |
91 |
*offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1]; |
92 |
bytes_consumed = 2; |
93 |
break; |
94 |
default: |
95 |
*offset += (*pos)[0] & 0x3F; |
96 |
bytes_consumed = 1; |
97 |
} |
98 |
if ((**pos & 0x80) != 0) { |
99 |
*pos = end; |
100 |
} else { |
101 |
*pos += bytes_consumed; |
102 |
} |
103 |
return true; |
104 |
} |
105 |
|
106 |
// Check if byte at offset is last in label. |
107 |
bool IsEOL(const unsigned char* offset, const unsigned char* end) { |
108 |
CHECK_LT(offset, end); |
109 |
return (*offset & 0x80) != 0; |
110 |
} |
111 |
|
112 |
// Check if byte at offset matches first character in key. |
113 |
// This version matches characters not last in label. |
114 |
bool IsMatch(const unsigned char* offset, const unsigned char* end, |
115 |
const char* key) { |
116 |
CHECK_LT(offset, end); |
117 |
return *offset == *key; |
118 |
} |
119 |
|
120 |
// Check if byte at offset matches first character in key. |
121 |
// This version matches characters last in label. |
122 |
bool IsEndCharMatch(const unsigned char* offset, const unsigned char* end, |
123 |
const char* key) { |
124 |
CHECK_LT(offset, end); |
125 |
return *offset == (*key | 0x80); |
126 |
} |
127 |
|
128 |
// Read return value at offset. |
129 |
// Returns true if a return value could be read, false otherwise. |
130 |
bool GetReturnValue(const unsigned char* offset, const unsigned char* end, |
131 |
int* return_value) { |
132 |
CHECK_LT(offset, end); |
133 |
if ((*offset & 0xE0) == 0x80) { |
134 |
*return_value = *offset & 0x0F; |
135 |
return true; |
136 |
} |
137 |
return false; |
138 |
} |
139 |
|
140 |
// Lookup a domain key in a byte array generated by make_dafsa.py. |
141 |
// The rule type is returned if key is found, otherwise kNotFound is returned. |
142 |
int LookupString(const unsigned char* graph, size_t length, const char* key, |
143 |
size_t key_length) { |
144 |
const unsigned char* pos = graph; |
145 |
const unsigned char* end = graph + length; |
146 |
const unsigned char* offset = pos; |
147 |
const char* key_end = key + key_length; |
148 |
while (GetNextOffset(&pos, end, &offset)) { |
149 |
// char <char>+ end_char offsets |
150 |
// char <char>+ return value |
151 |
// char end_char offsets |
152 |
// char return value |
153 |
// end_char offsets |
154 |
// return_value |
155 |
bool did_consume = false; |
156 |
if (key != key_end && !IsEOL(offset, end)) { |
157 |
// Leading <char> is not a match. Don't dive into this child |
158 |
if (!IsMatch(offset, end, key)) |
159 |
continue; |
160 |
did_consume = true; |
161 |
++offset; |
162 |
++key; |
163 |
// Possible matches at this point: |
164 |
// <char>+ end_char offsets |
165 |
// <char>+ return value |
166 |
// end_char offsets |
167 |
// return value |
168 |
// Remove all remaining <char> nodes possible |
169 |
while (!IsEOL(offset, end) && key != key_end) { |
170 |
if (!IsMatch(offset, end, key)) |
171 |
return kNotFound; |
172 |
++key; |
173 |
++offset; |
174 |
} |
175 |
} |
176 |
// Possible matches at this point: |
177 |
// end_char offsets |
178 |
// return_value |
179 |
// If one or more <char> elements were consumed, a failure |
180 |
// to match is terminal. Otherwise, try the next node. |
181 |
if (key == key_end) { |
182 |
int return_value; |
183 |
if (GetReturnValue(offset, end, &return_value)) |
184 |
return return_value; |
185 |
// The DAFSA guarantees that if the first char is a match, all |
186 |
// remaining char elements MUST match if the key is truly present. |
187 |
if (did_consume) |
188 |
return kNotFound; |
189 |
continue; |
190 |
} |
191 |
if (!IsEndCharMatch(offset, end, key)) { |
192 |
if (did_consume) |
193 |
return kNotFound; // Unexpected |
194 |
continue; |
195 |
} |
196 |
++key; |
197 |
pos = ++offset; // Dive into child |
198 |
} |
199 |
return kNotFound; // No match |
200 |
} |
201 |
|
202 |
size_t GetRegistryLengthImpl( |
203 |
const std::string& host, |
204 |
UnknownRegistryFilter unknown_filter, |
205 |
PrivateRegistryFilter private_filter) { |
206 |
DCHECK(!host.empty()); |
207 |
|
208 |
// Skip leading dots. |
209 |
const size_t host_check_begin = host.find_first_not_of('.'); |
210 |
if (host_check_begin == std::string::npos) |
211 |
return 0; // Host is only dots. |
212 |
|
213 |
// A single trailing dot isn't relevant in this determination, but does need |
214 |
// to be included in the final returned length. |
215 |
size_t host_check_len = host.length(); |
216 |
if (host[host_check_len - 1] == '.') { |
217 |
--host_check_len; |
218 |
DCHECK(host_check_len > 0); // If this weren't true, the host would be ".", |
219 |
// and we'd have already returned above. |
220 |
if (host[host_check_len - 1] == '.') |
221 |
return 0; // Multiple trailing dots. |
222 |
} |
223 |
|
224 |
// Walk up the domain tree, most specific to least specific, |
225 |
// looking for matches at each level. |
226 |
size_t prev_start = std::string::npos; |
227 |
size_t curr_start = host_check_begin; |
228 |
size_t next_dot = host.find('.', curr_start); |
229 |
if (next_dot >= host_check_len) // Catches std::string::npos as well. |
230 |
return 0; // This can't have a registry + domain. |
231 |
while (1) { |
232 |
const char* domain_str = host.data() + curr_start; |
233 |
size_t domain_length = host_check_len - curr_start; |
234 |
int type = LookupString(g_graph, g_graph_length, domain_str, domain_length); |
235 |
bool do_check = |
236 |
type != kNotFound && (!(type & kPrivateRule) || |
237 |
private_filter == INCLUDE_PRIVATE_REGISTRIES); |
238 |
|
239 |
// If the apparent match is a private registry and we're not including |
240 |
// those, it can't be an actual match. |
241 |
if (do_check) { |
242 |
// Exception rules override wildcard rules when the domain is an exact |
243 |
// match, but wildcards take precedence when there's a subdomain. |
244 |
if (type & kWildcardRule && (prev_start != std::string::npos)) { |
245 |
// If prev_start == host_check_begin, then the host is the registry |
246 |
// itself, so return 0. |
247 |
return (prev_start == host_check_begin) ? 0 |
248 |
: (host.length() - prev_start); |
249 |
} |
250 |
|
251 |
if (type & kExceptionRule) { |
252 |
if (next_dot == std::string::npos) { |
253 |
// If we get here, we had an exception rule with no dots (e.g. |
254 |
// "!foo"). This would only be valid if we had a corresponding |
255 |
// wildcard rule, which would have to be "*". But we explicitly |
256 |
// disallow that case, so this kind of rule is invalid. |
257 |
NOTREACHED() << "Invalid exception rule"; |
258 |
return 0; |
259 |
} |
260 |
return host.length() - next_dot - 1; |
261 |
} |
262 |
|
263 |
// If curr_start == host_check_begin, then the host is the registry |
264 |
// itself, so return 0. |
265 |
return (curr_start == host_check_begin) ? 0 |
266 |
: (host.length() - curr_start); |
267 |
} |
268 |
|
269 |
if (next_dot >= host_check_len) // Catches std::string::npos as well. |
270 |
break; |
271 |
|
272 |
prev_start = curr_start; |
273 |
curr_start = next_dot + 1; |
274 |
next_dot = host.find('.', curr_start); |
275 |
} |
276 |
|
277 |
// No rule found in the registry. curr_start now points to the first |
278 |
// character of the last subcomponent of the host, so if we allow unknown |
279 |
// registries, return the length of this subcomponent. |
280 |
return unknown_filter == INCLUDE_UNKNOWN_REGISTRIES ? |
281 |
(host.length() - curr_start) : 0; |
282 |
} |
283 |
|
284 |
std::string GetDomainAndRegistryImpl( |
285 |
const std::string& host, PrivateRegistryFilter private_filter) { |
286 |
DCHECK(!host.empty()); |
287 |
|
288 |
// Find the length of the registry for this host. |
289 |
const size_t registry_length = |
290 |
GetRegistryLengthImpl(host, INCLUDE_UNKNOWN_REGISTRIES, private_filter); |
291 |
if ((registry_length == std::string::npos) || (registry_length == 0)) |
292 |
return std::string(); // No registry. |
293 |
// The "2" in this next line is 1 for the dot, plus a 1-char minimum preceding |
294 |
// subcomponent length. |
295 |
DCHECK(host.length() >= 2); |
296 |
if (registry_length > (host.length() - 2)) { |
297 |
NOTREACHED() << |
298 |
"Host does not have at least one subcomponent before registry!"; |
299 |
return std::string(); |
300 |
} |
301 |
|
302 |
// Move past the dot preceding the registry, and search for the next previous |
303 |
// dot. Return the host from after that dot, or the whole host when there is |
304 |
// no dot. |
305 |
const size_t dot = host.rfind('.', host.length() - registry_length - 2); |
306 |
if (dot == std::string::npos) |
307 |
return host; |
308 |
return host.substr(dot + 1); |
309 |
} |
310 |
|
311 |
} // namespace |
312 |
|
313 |
std::string GetDomainAndRegistry( |
314 |
const GURL& gurl, |
315 |
PrivateRegistryFilter filter) { |
316 |
const url::Component host = gurl.parsed_for_possibly_invalid_spec().host; |
317 |
if ((host.len <= 0) || gurl.HostIsIPAddress()) |
318 |
return std::string(); |
319 |
return GetDomainAndRegistryImpl(std::string( |
320 |
gurl.possibly_invalid_spec().data() + host.begin, host.len), filter); |
321 |
} |
322 |
|
323 |
std::string GetDomainAndRegistry( |
324 |
const std::string& host, |
325 |
PrivateRegistryFilter filter) { |
326 |
url::CanonHostInfo host_info; |
327 |
const std::string canon_host(CanonicalizeHost(host, &host_info)); |
328 |
if (canon_host.empty() || host_info.IsIPAddress()) |
329 |
return std::string(); |
330 |
return GetDomainAndRegistryImpl(canon_host, filter); |
331 |
} |
332 |
|
333 |
bool SameDomainOrHost( |
334 |
const GURL& gurl1, |
335 |
const GURL& gurl2, |
336 |
PrivateRegistryFilter filter) { |
337 |
// See if both URLs have a known domain + registry, and those values are the |
338 |
// same. |
339 |
const std::string domain1(GetDomainAndRegistry(gurl1, filter)); |
340 |
const std::string domain2(GetDomainAndRegistry(gurl2, filter)); |
341 |
if (!domain1.empty() || !domain2.empty()) |
342 |
return domain1 == domain2; |
343 |
|
344 |
// No domains. See if the hosts are identical. |
345 |
const url::Component host1 = gurl1.parsed_for_possibly_invalid_spec().host; |
346 |
const url::Component host2 = gurl2.parsed_for_possibly_invalid_spec().host; |
347 |
if ((host1.len <= 0) || (host1.len != host2.len)) |
348 |
return false; |
349 |
return !strncmp(gurl1.possibly_invalid_spec().data() + host1.begin, |
350 |
gurl2.possibly_invalid_spec().data() + host2.begin, |
351 |
host1.len); |
352 |
} |
353 |
|
354 |
size_t GetRegistryLength( |
355 |
const GURL& gurl, |
356 |
UnknownRegistryFilter unknown_filter, |
357 |
PrivateRegistryFilter private_filter) { |
358 |
const url::Component host = gurl.parsed_for_possibly_invalid_spec().host; |
359 |
if (host.len <= 0) |
360 |
return std::string::npos; |
361 |
if (gurl.HostIsIPAddress()) |
362 |
return 0; |
363 |
return GetRegistryLengthImpl( |
364 |
std::string(gurl.possibly_invalid_spec().data() + host.begin, host.len), |
365 |
unknown_filter, |
366 |
private_filter); |
367 |
} |
368 |
|
369 |
size_t GetRegistryLength( |
370 |
const std::string& host, |
371 |
UnknownRegistryFilter unknown_filter, |
372 |
PrivateRegistryFilter private_filter) { |
373 |
url::CanonHostInfo host_info; |
374 |
const std::string canon_host(CanonicalizeHost(host, &host_info)); |
375 |
if (canon_host.empty()) |
376 |
return std::string::npos; |
377 |
if (host_info.IsIPAddress()) |
378 |
return 0; |
379 |
return GetRegistryLengthImpl(canon_host, unknown_filter, private_filter); |
380 |
} |
381 |
|
382 |
void SetFindDomainGraph() { |
383 |
g_graph = kDafsa; |
384 |
g_graph_length = sizeof(kDafsa); |
385 |
} |
386 |
|
387 |
void SetFindDomainGraph(const unsigned char* domains, size_t length) { |
388 |
CHECK(domains); |
389 |
CHECK_NE(length, 0u); |
390 |
g_graph = domains; |
391 |
g_graph_length = length; |
392 |
} |
393 |
|
394 |
} // namespace registry_controlled_domains |
395 |
} // namespace net |