/[chrome]/trunk/src/net/base/registry_controlled_domains/registry_controlled_domain.cc
Chromium logo

Contents of /trunk/src/net/base/registry_controlled_domains/registry_controlled_domain.cc

Parent Directory Parent Directory | Revision Log Revision Log


Revision 270039 - (show annotations)
Tue May 13 07:01:34 2014 UTC (3 years, 7 months ago) by [email protected]
File size: 14256 byte(s)
Reduce footprint of registry controlled domain table

The perfect hash table containing all registry controlled domains is
replaced by a compact graph (a dafsa) to reduce binary size and PSS
of the running process. Size of the new structure is about 33kB
compared to 380kB for the perfect hash table.

Patch by Olle Liljenzin <ollel@opera.com>, originally at
https://codereview.chromium.org/197183002/

BUG=370672
R=brettw@chromium.org

Review URL: https://codereview.chromium.org/270363003
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // NB: Modelled after Mozilla's code (originally written by Pamela Greene,
6 // later modified by others), but almost entirely rewritten for Chrome.
7 // (netwerk/dns/src/nsEffectiveTLDService.cpp)
8 /* ***** BEGIN LICENSE BLOCK *****
9 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
10 *
11 * The contents of this file are subject to the Mozilla Public License Version
12 * 1.1 (the "License"); you may not use this file except in compliance with
13 * the License. You may obtain a copy of the License at
14 * http://www.mozilla.org/MPL/
15 *
16 * Software distributed under the License is distributed on an "AS IS" basis,
17 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
18 * for the specific language governing rights and limitations under the
19 * License.
20 *
21 * The Original Code is Mozilla Effective-TLD Service
22 *
23 * The Initial Developer of the Original Code is
24 * Google Inc.
25 * Portions created by the Initial Developer are Copyright (C) 2006
26 * the Initial Developer. All Rights Reserved.
27 *
28 * Contributor(s):
29 * Pamela Greene <[email protected]> (original author)
30 * Daniel Witte <[email protected]>
31 *
32 * Alternatively, the contents of this file may be used under the terms of
33 * either the GNU General Public License Version 2 or later (the "GPL"), or
34 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
35 * in which case the provisions of the GPL or the LGPL are applicable instead
36 * of those above. If you wish to allow use of your version of this file only
37 * under the terms of either the GPL or the LGPL, and not to allow others to
38 * use your version of this file under the terms of the MPL, indicate your
39 * decision by deleting the provisions above and replace them with the notice
40 * and other provisions required by the GPL or the LGPL. If you do not delete
41 * the provisions above, a recipient may use your version of this file under
42 * the terms of any one of the MPL, the GPL or the LGPL.
43 *
44 * ***** END LICENSE BLOCK ***** */
45
46 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
47
48 #include "base/logging.h"
49 #include "base/strings/string_util.h"
50 #include "base/strings/utf_string_conversions.h"
51 #include "net/base/net_module.h"
52 #include "net/base/net_util.h"
53 #include "url/gurl.h"
54 #include "url/url_parse.h"
55
56 namespace net {
57 namespace registry_controlled_domains {
58
59 namespace {
60 #include "net/base/registry_controlled_domains/effective_tld_names-inc.cc"
61
62 // See make_dafsa.py for documentation of the generated dafsa byte array.
63
64 const unsigned char* g_graph = kDafsa;
65 size_t g_graph_length = sizeof(kDafsa);
66
67 const int kNotFound = -1;
68 const int kExceptionRule = 1;
69 const int kWildcardRule = 2;
70 const int kPrivateRule = 4;
71
72 // Read next offset from pos.
73 // Returns true if an offset could be read, false otherwise.
74 bool GetNextOffset(const unsigned char** pos, const unsigned char* end,
75 const unsigned char** offset) {
76 if (*pos == end)
77 return false;
78
79 // When reading an offset the byte array must always contain at least
80 // three more bytes to consume. First the offset to read, then a node
81 // to skip over and finally a destination node. No object can be smaller
82 // than one byte.
83 CHECK_LT(*pos + 2, end);
84 size_t bytes_consumed;
85 switch (**pos & 0x60) {
86 case 0x60: // Read three byte offset
87 *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
88 bytes_consumed = 3;
89 break;
90 case 0x40: // Read two byte offset
91 *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
92 bytes_consumed = 2;
93 break;
94 default:
95 *offset += (*pos)[0] & 0x3F;
96 bytes_consumed = 1;
97 }
98 if ((**pos & 0x80) != 0) {
99 *pos = end;
100 } else {
101 *pos += bytes_consumed;
102 }
103 return true;
104 }
105
106 // Check if byte at offset is last in label.
107 bool IsEOL(const unsigned char* offset, const unsigned char* end) {
108 CHECK_LT(offset, end);
109 return (*offset & 0x80) != 0;
110 }
111
112 // Check if byte at offset matches first character in key.
113 // This version matches characters not last in label.
114 bool IsMatch(const unsigned char* offset, const unsigned char* end,
115 const char* key) {
116 CHECK_LT(offset, end);
117 return *offset == *key;
118 }
119
120 // Check if byte at offset matches first character in key.
121 // This version matches characters last in label.
122 bool IsEndCharMatch(const unsigned char* offset, const unsigned char* end,
123 const char* key) {
124 CHECK_LT(offset, end);
125 return *offset == (*key | 0x80);
126 }
127
128 // Read return value at offset.
129 // Returns true if a return value could be read, false otherwise.
130 bool GetReturnValue(const unsigned char* offset, const unsigned char* end,
131 int* return_value) {
132 CHECK_LT(offset, end);
133 if ((*offset & 0xE0) == 0x80) {
134 *return_value = *offset & 0x0F;
135 return true;
136 }
137 return false;
138 }
139
140 // Lookup a domain key in a byte array generated by make_dafsa.py.
141 // The rule type is returned if key is found, otherwise kNotFound is returned.
142 int LookupString(const unsigned char* graph, size_t length, const char* key,
143 size_t key_length) {
144 const unsigned char* pos = graph;
145 const unsigned char* end = graph + length;
146 const unsigned char* offset = pos;
147 const char* key_end = key + key_length;
148 while (GetNextOffset(&pos, end, &offset)) {
149 // char <char>+ end_char offsets
150 // char <char>+ return value
151 // char end_char offsets
152 // char return value
153 // end_char offsets
154 // return_value
155 bool did_consume = false;
156 if (key != key_end && !IsEOL(offset, end)) {
157 // Leading <char> is not a match. Don't dive into this child
158 if (!IsMatch(offset, end, key))
159 continue;
160 did_consume = true;
161 ++offset;
162 ++key;
163 // Possible matches at this point:
164 // <char>+ end_char offsets
165 // <char>+ return value
166 // end_char offsets
167 // return value
168 // Remove all remaining <char> nodes possible
169 while (!IsEOL(offset, end) && key != key_end) {
170 if (!IsMatch(offset, end, key))
171 return kNotFound;
172 ++key;
173 ++offset;
174 }
175 }
176 // Possible matches at this point:
177 // end_char offsets
178 // return_value
179 // If one or more <char> elements were consumed, a failure
180 // to match is terminal. Otherwise, try the next node.
181 if (key == key_end) {
182 int return_value;
183 if (GetReturnValue(offset, end, &return_value))
184 return return_value;
185 // The DAFSA guarantees that if the first char is a match, all
186 // remaining char elements MUST match if the key is truly present.
187 if (did_consume)
188 return kNotFound;
189 continue;
190 }
191 if (!IsEndCharMatch(offset, end, key)) {
192 if (did_consume)
193 return kNotFound; // Unexpected
194 continue;
195 }
196 ++key;
197 pos = ++offset; // Dive into child
198 }
199 return kNotFound; // No match
200 }
201
202 size_t GetRegistryLengthImpl(
203 const std::string& host,
204 UnknownRegistryFilter unknown_filter,
205 PrivateRegistryFilter private_filter) {
206 DCHECK(!host.empty());
207
208 // Skip leading dots.
209 const size_t host_check_begin = host.find_first_not_of('.');
210 if (host_check_begin == std::string::npos)
211 return 0; // Host is only dots.
212
213 // A single trailing dot isn't relevant in this determination, but does need
214 // to be included in the final returned length.
215 size_t host_check_len = host.length();
216 if (host[host_check_len - 1] == '.') {
217 --host_check_len;
218 DCHECK(host_check_len > 0); // If this weren't true, the host would be ".",
219 // and we'd have already returned above.
220 if (host[host_check_len - 1] == '.')
221 return 0; // Multiple trailing dots.
222 }
223
224 // Walk up the domain tree, most specific to least specific,
225 // looking for matches at each level.
226 size_t prev_start = std::string::npos;
227 size_t curr_start = host_check_begin;
228 size_t next_dot = host.find('.', curr_start);
229 if (next_dot >= host_check_len) // Catches std::string::npos as well.
230 return 0; // This can't have a registry + domain.
231 while (1) {
232 const char* domain_str = host.data() + curr_start;
233 size_t domain_length = host_check_len - curr_start;
234 int type = LookupString(g_graph, g_graph_length, domain_str, domain_length);
235 bool do_check =
236 type != kNotFound && (!(type & kPrivateRule) ||
237 private_filter == INCLUDE_PRIVATE_REGISTRIES);
238
239 // If the apparent match is a private registry and we're not including
240 // those, it can't be an actual match.
241 if (do_check) {
242 // Exception rules override wildcard rules when the domain is an exact
243 // match, but wildcards take precedence when there's a subdomain.
244 if (type & kWildcardRule && (prev_start != std::string::npos)) {
245 // If prev_start == host_check_begin, then the host is the registry
246 // itself, so return 0.
247 return (prev_start == host_check_begin) ? 0
248 : (host.length() - prev_start);
249 }
250
251 if (type & kExceptionRule) {
252 if (next_dot == std::string::npos) {
253 // If we get here, we had an exception rule with no dots (e.g.
254 // "!foo"). This would only be valid if we had a corresponding
255 // wildcard rule, which would have to be "*". But we explicitly
256 // disallow that case, so this kind of rule is invalid.
257 NOTREACHED() << "Invalid exception rule";
258 return 0;
259 }
260 return host.length() - next_dot - 1;
261 }
262
263 // If curr_start == host_check_begin, then the host is the registry
264 // itself, so return 0.
265 return (curr_start == host_check_begin) ? 0
266 : (host.length() - curr_start);
267 }
268
269 if (next_dot >= host_check_len) // Catches std::string::npos as well.
270 break;
271
272 prev_start = curr_start;
273 curr_start = next_dot + 1;
274 next_dot = host.find('.', curr_start);
275 }
276
277 // No rule found in the registry. curr_start now points to the first
278 // character of the last subcomponent of the host, so if we allow unknown
279 // registries, return the length of this subcomponent.
280 return unknown_filter == INCLUDE_UNKNOWN_REGISTRIES ?
281 (host.length() - curr_start) : 0;
282 }
283
284 std::string GetDomainAndRegistryImpl(
285 const std::string& host, PrivateRegistryFilter private_filter) {
286 DCHECK(!host.empty());
287
288 // Find the length of the registry for this host.
289 const size_t registry_length =
290 GetRegistryLengthImpl(host, INCLUDE_UNKNOWN_REGISTRIES, private_filter);
291 if ((registry_length == std::string::npos) || (registry_length == 0))
292 return std::string(); // No registry.
293 // The "2" in this next line is 1 for the dot, plus a 1-char minimum preceding
294 // subcomponent length.
295 DCHECK(host.length() >= 2);
296 if (registry_length > (host.length() - 2)) {
297 NOTREACHED() <<
298 "Host does not have at least one subcomponent before registry!";
299 return std::string();
300 }
301
302 // Move past the dot preceding the registry, and search for the next previous
303 // dot. Return the host from after that dot, or the whole host when there is
304 // no dot.
305 const size_t dot = host.rfind('.', host.length() - registry_length - 2);
306 if (dot == std::string::npos)
307 return host;
308 return host.substr(dot + 1);
309 }
310
311 } // namespace
312
313 std::string GetDomainAndRegistry(
314 const GURL& gurl,
315 PrivateRegistryFilter filter) {
316 const url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
317 if ((host.len <= 0) || gurl.HostIsIPAddress())
318 return std::string();
319 return GetDomainAndRegistryImpl(std::string(
320 gurl.possibly_invalid_spec().data() + host.begin, host.len), filter);
321 }
322
323 std::string GetDomainAndRegistry(
324 const std::string& host,
325 PrivateRegistryFilter filter) {
326 url::CanonHostInfo host_info;
327 const std::string canon_host(CanonicalizeHost(host, &host_info));
328 if (canon_host.empty() || host_info.IsIPAddress())
329 return std::string();
330 return GetDomainAndRegistryImpl(canon_host, filter);
331 }
332
333 bool SameDomainOrHost(
334 const GURL& gurl1,
335 const GURL& gurl2,
336 PrivateRegistryFilter filter) {
337 // See if both URLs have a known domain + registry, and those values are the
338 // same.
339 const std::string domain1(GetDomainAndRegistry(gurl1, filter));
340 const std::string domain2(GetDomainAndRegistry(gurl2, filter));
341 if (!domain1.empty() || !domain2.empty())
342 return domain1 == domain2;
343
344 // No domains. See if the hosts are identical.
345 const url::Component host1 = gurl1.parsed_for_possibly_invalid_spec().host;
346 const url::Component host2 = gurl2.parsed_for_possibly_invalid_spec().host;
347 if ((host1.len <= 0) || (host1.len != host2.len))
348 return false;
349 return !strncmp(gurl1.possibly_invalid_spec().data() + host1.begin,
350 gurl2.possibly_invalid_spec().data() + host2.begin,
351 host1.len);
352 }
353
354 size_t GetRegistryLength(
355 const GURL& gurl,
356 UnknownRegistryFilter unknown_filter,
357 PrivateRegistryFilter private_filter) {
358 const url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
359 if (host.len <= 0)
360 return std::string::npos;
361 if (gurl.HostIsIPAddress())
362 return 0;
363 return GetRegistryLengthImpl(
364 std::string(gurl.possibly_invalid_spec().data() + host.begin, host.len),
365 unknown_filter,
366 private_filter);
367 }
368
369 size_t GetRegistryLength(
370 const std::string& host,
371 UnknownRegistryFilter unknown_filter,
372 PrivateRegistryFilter private_filter) {
373 url::CanonHostInfo host_info;
374 const std::string canon_host(CanonicalizeHost(host, &host_info));
375 if (canon_host.empty())
376 return std::string::npos;
377 if (host_info.IsIPAddress())
378 return 0;
379 return GetRegistryLengthImpl(canon_host, unknown_filter, private_filter);
380 }
381
382 void SetFindDomainGraph() {
383 g_graph = kDafsa;
384 g_graph_length = sizeof(kDafsa);
385 }
386
387 void SetFindDomainGraph(const unsigned char* domains, size_t length) {
388 CHECK(domains);
389 CHECK_NE(length, 0u);
390 g_graph = domains;
391 g_graph_length = length;
392 }
393
394 } // namespace registry_controlled_domains
395 } // namespace net

Properties

Name Value
svn:eol-style LF

Powered by ViewVC 1.1.22 ViewVC Help