// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
// Canonicalizers for random bits that aren't big enough for their own files.
6 
7 #include <string.h>
8 
9 #include "url/url_canon.h"
10 #include "url/url_canon_internal.h"
11 
12 namespace url {
13 
14 namespace {
15 
// Returns true if the given character should be removed from the middle of a
// URL. Only carriage return, line feed and horizontal tab qualify; spaces and
// every other character are kept.
inline bool IsRemovableURLWhitespace(int ch) {
  return ch == '\r' || ch == '\n' || ch == '\t';
}
21 
22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
23 // It sucks that we have to do this, since this takes about 13% of the total URL
24 // canonicalization time.
25 template<typename CHAR>
DoRemoveURLWhitespace(const CHAR * input,int input_len,CanonOutputT<CHAR> * buffer,int * output_len)26 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
27                                   CanonOutputT<CHAR>* buffer,
28                                   int* output_len) {
29   // Fast verification that there's nothing that needs removal. This is the 99%
30   // case, so we want it to be fast and don't care about impacting the speed
31   // when we do find whitespace.
32   int found_whitespace = false;
33   for (int i = 0; i < input_len; i++) {
34     if (!IsRemovableURLWhitespace(input[i]))
35       continue;
36     found_whitespace = true;
37     break;
38   }
39 
40   if (!found_whitespace) {
41     // Didn't find any whitespace, we don't need to do anything. We can just
42     // return the input as the output.
43     *output_len = input_len;
44     return input;
45   }
46 
47   // Remove the whitespace into the new buffer and return it.
48   for (int i = 0; i < input_len; i++) {
49     if (!IsRemovableURLWhitespace(input[i]))
50       buffer->push_back(input[i]);
51   }
52   *output_len = buffer->length();
53   return buffer->data();
54 }
55 
// Contains the canonical version of each possible input letter in the scheme
// (basically, lower-cased). The corresponding entry will be 0 if the letter
// is not allowed in a scheme. The table covers the 7-bit ASCII range only, so
// callers must check that a character is < 0x80 before indexing it.
const char kSchemeCanonical[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
75 
// Returns true if |c| may start a scheme, i.e. it is an ASCII letter
// (a-z or A-Z). Digits and '+', '-', '.' are rejected here.
//
// This could be a table lookup as well by setting the high bit for each
// valid character, but it's only called once per URL, and it makes the lookup
// table easier to read not having extra stuff in it.
inline bool IsSchemeFirstChar(unsigned char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
82 
83 template<typename CHAR, typename UCHAR>
DoScheme(const CHAR * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)84 bool DoScheme(const CHAR* spec,
85               const Component& scheme,
86               CanonOutput* output,
87               Component* out_scheme) {
88   if (scheme.len <= 0) {
89     // Scheme is unspecified or empty, convert to empty by appending a colon.
90     *out_scheme = Component(output->length(), 0);
91     output->push_back(':');
92     return true;
93   }
94 
95   // The output scheme starts from the current position.
96   out_scheme->begin = output->length();
97 
98   // Danger: it's important that this code does not strip any characters: it
99   // only emits the canonical version (be it valid or escaped) of each of
100   // the input characters. Stripping would put it out of sync with
101   // FindAndCompareScheme, which could cause some security checks on
102   // schemes to be incorrect.
103   bool success = true;
104   int end = scheme.end();
105   for (int i = scheme.begin; i < end; i++) {
106     UCHAR ch = static_cast<UCHAR>(spec[i]);
107     char replacement = 0;
108     if (ch < 0x80) {
109       if (i == scheme.begin) {
110         // Need to do a special check for the first letter of the scheme.
111         if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
112           replacement = kSchemeCanonical[ch];
113       } else {
114         replacement = kSchemeCanonical[ch];
115       }
116     }
117 
118     if (replacement) {
119       output->push_back(replacement);
120     } else if (ch == '%') {
121       // Canonicalizing the scheme multiple times should lead to the same
122       // result. Since invalid characters will be escaped, we need to preserve
123       // the percent to avoid multiple escaping. The scheme will be invalid.
124       success = false;
125       output->push_back('%');
126     } else {
127       // Invalid character, store it but mark this scheme as invalid.
128       success = false;
129 
130       // This will escape the output and also handle encoding issues.
131       // Ignore the return value since we already failed.
132       AppendUTF8EscapedChar(spec, &i, end, output);
133     }
134   }
135 
136   // The output scheme ends with the the current position, before appending
137   // the colon.
138   out_scheme->len = output->length() - out_scheme->begin;
139   output->push_back(':');
140   return success;
141 }
142 
143 // The username and password components reference ranges in the corresponding
144 // *_spec strings. Typically, these specs will be the same (we're
145 // canonicalizing a single source string), but may be different when
146 // replacing components.
147 template<typename CHAR, typename UCHAR>
DoUserInfo(const CHAR * username_spec,const Component & username,const CHAR * password_spec,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)148 bool DoUserInfo(const CHAR* username_spec,
149                 const Component& username,
150                 const CHAR* password_spec,
151                 const Component& password,
152                 CanonOutput* output,
153                 Component* out_username,
154                 Component* out_password) {
155   if (username.len <= 0 && password.len <= 0) {
156     // Common case: no user info. We strip empty username/passwords.
157     *out_username = Component();
158     *out_password = Component();
159     return true;
160   }
161 
162   // Write the username.
163   out_username->begin = output->length();
164   if (username.len > 0) {
165     // This will escape characters not valid for the username.
166     AppendStringOfType(&username_spec[username.begin], username.len,
167                        CHAR_USERINFO, output);
168   }
169   out_username->len = output->length() - out_username->begin;
170 
171   // When there is a password, we need the separator. Note that we strip
172   // empty but specified passwords.
173   if (password.len > 0) {
174     output->push_back(':');
175     out_password->begin = output->length();
176     AppendStringOfType(&password_spec[password.begin], password.len,
177                        CHAR_USERINFO, output);
178     out_password->len = output->length() - out_password->begin;
179   } else {
180     *out_password = Component();
181   }
182 
183   output->push_back('@');
184   return true;
185 }
186 
187 // Helper functions for converting port integers to strings.
WritePortInt(char * output,int output_len,int port)188 inline void WritePortInt(char* output, int output_len, int port) {
189   _itoa_s(port, output, output_len, 10);
190 }
191 
192 // This function will prepend the colon if there will be a port.
193 template<typename CHAR, typename UCHAR>
DoPort(const CHAR * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)194 bool DoPort(const CHAR* spec,
195             const Component& port,
196             int default_port_for_scheme,
197             CanonOutput* output,
198             Component* out_port) {
199   int port_num = ParsePort(spec, port);
200   if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
201     *out_port = Component();
202     return true;  // Leave port empty.
203   }
204 
205   if (port_num == PORT_INVALID) {
206     // Invalid port: We'll copy the text from the input so the user can see
207     // what the error was, and mark the URL as invalid by returning false.
208     output->push_back(':');
209     out_port->begin = output->length();
210     AppendInvalidNarrowString(spec, port.begin, port.end(), output);
211     out_port->len = output->length() - out_port->begin;
212     return false;
213   }
214 
215   // Convert port number back to an integer. Max port value is 5 digits, and
216   // the Parsed::ExtractPort will have made sure the integer is in range.
217   const int buf_size = 6;
218   char buf[buf_size];
219   WritePortInt(buf, buf_size, port_num);
220 
221   // Append the port number to the output, preceeded by a colon.
222   output->push_back(':');
223   out_port->begin = output->length();
224   for (int i = 0; i < buf_size && buf[i]; i++)
225     output->push_back(buf[i]);
226 
227   out_port->len = output->length() - out_port->begin;
228   return true;
229 }
230 
231 template<typename CHAR, typename UCHAR>
DoCanonicalizeRef(const CHAR * spec,const Component & ref,CanonOutput * output,Component * out_ref)232 void DoCanonicalizeRef(const CHAR* spec,
233                        const Component& ref,
234                        CanonOutput* output,
235                        Component* out_ref) {
236   if (ref.len < 0) {
237     // Common case of no ref.
238     *out_ref = Component();
239     return;
240   }
241 
242   // Append the ref separator. Note that we need to do this even when the ref
243   // is empty but present.
244   output->push_back('#');
245   out_ref->begin = output->length();
246 
247   // Now iterate through all the characters, converting to UTF-8 and validating.
248   int end = ref.end();
249   for (int i = ref.begin; i < end; i++) {
250     if (spec[i] == 0) {
251       // IE just strips NULLs, so we do too.
252       continue;
253     } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
254       // Unline IE seems to, we escape control characters. This will probably
255       // make the reference fragment unusable on a web page, but people
256       // shouldn't be using control characters in their anchor names.
257       AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
258     } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
259       // Normal ASCII characters are just appended.
260       output->push_back(static_cast<char>(spec[i]));
261     } else {
262       // Non-ASCII characters are appended unescaped, but only when they are
263       // valid. Invalid Unicode characters are replaced with the "invalid
264       // character" as IE seems to (ReadUTFChar puts the unicode replacement
265       // character in the output on failure for us).
266       unsigned code_point;
267       ReadUTFChar(spec, &i, end, &code_point);
268       AppendUTF8Value(code_point, output);
269     }
270   }
271 
272   out_ref->len = output->length() - out_ref->begin;
273 }
274 
275 }  // namespace
276 
RemoveURLWhitespace(const char * input,int input_len,CanonOutputT<char> * buffer,int * output_len)277 const char* RemoveURLWhitespace(const char* input, int input_len,
278                                 CanonOutputT<char>* buffer,
279                                 int* output_len) {
280   return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
281 }
282 
// 16-bit overload: strips '\r', '\n' and '\t' from |input| by forwarding to
// DoRemoveURLWhitespace. Returns |input| itself when nothing had to be
// removed, otherwise the data held by |buffer|; the resulting length is
// stored in |*output_len|.
const base::char16* RemoveURLWhitespace(const base::char16* input,
                                        int input_len,
                                        CanonOutputT<base::char16>* buffer,
                                        int* output_len) {
  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
}
289 
CanonicalSchemeChar(base::char16 ch)290 char CanonicalSchemeChar(base::char16 ch) {
291   if (ch >= 0x80)
292     return 0;  // Non-ASCII is not supported by schemes.
293   return kSchemeCanonical[ch];
294 }
295 
CanonicalizeScheme(const char * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)296 bool CanonicalizeScheme(const char* spec,
297                         const Component& scheme,
298                         CanonOutput* output,
299                         Component* out_scheme) {
300   return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
301 }
302 
CanonicalizeScheme(const base::char16 * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)303 bool CanonicalizeScheme(const base::char16* spec,
304                         const Component& scheme,
305                         CanonOutput* output,
306                         Component* out_scheme) {
307   return DoScheme<base::char16, base::char16>(spec, scheme, output, out_scheme);
308 }
309 
CanonicalizeUserInfo(const char * username_source,const Component & username,const char * password_source,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)310 bool CanonicalizeUserInfo(const char* username_source,
311                           const Component& username,
312                           const char* password_source,
313                           const Component& password,
314                           CanonOutput* output,
315                           Component* out_username,
316                           Component* out_password) {
317   return DoUserInfo<char, unsigned char>(
318       username_source, username, password_source, password,
319       output, out_username, out_password);
320 }
321 
CanonicalizeUserInfo(const base::char16 * username_source,const Component & username,const base::char16 * password_source,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)322 bool CanonicalizeUserInfo(const base::char16* username_source,
323                           const Component& username,
324                           const base::char16* password_source,
325                           const Component& password,
326                           CanonOutput* output,
327                           Component* out_username,
328                           Component* out_password) {
329   return DoUserInfo<base::char16, base::char16>(
330       username_source, username, password_source, password,
331       output, out_username, out_password);
332 }
333 
CanonicalizePort(const char * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)334 bool CanonicalizePort(const char* spec,
335                       const Component& port,
336                       int default_port_for_scheme,
337                       CanonOutput* output,
338                       Component* out_port) {
339   return DoPort<char, unsigned char>(spec, port,
340                                      default_port_for_scheme,
341                                      output, out_port);
342 }
343 
CanonicalizePort(const base::char16 * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)344 bool CanonicalizePort(const base::char16* spec,
345                       const Component& port,
346                       int default_port_for_scheme,
347                       CanonOutput* output,
348                       Component* out_port) {
349   return DoPort<base::char16, base::char16>(spec, port, default_port_for_scheme,
350                                             output, out_port);
351 }
352 
CanonicalizeRef(const char * spec,const Component & ref,CanonOutput * output,Component * out_ref)353 void CanonicalizeRef(const char* spec,
354                      const Component& ref,
355                      CanonOutput* output,
356                      Component* out_ref) {
357   DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
358 }
359 
CanonicalizeRef(const base::char16 * spec,const Component & ref,CanonOutput * output,Component * out_ref)360 void CanonicalizeRef(const base::char16* spec,
361                      const Component& ref,
362                      CanonOutput* output,
363                      Component* out_ref) {
364   DoCanonicalizeRef<base::char16, base::char16>(spec, ref, output, out_ref);
365 }
366 
367 }  // namespace url
368