1 /***************************************************************************
2  *                                  _   _ ____  _
3  *  Project                     ___| | | |  _ \| |
4  *                             / __| | | | |_) | |
5  *                            | (__| |_| |  _ <| |___
6  *                             \___|\___/|_| \_\_____|
7  *
8  * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
9  *
10  * This software is licensed as described in the file COPYING, which
11  * you should have received as part of this distribution. The terms
12  * are also available at https://curl.haxx.se/docs/copyright.html.
13  *
14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15  * copies of the Software, and permit persons to whom the Software is
16  * furnished to do so, under the terms of the COPYING file.
17  *
18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19  * KIND, either express or implied.
20  *
21  ***************************************************************************/
22 
23 #include "curl_setup.h"
24 
25 #include "urldata.h"
26 #include "urlapi-int.h"
27 #include "strcase.h"
28 #include "dotdot.h"
29 #include "url.h"
30 #include "escape.h"
31 #include "curl_ctype.h"
32 
33 /* The last 3 #include files should be in this order */
34 #include "curl_printf.h"
35 #include "curl_memory.h"
36 #include "memdebug.h"
37 
/* MSDOS/Windows style drive prefix, eg c: in c:foo
 *
 * Evaluates to non-zero when 'str' starts with a single ASCII letter
 * followed by ':'. The argument is parenthesized on every use so that
 * expressions such as 'p + 1' expand correctly. 'str' is evaluated more
 * than once and must therefore be free of side effects. */
#define STARTS_WITH_DRIVE_PREFIX(str) \
  ((('a' <= (str)[0] && (str)[0] <= 'z') || \
    ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
   ((str)[1] == ':'))
43 
/* True when 'str' begins with an MSDOS/Windows drive designator as it can
 * appear in a URL: one ASCII letter, then ':' or '|', then a slash, a
 * backslash or the end of the string. 'str' is evaluated several times, so
 * it must be free of side effects. */
#define STARTS_WITH_URL_DRIVE_PREFIX(str) \
  (((((str)[0] >= 'a') && ((str)[0] <= 'z')) || \
    (((str)[0] >= 'A') && ((str)[0] <= 'Z'))) && \
   (((str)[1] == ':') || ((str)[1] == '|')) && \
   (((str)[2] == '/') || ((str)[2] == '\\') || ((str)[2] == 0)))
51 
/* Internal representation of CURLU. Point to URL-encoded strings.
 *
 * Every char pointer below is either NULL or a heap allocation that
 * free_urlhandle() releases. */
struct Curl_URL {
  char *scheme;   /* scheme name, stored by seturl() */
  char *user;     /* user name, stored by parse_hostname_login() */
  char *password; /* password, stored by parse_hostname_login() */
  char *options; /* IMAP only? */
  char *host;     /* host name with login details and port stripped off */
  char *port;     /* port number as a decimal string, see Curl_parse_port() */
  char *path;     /* path part */
  char *query;    /* query part, without the leading '?' */
  char *fragment; /* fragment part, without the leading '#' */

  char *scratch; /* temporary scratch area */
  long portnum; /* the numerical version */
};
67 
/* Scheme used for a URL without one when CURLU_DEFAULT_SCHEME is set */
#define DEFAULT_SCHEME "https"
69 
free_urlhandle(struct Curl_URL * u)70 static void free_urlhandle(struct Curl_URL *u)
71 {
72   free(u->scheme);
73   free(u->user);
74   free(u->password);
75   free(u->options);
76   free(u->host);
77   free(u->port);
78   free(u->path);
79   free(u->query);
80   free(u->fragment);
81   free(u->scratch);
82 }
83 
84 /* move the full contents of one handle onto another and
85    free the original */
mv_urlhandle(struct Curl_URL * from,struct Curl_URL * to)86 static void mv_urlhandle(struct Curl_URL *from,
87                          struct Curl_URL *to)
88 {
89   free_urlhandle(to);
90   *to = *from;
91   free(from);
92 }
93 
/*
 * Return a pointer to the character that ends the host name part of 'url':
 * the first '/' or '?' (as in http://www.url.com?id=2380) found after the
 * leading "//", or after the start of the string when there is no "//".
 * When neither character is present, the terminating zero byte is returned.
 */
static const char *find_host_sep(const char *url)
{
  /* the host name begins right after "//" when present */
  const char *slashes = strstr(url, "//");
  const char *host = slashes ? slashes + 2 : url;

  /* skip everything up to the first '/' or '?', or to the end */
  return host + strcspn(host, "/?");
}
121 
122 /*
123  * Decide in an encoding-independent manner whether a character in an
124  * URL must be escaped. The same criterion must be used in strlen_url()
125  * and strcpy_url().
126  */
urlchar_needs_escaping(int c)127 static bool urlchar_needs_escaping(int c)
128 {
129     return !(ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c));
130 }
131 
132 /*
133  * strlen_url() returns the length of the given URL if the spaces within the
134  * URL were properly URL encoded.
135  * URL encoding should be skipped for host names, otherwise IDN resolution
136  * will fail.
137  */
strlen_url(const char * url,bool relative)138 static size_t strlen_url(const char *url, bool relative)
139 {
140   const unsigned char *ptr;
141   size_t newlen = 0;
142   bool left = TRUE; /* left side of the ? */
143   const unsigned char *host_sep = (const unsigned char *) url;
144 
145   if(!relative)
146     host_sep = (const unsigned char *) find_host_sep(url);
147 
148   for(ptr = (unsigned char *)url; *ptr; ptr++) {
149 
150     if(ptr < host_sep) {
151       ++newlen;
152       continue;
153     }
154 
155     switch(*ptr) {
156     case '?':
157       left = FALSE;
158       /* FALLTHROUGH */
159     default:
160       if(urlchar_needs_escaping(*ptr))
161         newlen += 2;
162       newlen++;
163       break;
164     case ' ':
165       if(left)
166         newlen += 3;
167       else
168         newlen++;
169       break;
170     }
171   }
172   return newlen;
173 }
174 
175 /* strcpy_url() copies a url to a output buffer and URL-encodes the spaces in
176  * the source URL accordingly.
177  * URL encoding should be skipped for host names, otherwise IDN resolution
178  * will fail.
179  */
strcpy_url(char * output,const char * url,bool relative)180 static void strcpy_url(char *output, const char *url, bool relative)
181 {
182   /* we must add this with whitespace-replacing */
183   bool left = TRUE;
184   const unsigned char *iptr;
185   char *optr = output;
186   const unsigned char *host_sep = (const unsigned char *) url;
187 
188   if(!relative)
189     host_sep = (const unsigned char *) find_host_sep(url);
190 
191   for(iptr = (unsigned char *)url;    /* read from here */
192       *iptr;         /* until zero byte */
193       iptr++) {
194 
195     if(iptr < host_sep) {
196       *optr++ = *iptr;
197       continue;
198     }
199 
200     switch(*iptr) {
201     case '?':
202       left = FALSE;
203       /* FALLTHROUGH */
204     default:
205       if(urlchar_needs_escaping(*iptr)) {
206         msnprintf(optr, 4, "%%%02x", *iptr);
207         optr += 3;
208       }
209       else
210         *optr++=*iptr;
211       break;
212     case ' ':
213       if(left) {
214         *optr++='%'; /* add a '%' */
215         *optr++='2'; /* add a '2' */
216         *optr++='0'; /* add a '0' */
217       }
218       else
219         *optr++='+'; /* add a '+' here */
220       break;
221     }
222   }
223   *optr = 0; /* zero terminate output buffer */
224 
225 }
226 
227 /*
228  * Returns true if the given URL is absolute (as opposed to relative) within
229  * the buffer size. Returns the scheme in the buffer if TRUE and 'buf' is
230  * non-NULL.
231  */
Curl_is_absolute_url(const char * url,char * buf,size_t buflen)232 bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
233 {
234   size_t i;
235 #ifdef WIN32
236   if(STARTS_WITH_DRIVE_PREFIX(url))
237     return FALSE;
238 #endif
239   for(i = 0; i < buflen && url[i]; ++i) {
240     char s = url[i];
241     if((s == ':') && (url[i + 1] == '/')) {
242       if(buf)
243         buf[i] = 0;
244       return TRUE;
245     }
246     /* RFC 3986 3.1 explains:
247       scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
248     */
249     else if(ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') ) {
250       if(buf)
251         buf[i] = (char)TOLOWER(s);
252     }
253     else
254       break;
255   }
256   return FALSE;
257 }
258 
/*
 * Concatenate a relative URL to a base URL making it absolute.
 * URL-encodes any spaces.
 * The returned pointer must be freed by the caller unless NULL
 * (returns NULL on out of memory).
 *
 * The function works on a private copy of 'base' that gets truncated in
 * place at the point where the new part is to be appended:
 *
 *  - 'relurl' not starting with '/': the query and the last path segment of
 *    the base are cut off (the last segment is kept when 'relurl' starts
 *    with '?'), one leading "./" is skipped and one more level is cut off
 *    for every leading "../".
 *  - 'relurl' starting with "//": everything after the "//" of the base is
 *    dropped, so only the protocol part is kept.
 *  - 'relurl' starting with a single '/': the base is cut at the first '/'
 *    or '?' following the host name.
 *
 * The remaining base, a slash when needed and the URL-encoded version of
 * the relative part are then joined into a newly allocated string.
 */
static char *concat_url(const char *base, const char *relurl)
{
  /***
   TRY to append this new path to the old URL
   to the right of the host part. Oh crap, this is doomed to cause
   problems in the future...
  */
  char *newest;
  char *protsep;
  char *pathsep;
  size_t newlen;
  bool host_changed = FALSE; /* TRUE when relurl brings its own host name */

  const char *useurl = relurl; /* the part of relurl that gets appended */
  size_t urllen;

  /* we must make our own copy of the URL to play with, as it may
     point to read-only data */
  char *url_clone = strdup(base);

  if(!url_clone)
    return NULL; /* skip out of this NOW */

  /* protsep points to the start of the host name */
  protsep = strstr(url_clone, "//");
  if(!protsep)
    protsep = url_clone;
  else
    protsep += 2; /* pass the slashes */

  if('/' != relurl[0]) {
    int level = 0; /* number of "../" prefixes found in the new URL */

    /* First we need to find out if there's a ?-letter in the URL,
       and cut it and the right-side of that off */
    pathsep = strchr(protsep, '?');
    if(pathsep)
      *pathsep = 0;

    /* we have a relative path to append to the last slash if there's one
       available, or if the new URL is just a query string (starts with a
       '?')  we append the new one at the end of the entire currently worked
       out URL */
    if(useurl[0] != '?') {
      pathsep = strrchr(protsep, '/');
      if(pathsep)
        *pathsep = 0;
    }

    /* Check if there's any slash after the host name, and if so, remember
       that position instead. protsep becomes NULL when the base has no path
       left after the host name. */
    pathsep = strchr(protsep, '/');
    if(pathsep)
      protsep = pathsep + 1;
    else
      protsep = NULL;

    /* now deal with one "./" or any amount of "../" in the newurl
       and act accordingly */

    if((useurl[0] == '.') && (useurl[1] == '/'))
      useurl += 2; /* just skip the "./" */

    while((useurl[0] == '.') &&
          (useurl[1] == '.') &&
          (useurl[2] == '/')) {
      level++;
      useurl += 3; /* pass the "../" */
    }

    if(protsep) {
      while(level--) {
        /* cut off one more level from the right of the original URL */
        pathsep = strrchr(protsep, '/');
        if(pathsep)
          *pathsep = 0;
        else {
          /* no more levels to remove, drop the whole remaining path */
          *protsep = 0;
          break;
        }
      }
    }
  }
  else {
    /* We got a new absolute path for this server */

    if((relurl[0] == '/') && (relurl[1] == '/')) {
      /* the new URL starts with //, just keep the protocol part from the
         original one */
      *protsep = 0;
      useurl = &relurl[2]; /* we keep the slashes from the original, so we
                              skip the new ones */
      host_changed = TRUE;
    }
    else {
      /* cut off the original URL from the first slash, or deal with URLs
         without slash */
      pathsep = strchr(protsep, '/');
      if(pathsep) {
        /* When people use badly formatted URLs, such as
           "http://www.url.com?dir=/home/daniel" we must not use the first
           slash, if there's a ?-letter before it! */
        char *sep = strchr(protsep, '?');
        if(sep && (sep < pathsep))
          pathsep = sep;
        *pathsep = 0;
      }
      else {
        /* There was no slash. Now, since we might be operating on a badly
           formatted URL, such as "http://www.url.com?id=2380" which doesn't
           use a slash separator as it is supposed to, we need to check for a
           ?-letter as well! */
        pathsep = strchr(protsep, '?');
        if(pathsep)
          *pathsep = 0;
      }
    }
  }

  /* If the new part contains a space, this is a mighty stupid redirect
     but we still make an effort to do "right". To the left of a '?'
     letter we replace each space with %20 while it is replaced with '+'
     on the right side of the '?' letter.

     The new part is treated as "relative" (encoded from its first byte)
     unless it starts with a host name of its own.
  */
  newlen = strlen_url(useurl, !host_changed);

  urllen = strlen(url_clone);

  newest = malloc(urllen + 1 + /* possible slash */
                  newlen + 1 /* zero byte */);

  if(!newest) {
    free(url_clone); /* don't leak this */
    return NULL;
  }

  /* copy over the root url part */
  memcpy(newest, url_clone, urllen);

  /* check if we need to append a slash: not when the new part brings its
     own leading slash or is a query, nor when protsep points at an empty
     string */
  if(('/' == useurl[0]) || (protsep && !*protsep) || ('?' == useurl[0]))
    ;
  else
    newest[urllen++]='/';

  /* then append the new piece on the right side, which also zero
     terminates the result */
  strcpy_url(&newest[urllen], useurl, !host_changed);

  free(url_clone);

  return newest;
}
417 
418 /*
419  * parse_hostname_login()
420  *
421  * Parse the login details (user name, password and options) from the URL and
422  * strip them out of the host name
423  *
424  */
parse_hostname_login(struct Curl_URL * u,const struct Curl_handler * h,char ** hostname,unsigned int flags)425 static CURLUcode parse_hostname_login(struct Curl_URL *u,
426                                       const struct Curl_handler *h,
427                                       char **hostname,
428                                       unsigned int flags)
429 {
430   CURLUcode result = CURLUE_OK;
431   CURLcode ccode;
432   char *userp = NULL;
433   char *passwdp = NULL;
434   char *optionsp = NULL;
435 
436   /* At this point, we're hoping all the other special cases have
437    * been taken care of, so conn->host.name is at most
438    *    [user[:password][;options]]@]hostname
439    *
440    * We need somewhere to put the embedded details, so do that first.
441    */
442 
443   char *ptr = strchr(*hostname, '@');
444   char *login = *hostname;
445 
446   if(!ptr)
447     goto out;
448 
449   /* We will now try to extract the
450    * possible login information in a string like:
451    * ftp://user:password@ftp.my.site:8021/README */
452   *hostname = ++ptr;
453 
454   /* We could use the login information in the URL so extract it. Only parse
455      options if the handler says we should. Note that 'h' might be NULL! */
456   ccode = Curl_parse_login_details(login, ptr - login - 1,
457                                    &userp, &passwdp,
458                                    (h && (h->flags & PROTOPT_URLOPTIONS)) ?
459                                    &optionsp:NULL);
460   if(ccode) {
461     result = CURLUE_MALFORMED_INPUT;
462     goto out;
463   }
464 
465   if(userp) {
466     if(flags & CURLU_DISALLOW_USER) {
467       /* Option DISALLOW_USER is set and url contains username. */
468       result = CURLUE_USER_NOT_ALLOWED;
469       goto out;
470     }
471 
472     u->user = userp;
473   }
474 
475   if(passwdp)
476     u->password = passwdp;
477 
478   if(optionsp)
479     u->options = optionsp;
480 
481   return CURLUE_OK;
482   out:
483 
484   free(userp);
485   free(passwdp);
486   free(optionsp);
487 
488   return result;
489 }
490 
Curl_parse_port(struct Curl_URL * u,char * hostname)491 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname)
492 {
493   char *portptr = NULL;
494   char endbracket;
495   int len;
496 
497   /*
498    * Find the end of an IPv6 address, either on the ']' ending bracket or
499    * a percent-encoded zone index.
500    */
501   if(1 == sscanf(hostname, "[%*45[0123456789abcdefABCDEF:.]%c%n",
502                  &endbracket, &len)) {
503     if(']' == endbracket)
504       portptr = &hostname[len];
505     else if('%' == endbracket) {
506       int zonelen = len;
507       if(1 == sscanf(hostname + zonelen, "25%*[^]]%c%n", &endbracket, &len)) {
508         if(']' != endbracket)
509           return CURLUE_MALFORMED_INPUT;
510         portptr = &hostname[--zonelen + len + 1];
511       }
512       else
513         return CURLUE_MALFORMED_INPUT;
514     }
515     else
516       return CURLUE_MALFORMED_INPUT;
517 
518     /* this is a RFC2732-style specified IP-address */
519     if(portptr && *portptr) {
520       if(*portptr != ':')
521         return CURLUE_MALFORMED_INPUT;
522     }
523     else
524       portptr = NULL;
525   }
526   else
527     portptr = strchr(hostname, ':');
528 
529   if(portptr) {
530     char *rest;
531     long port;
532     char portbuf[7];
533 
534     /* Browser behavior adaptation. If there's a colon with no digits after,
535        just cut off the name there which makes us ignore the colon and just
536        use the default port. Firefox, Chrome and Safari all do that. */
537     if(!portptr[1]) {
538       *portptr = '\0';
539       return CURLUE_OK;
540     }
541 
542     if(!ISDIGIT(portptr[1]))
543       return CURLUE_BAD_PORT_NUMBER;
544 
545     port = strtol(portptr + 1, &rest, 10);  /* Port number must be decimal */
546 
547     if((port <= 0) || (port > 0xffff))
548       /* Single unix standard says port numbers are 16 bits long, but we don't
549          treat port zero as OK. */
550       return CURLUE_BAD_PORT_NUMBER;
551 
552     if(rest[0])
553       return CURLUE_BAD_PORT_NUMBER;
554 
555     *portptr++ = '\0'; /* cut off the name there */
556     *rest = 0;
557     /* generate a new port number string to get rid of leading zeroes etc */
558     msnprintf(portbuf, sizeof(portbuf), "%ld", port);
559     u->portnum = port;
560     u->port = strdup(portbuf);
561     if(!u->port)
562       return CURLUE_OUT_OF_MEMORY;
563   }
564 
565   return CURLUE_OK;
566 }
567 
568 /* scan for byte values < 31 or 127 */
junkscan(char * part)569 static CURLUcode junkscan(char *part)
570 {
571   if(part) {
572     static const char badbytes[]={
573       /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
574       0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
575       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
576       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
577       0x7f,
578       0x00 /* zero terminate */
579     };
580     size_t n = strlen(part);
581     size_t nfine = strcspn(part, badbytes);
582     if(nfine != n)
583       /* since we don't know which part is scanned, return a generic error
584          code */
585       return CURLUE_MALFORMED_INPUT;
586   }
587   return CURLUE_OK;
588 }
589 
hostname_check(char * hostname,unsigned int flags)590 static CURLUcode hostname_check(char *hostname, unsigned int flags)
591 {
592   const char *l = NULL; /* accepted characters */
593   size_t len;
594   size_t hlen = strlen(hostname);
595   (void)flags;
596 
597   if(hostname[0] == '[') {
598     hostname++;
599     l = "0123456789abcdefABCDEF::.%";
600     hlen -= 2;
601   }
602 
603   if(l) {
604     /* only valid letters are ok */
605     len = strspn(hostname, l);
606     if(hlen != len)
607       /* hostname with bad content */
608       return CURLUE_MALFORMED_INPUT;
609   }
610   else {
611     /* letters from the second string is not ok */
612     len = strcspn(hostname, " ");
613     if(hlen != len)
614       /* hostname with bad content */
615       return CURLUE_MALFORMED_INPUT;
616   }
617   return CURLUE_OK;
618 }
619 
/* True for the characters that terminate the host name part of a URL: the
   start of the path ('/'), of the query ('?') or of the fragment ('#') */
#define HOSTNAME_END(x) \
  (((x) == '#') || ((x) == '?') || ((x) == '/'))
621 
seturl(const char * url,CURLU * u,unsigned int flags)622 static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
623 {
624   char *path;
625   bool path_alloced = FALSE;
626   char *hostname;
627   char *query = NULL;
628   char *fragment = NULL;
629   CURLUcode result;
630   bool url_has_scheme = FALSE;
631   char schemebuf[MAX_SCHEME_LEN];
632   char *schemep = NULL;
633   size_t schemelen = 0;
634   size_t urllen;
635   const struct Curl_handler *h = NULL;
636 
637   if(!url)
638     return CURLUE_MALFORMED_INPUT;
639 
640   /*************************************************************
641    * Parse the URL.
642    ************************************************************/
643   /* allocate scratch area */
644   urllen = strlen(url);
645   path = u->scratch = malloc(urllen * 2 + 2);
646   if(!path)
647     return CURLUE_OUT_OF_MEMORY;
648 
649   hostname = &path[urllen + 1];
650   hostname[0] = 0;
651 
652   if(Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf))) {
653     url_has_scheme = TRUE;
654     schemelen = strlen(schemebuf);
655   }
656 
657   /* handle the file: scheme */
658   if(url_has_scheme && strcasecompare(schemebuf, "file")) {
659     /* path has been allocated large enough to hold this */
660     strcpy(path, &url[5]);
661 
662     hostname = NULL; /* no host for file: URLs */
663     u->scheme = strdup("file");
664     if(!u->scheme)
665       return CURLUE_OUT_OF_MEMORY;
666 
667     /* Extra handling URLs with an authority component (i.e. that start with
668      * "file://")
669      *
670      * We allow omitted hostname (e.g. file:/<path>) -- valid according to
671      * RFC 8089, but not the (current) WHAT-WG URL spec.
672      */
673     if(path[0] == '/' && path[1] == '/') {
674       /* swallow the two slashes */
675       char *ptr = &path[2];
676 
677       /*
678        * According to RFC 8089, a file: URL can be reliably dereferenced if:
679        *
680        *  o it has no/blank hostname, or
681        *
682        *  o the hostname matches "localhost" (case-insensitively), or
683        *
684        *  o the hostname is a FQDN that resolves to this machine.
685        *
686        * For brevity, we only consider URLs with empty, "localhost", or
687        * "127.0.0.1" hostnames as local.
688        *
689        * Additionally, there is an exception for URLs with a Windows drive
690        * letter in the authority (which was accidentally omitted from RFC 8089
691        * Appendix E, but believe me, it was meant to be there. --MK)
692        */
693       if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
694         /* the URL includes a host name, it must match "localhost" or
695            "127.0.0.1" to be valid */
696         if(!checkprefix("localhost/", ptr) &&
697            !checkprefix("127.0.0.1/", ptr)) {
698           /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
699              none */
700           return CURLUE_MALFORMED_INPUT;
701         }
702         ptr += 9; /* now points to the slash after the host */
703       }
704 
705       path = ptr;
706     }
707 
708 #if !defined(MSDOS) && !defined(WIN32) && !defined(__CYGWIN__)
709     /* Don't allow Windows drive letters when not in Windows.
710      * This catches both "file:/c:" and "file:c:" */
711     if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
712        STARTS_WITH_URL_DRIVE_PREFIX(path)) {
713       /* File drive letters are only accepted in MSDOS/Windows */
714       return CURLUE_MALFORMED_INPUT;
715     }
716 #else
717     /* If the path starts with a slash and a drive letter, ditch the slash */
718     if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
719       /* This cannot be done with strcpy, as the memory chunks overlap! */
720       memmove(path, &path[1], strlen(&path[1]) + 1);
721     }
722 #endif
723 
724   }
725   else {
726     /* clear path */
727     const char *p;
728     const char *hostp;
729     size_t len;
730     path[0] = 0;
731 
732     if(url_has_scheme) {
733       int i = 0;
734       p = &url[schemelen + 1];
735       while(p && (*p == '/') && (i < 4)) {
736         p++;
737         i++;
738       }
739       if((i < 1) || (i>3))
740         /* less than one or more than three slashes */
741         return CURLUE_MALFORMED_INPUT;
742 
743       schemep = schemebuf;
744       if(!Curl_builtin_scheme(schemep) &&
745          !(flags & CURLU_NON_SUPPORT_SCHEME))
746         return CURLUE_UNSUPPORTED_SCHEME;
747 
748       if(junkscan(schemep))
749         return CURLUE_MALFORMED_INPUT;
750     }
751     else {
752       /* no scheme! */
753 
754       if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME)))
755         return CURLUE_MALFORMED_INPUT;
756       if(flags & CURLU_DEFAULT_SCHEME)
757         schemep = (char *) DEFAULT_SCHEME;
758 
759       /*
760        * The URL was badly formatted, let's try without scheme specified.
761        */
762       p = url;
763     }
764     hostp = p; /* host name starts here */
765 
766     while(*p && !HOSTNAME_END(*p)) /* find end of host name */
767       p++;
768 
769     len = p - hostp;
770     if(!len)
771       return CURLUE_MALFORMED_INPUT;
772 
773     memcpy(hostname, hostp, len);
774     hostname[len] = 0;
775 
776     if((flags & CURLU_GUESS_SCHEME) && !schemep) {
777       /* legacy curl-style guess based on host name */
778       if(checkprefix("ftp.", hostname))
779         schemep = (char *)"ftp";
780       else if(checkprefix("dict.", hostname))
781         schemep = (char *)"dict";
782       else if(checkprefix("ldap.", hostname))
783         schemep = (char *)"ldap";
784       else if(checkprefix("imap.", hostname))
785         schemep = (char *)"imap";
786       else if(checkprefix("smtp.", hostname))
787         schemep = (char *)"smtp";
788       else if(checkprefix("pop3.", hostname))
789         schemep = (char *)"pop3";
790       else
791         schemep = (char *)"http";
792     }
793 
794     len = strlen(p);
795     memcpy(path, p, len);
796     path[len] = 0;
797 
798     u->scheme = strdup(schemep);
799     if(!u->scheme)
800       return CURLUE_OUT_OF_MEMORY;
801   }
802 
803   /* if this is a known scheme, get some details */
804   h = Curl_builtin_scheme(u->scheme);
805 
806   if(junkscan(path))
807     return CURLUE_MALFORMED_INPUT;
808 
809   query = strchr(path, '?');
810   if(query)
811     *query++ = 0;
812 
813   fragment = strchr(query?query:path, '#');
814   if(fragment)
815     *fragment++ = 0;
816 
817   if(!path[0])
818     /* if there's no path set, unset */
819     path = NULL;
820   else if(!(flags & CURLU_PATH_AS_IS)) {
821     /* sanitise paths and remove ../ and ./ sequences according to RFC3986 */
822     char *newp = Curl_dedotdotify(path);
823     if(!newp)
824       return CURLUE_OUT_OF_MEMORY;
825 
826     if(strcmp(newp, path)) {
827       /* if we got a new version */
828       path = newp;
829       path_alloced = TRUE;
830     }
831     else
832       free(newp);
833   }
834   if(path) {
835     u->path = path_alloced?path:strdup(path);
836     if(!u->path)
837       return CURLUE_OUT_OF_MEMORY;
838   }
839 
840   if(hostname) {
841     /*
842      * Parse the login details and strip them out of the host name.
843      */
844     if(junkscan(hostname))
845       return CURLUE_MALFORMED_INPUT;
846 
847     result = parse_hostname_login(u, h, &hostname, flags);
848     if(result)
849       return result;
850 
851     result = Curl_parse_port(u, hostname);
852     if(result)
853       return result;
854 
855     result = hostname_check(hostname, flags);
856     if(result)
857       return result;
858 
859     u->host = strdup(hostname);
860     if(!u->host)
861       return CURLUE_OUT_OF_MEMORY;
862   }
863 
864   if(query) {
865     u->query = strdup(query);
866     if(!u->query)
867       return CURLUE_OUT_OF_MEMORY;
868   }
869   if(fragment && fragment[0]) {
870     u->fragment = strdup(fragment);
871     if(!u->fragment)
872       return CURLUE_OUT_OF_MEMORY;
873   }
874 
875   free(u->scratch);
876   u->scratch = NULL;
877 
878   return CURLUE_OK;
879 }
880 
881 /*
882  * Parse the URL and set the relevant members of the Curl_URL struct.
883  */
parseurl(const char * url,CURLU * u,unsigned int flags)884 static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
885 {
886   CURLUcode result = seturl(url, u, flags);
887   if(result) {
888     free_urlhandle(u);
889     memset(u, 0, sizeof(struct Curl_URL));
890   }
891   return result;
892 }
893 
894 /*
895  */
curl_url(void)896 CURLU *curl_url(void)
897 {
898   return calloc(sizeof(struct Curl_URL), 1);
899 }
900 
/*
 * curl_url_cleanup() frees all strings held by the handle and then the
 * handle itself. Passing NULL is allowed and does nothing.
 */
void curl_url_cleanup(CURLU *u)
{
  if(!u)
    return;

  free_urlhandle(u);
  free(u);
}
908 
/*
 * DUP() duplicates the string member 'name' from the struct 'src' points to
 * into the struct 'dest' points to. A NULL member is left alone. If the
 * allocation fails, control jumps to the 'fail' label that the function
 * using the macro must provide.
 *
 * The body is wrapped in do { } while(0) so that the macro acts as a single
 * statement, which keeps it safe in an unbraced if/else.
 */
#define DUP(dest, src, name)                   \
  do {                                         \
    if((src)->name) {                          \
      (dest)->name = strdup((src)->name);      \
      if(!(dest)->name)                        \
        goto fail;                             \
    }                                          \
  } while(0)
915 
/*
 * curl_url_dup() creates a new handle holding private copies of all the
 * URL parts stored in 'in', plus the numerical port. The scratch area is
 * not copied. Returns NULL if any allocation fails; a partially built copy
 * is then released before returning.
 */
CURLU *curl_url_dup(CURLU *in)
{
  struct Curl_URL *copy = calloc(sizeof(struct Curl_URL), 1);

  if(!copy)
    return NULL;

  /* each DUP() jumps to 'fail' when out of memory */
  DUP(copy, in, scheme);
  DUP(copy, in, user);
  DUP(copy, in, password);
  DUP(copy, in, options);
  DUP(copy, in, host);
  DUP(copy, in, port);
  DUP(copy, in, path);
  DUP(copy, in, query);
  DUP(copy, in, fragment);
  copy->portnum = in->portnum;

  return copy;

  fail:
  curl_url_cleanup(copy);
  return NULL;
}
936 
curl_url_get(CURLU * u,CURLUPart what,char ** part,unsigned int flags)937 CURLUcode curl_url_get(CURLU *u, CURLUPart what,
938                        char **part, unsigned int flags)
939 {
940   char *ptr;
941   CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
942   char portbuf[7];
943   bool urldecode = (flags & CURLU_URLDECODE)?1:0;
944   bool plusdecode = FALSE;
945   (void)flags;
946   if(!u)
947     return CURLUE_BAD_HANDLE;
948   if(!part)
949     return CURLUE_BAD_PARTPOINTER;
950   *part = NULL;
951 
952   switch(what) {
953   case CURLUPART_SCHEME:
954     ptr = u->scheme;
955     ifmissing = CURLUE_NO_SCHEME;
956     urldecode = FALSE; /* never for schemes */
957     break;
958   case CURLUPART_USER:
959     ptr = u->user;
960     ifmissing = CURLUE_NO_USER;
961     break;
962   case CURLUPART_PASSWORD:
963     ptr = u->password;
964     ifmissing = CURLUE_NO_PASSWORD;
965     break;
966   case CURLUPART_OPTIONS:
967     ptr = u->options;
968     ifmissing = CURLUE_NO_OPTIONS;
969     break;
970   case CURLUPART_HOST:
971     ptr = u->host;
972     ifmissing = CURLUE_NO_HOST;
973     break;
974   case CURLUPART_PORT:
975     ptr = u->port;
976     ifmissing = CURLUE_NO_PORT;
977     urldecode = FALSE; /* never for port */
978     if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
979       /* there's no stored port number, but asked to deliver
980          a default one for the scheme */
981       const struct Curl_handler *h =
982         Curl_builtin_scheme(u->scheme);
983       if(h) {
984         msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
985         ptr = portbuf;
986       }
987     }
988     else if(ptr && u->scheme) {
989       /* there is a stored port number, but ask to inhibit if
990          it matches the default one for the scheme */
991       const struct Curl_handler *h =
992         Curl_builtin_scheme(u->scheme);
993       if(h && (h->defport == u->portnum) &&
994          (flags & CURLU_NO_DEFAULT_PORT))
995         ptr = NULL;
996     }
997     break;
998   case CURLUPART_PATH:
999     ptr = u->path;
1000     if(!ptr) {
1001       ptr = u->path = strdup("/");
1002       if(!u->path)
1003         return CURLUE_OUT_OF_MEMORY;
1004     }
1005     break;
1006   case CURLUPART_QUERY:
1007     ptr = u->query;
1008     ifmissing = CURLUE_NO_QUERY;
1009     plusdecode = urldecode;
1010     break;
1011   case CURLUPART_FRAGMENT:
1012     ptr = u->fragment;
1013     ifmissing = CURLUE_NO_FRAGMENT;
1014     break;
1015   case CURLUPART_URL: {
1016     char *url;
1017     char *scheme;
1018     char *options = u->options;
1019     char *port = u->port;
1020     if(u->scheme && strcasecompare("file", u->scheme)) {
1021       url = aprintf("file://%s%s%s",
1022                     u->path,
1023                     u->fragment? "#": "",
1024                     u->fragment? u->fragment : "");
1025     }
1026     else if(!u->host)
1027       return CURLUE_NO_HOST;
1028     else {
1029       const struct Curl_handler *h = NULL;
1030       if(u->scheme)
1031         scheme = u->scheme;
1032       else if(flags & CURLU_DEFAULT_SCHEME)
1033         scheme = (char *) DEFAULT_SCHEME;
1034       else
1035         return CURLUE_NO_SCHEME;
1036 
1037       if(scheme) {
1038         h = Curl_builtin_scheme(scheme);
1039         if(!port && (flags & CURLU_DEFAULT_PORT)) {
1040           /* there's no stored port number, but asked to deliver
1041              a default one for the scheme */
1042           if(h) {
1043             msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
1044             port = portbuf;
1045           }
1046         }
1047         else if(port) {
1048           /* there is a stored port number, but asked to inhibit if it matches
1049              the default one for the scheme */
1050           if(h && (h->defport == u->portnum) &&
1051              (flags & CURLU_NO_DEFAULT_PORT))
1052             port = NULL;
1053         }
1054       }
1055       if(h && !(h->flags & PROTOPT_URLOPTIONS))
1056         options = NULL;
1057 
1058       url = aprintf("%s://%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
1059                     scheme,
1060                     u->user ? u->user : "",
1061                     u->password ? ":": "",
1062                     u->password ? u->password : "",
1063                     options ? ";" : "",
1064                     options ? options : "",
1065                     (u->user || u->password || options) ? "@": "",
1066                     u->host,
1067                     port ? ":": "",
1068                     port ? port : "",
1069                     (u->path && (u->path[0] != '/')) ? "/": "",
1070                     u->path ? u->path : "/",
1071                     (u->query && u->query[0]) ? "?": "",
1072                     (u->query && u->query[0]) ? u->query : "",
1073                     u->fragment? "#": "",
1074                     u->fragment? u->fragment : "");
1075     }
1076     if(!url)
1077       return CURLUE_OUT_OF_MEMORY;
1078     *part = url;
1079     return CURLUE_OK;
1080   }
1081   default:
1082     ptr = NULL;
1083     break;
1084   }
1085   if(ptr) {
1086     *part = strdup(ptr);
1087     if(!*part)
1088       return CURLUE_OUT_OF_MEMORY;
1089     if(plusdecode) {
1090       /* convert + to space */
1091       char *plus;
1092       for(plus = *part; *plus; ++plus) {
1093         if(*plus == '+')
1094           *plus = ' ';
1095       }
1096     }
1097     if(urldecode) {
1098       char *decoded;
1099       size_t dlen;
1100       CURLcode res = Curl_urldecode(NULL, *part, 0, &decoded, &dlen, TRUE);
1101       free(*part);
1102       if(res) {
1103         *part = NULL;
1104         return CURLUE_URLDECODE;
1105       }
1106       *part = decoded;
1107     }
1108     return CURLUE_OK;
1109   }
1110   else
1111     return ifmissing;
1112 }
1113 
curl_url_set(CURLU * u,CURLUPart what,const char * part,unsigned int flags)1114 CURLUcode curl_url_set(CURLU *u, CURLUPart what,
1115                        const char *part, unsigned int flags)
1116 {
1117   char **storep = NULL;
1118   long port = 0;
1119   bool urlencode = (flags & CURLU_URLENCODE)? 1 : 0;
1120   bool plusencode = FALSE;
1121   bool urlskipslash = FALSE;
1122   bool appendquery = FALSE;
1123   bool equalsencode = FALSE;
1124 
1125   if(!u)
1126     return CURLUE_BAD_HANDLE;
1127   if(!part) {
1128     /* setting a part to NULL clears it */
1129     switch(what) {
1130     case CURLUPART_URL:
1131       break;
1132     case CURLUPART_SCHEME:
1133       storep = &u->scheme;
1134       break;
1135     case CURLUPART_USER:
1136       storep = &u->user;
1137       break;
1138     case CURLUPART_PASSWORD:
1139       storep = &u->password;
1140       break;
1141     case CURLUPART_OPTIONS:
1142       storep = &u->options;
1143       break;
1144     case CURLUPART_HOST:
1145       storep = &u->host;
1146       break;
1147     case CURLUPART_PORT:
1148       storep = &u->port;
1149       break;
1150     case CURLUPART_PATH:
1151       storep = &u->path;
1152       break;
1153     case CURLUPART_QUERY:
1154       storep = &u->query;
1155       break;
1156     case CURLUPART_FRAGMENT:
1157       storep = &u->fragment;
1158       break;
1159     default:
1160       return CURLUE_UNKNOWN_PART;
1161     }
1162     if(storep && *storep) {
1163       free(*storep);
1164       *storep = NULL;
1165     }
1166     return CURLUE_OK;
1167   }
1168 
1169   switch(what) {
1170   case CURLUPART_SCHEME:
1171     if(!(flags & CURLU_NON_SUPPORT_SCHEME) &&
1172        /* verify that it is a fine scheme */
1173        !Curl_builtin_scheme(part))
1174       return CURLUE_UNSUPPORTED_SCHEME;
1175     storep = &u->scheme;
1176     urlencode = FALSE; /* never */
1177     break;
1178   case CURLUPART_USER:
1179     storep = &u->user;
1180     break;
1181   case CURLUPART_PASSWORD:
1182     storep = &u->password;
1183     break;
1184   case CURLUPART_OPTIONS:
1185     storep = &u->options;
1186     break;
1187   case CURLUPART_HOST:
1188     storep = &u->host;
1189     break;
1190   case CURLUPART_PORT:
1191     urlencode = FALSE; /* never */
1192     port = strtol(part, NULL, 10);  /* Port number must be decimal */
1193     if((port <= 0) || (port > 0xffff))
1194       return CURLUE_BAD_PORT_NUMBER;
1195     storep = &u->port;
1196     break;
1197   case CURLUPART_PATH:
1198     urlskipslash = TRUE;
1199     storep = &u->path;
1200     break;
1201   case CURLUPART_QUERY:
1202     plusencode = urlencode;
1203     appendquery = (flags & CURLU_APPENDQUERY)?1:0;
1204     equalsencode = appendquery;
1205     storep = &u->query;
1206     break;
1207   case CURLUPART_FRAGMENT:
1208     storep = &u->fragment;
1209     break;
1210   case CURLUPART_URL: {
1211     /*
1212      * Allow a new URL to replace the existing (if any) contents.
1213      *
1214      * If the existing contents is enough for a URL, allow a relative URL to
1215      * replace it.
1216      */
1217     CURLUcode result;
1218     char *oldurl;
1219     char *redired_url;
1220     CURLU *handle2;
1221 
1222     if(Curl_is_absolute_url(part, NULL, MAX_SCHEME_LEN)) {
1223       handle2 = curl_url();
1224       if(!handle2)
1225         return CURLUE_OUT_OF_MEMORY;
1226       result = parseurl(part, handle2, flags);
1227       if(!result)
1228         mv_urlhandle(handle2, u);
1229       else
1230         curl_url_cleanup(handle2);
1231       return result;
1232     }
1233     /* extract the full "old" URL to do the redirect on */
1234     result = curl_url_get(u, CURLUPART_URL, &oldurl, flags);
1235     if(result) {
1236       /* couldn't get the old URL, just use the new! */
1237       handle2 = curl_url();
1238       if(!handle2)
1239         return CURLUE_OUT_OF_MEMORY;
1240       result = parseurl(part, handle2, flags);
1241       if(!result)
1242         mv_urlhandle(handle2, u);
1243       else
1244         curl_url_cleanup(handle2);
1245       return result;
1246     }
1247 
1248     /* apply the relative part to create a new URL */
1249     redired_url = concat_url(oldurl, part);
1250     free(oldurl);
1251     if(!redired_url)
1252       return CURLUE_OUT_OF_MEMORY;
1253 
1254     /* now parse the new URL */
1255     handle2 = curl_url();
1256     if(!handle2) {
1257       free(redired_url);
1258       return CURLUE_OUT_OF_MEMORY;
1259     }
1260     result = parseurl(redired_url, handle2, flags);
1261     free(redired_url);
1262     if(!result)
1263       mv_urlhandle(handle2, u);
1264     else
1265       curl_url_cleanup(handle2);
1266     return result;
1267   }
1268   default:
1269     return CURLUE_UNKNOWN_PART;
1270   }
1271   if(storep) {
1272     const char *newp = part;
1273     size_t nalloc = strlen(part);
1274 
1275     if(urlencode) {
1276       const char *i;
1277       char *o;
1278       bool free_part = FALSE;
1279       char *enc = malloc(nalloc * 3 + 1); /* for worst case! */
1280       if(!enc)
1281         return CURLUE_OUT_OF_MEMORY;
1282       if(plusencode) {
1283         /* space to plus */
1284         i = part;
1285         for(o = enc; *i; ++o, ++i)
1286           *o = (*i == ' ') ? '+' : *i;
1287         *o = 0; /* zero terminate */
1288         part = strdup(enc);
1289         if(!part) {
1290           free(enc);
1291           return CURLUE_OUT_OF_MEMORY;
1292         }
1293         free_part = TRUE;
1294       }
1295       for(i = part, o = enc; *i; i++) {
1296         if(Curl_isunreserved(*i) ||
1297            ((*i == '/') && urlskipslash) ||
1298            ((*i == '=') && equalsencode) ||
1299            ((*i == '+') && plusencode)) {
1300           if((*i == '=') && equalsencode)
1301             /* only skip the first equals sign */
1302             equalsencode = FALSE;
1303           *o = *i;
1304           o++;
1305         }
1306         else {
1307           msnprintf(o, 4, "%%%02x", *i);
1308           o += 3;
1309         }
1310       }
1311       *o = 0; /* zero terminate */
1312       newp = enc;
1313       if(free_part)
1314         free((char *)part);
1315     }
1316     else {
1317       char *p;
1318       newp = strdup(part);
1319       if(!newp)
1320         return CURLUE_OUT_OF_MEMORY;
1321       p = (char *)newp;
1322       while(*p) {
1323         /* make sure percent encoded are lower case */
1324         if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
1325            (ISUPPER(p[1]) || ISUPPER(p[2]))) {
1326           p[1] = (char)TOLOWER(p[1]);
1327           p[2] = (char)TOLOWER(p[2]);
1328           p += 3;
1329         }
1330         else
1331           p++;
1332       }
1333     }
1334 
1335     if(appendquery) {
1336       /* Append the string onto the old query. Add a '&' separator if none is
1337          present at the end of the exsting query already */
1338       size_t querylen = u->query ? strlen(u->query) : 0;
1339       bool addamperand = querylen && (u->query[querylen -1] != '&');
1340       if(querylen) {
1341         size_t newplen = strlen(newp);
1342         char *p = malloc(querylen + addamperand + newplen + 1);
1343         if(!p) {
1344           free((char *)newp);
1345           return CURLUE_OUT_OF_MEMORY;
1346         }
1347         strcpy(p, u->query); /* original query */
1348         if(addamperand)
1349           p[querylen] = '&'; /* ampersand */
1350         strcpy(&p[querylen + addamperand], newp); /* new suffix */
1351         free((char *)newp);
1352         free(*storep);
1353         *storep = p;
1354         return CURLUE_OK;
1355       }
1356     }
1357 
1358     free(*storep);
1359     *storep = (char *)newp;
1360   }
1361   /* set after the string, to make it not assigned if the allocation above
1362      fails */
1363   if(port)
1364     u->portnum = port;
1365   return CURLUE_OK;
1366 }
1367