1 /**
2  * uri.c: set of generic URI related routines
3  *
4  * Reference: RFCs 3986, 2732 and 2373
5  *
6  * See Copyright for the status of this software.
7  *
8  * daniel@veillard.com
9  */
10 
#define IN_LIBXML
#include "libxml.h"

#include <limits.h>
#include <string.h>

#include <libxml/xmlmemory.h>
#include <libxml/uri.h>
#include <libxml/globals.h>
#include <libxml/xmlerror.h>
20 
/**
 * MAX_URI_LENGTH:
 *
 * The definition of the URI regexp in the above RFC has no size limit.
 * In practice URIs are usually relatively short except for the
 * data URI scheme as defined in RFC 2397. Even for data URI the usual
 * maximum size before hitting random practical limits is around 64 KB
 * and 4KB is usually a maximum admitted limit for proper operations.
 * The value below is more a security limit than anything else and
 * really should never be hit by 'normal' operations.
 * Set to 1 MByte in 2012, this is only enforced on output.
 *
 * The expansion is parenthesized so that the macro behaves as a single
 * value inside any surrounding expression (division, modulo, ...).
 */
#define MAX_URI_LENGTH (1024 * 1024)
34 
35 static void
xmlURIErrMemory(const char * extra)36 xmlURIErrMemory(const char *extra)
37 {
38     if (extra)
39         __xmlRaiseError(NULL, NULL, NULL,
40                         NULL, NULL, XML_FROM_URI,
41                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0,
42                         extra, NULL, NULL, 0, 0,
43                         "Memory allocation failed : %s\n", extra);
44     else
45         __xmlRaiseError(NULL, NULL, NULL,
46                         NULL, NULL, XML_FROM_URI,
47                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0,
48                         NULL, NULL, NULL, 0, 0,
49                         "Memory allocation failed\n");
50 }
51 
52 static void xmlCleanURI(xmlURIPtr uri);
53 
/*
 * Character classes following the old RFC 2396 rules, used by the legacy
 * handling code. Every macro takes a character value, except IS_UNWISE
 * which takes a pointer to the character to test.
 */

/* lowalpha = "a" .. "z" */
#define IS_LOWALPHA(x) (((x) >= 'a') && ((x) <= 'z'))

/* upalpha = "A" .. "Z" */
#define IS_UPALPHA(x) (((x) >= 'A') && ((x) <= 'Z'))

/* alpha = lowalpha | upalpha */
#define IS_ALPHA(x) (IS_LOWALPHA(x) || IS_UPALPHA(x))

/* digit = "0" .. "9"; any earlier IS_DIGIT definition is replaced */
#ifdef IS_DIGIT
#undef IS_DIGIT
#endif
#define IS_DIGIT(x) (((x) >= '0') && ((x) <= '9'))

/* alphanum = alpha | digit */
#define IS_ALPHANUM(x) (IS_ALPHA(x) || IS_DIGIT(x))

/* mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" */
#define IS_MARK(x)                                                      \
    (((x) == '-') || ((x) == '_') || ((x) == '.') || ((x) == '!') ||    \
     ((x) == '~') || ((x) == '*') || ((x) == '\'') || ((x) == '(') ||   \
     ((x) == ')'))

/*
 * unwise = "{" | "}" | "|" | "\" | "^" | "`"
 * This macro additionally accepts "[" and "]".
 * Unlike its siblings it dereferences its argument: pass a pointer.
 */
#define IS_UNWISE(p)                                                    \
    (((*(p) == '{')) || ((*(p) == '}')) || ((*(p) == '|')) ||           \
     ((*(p) == '\\')) || ((*(p) == '^')) || ((*(p) == '[')) ||          \
     ((*(p) == ']')) || ((*(p) == '`')))

/*
 * reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | "," |
 *            "[" | "]"
 */
#define IS_RESERVED(x)                                                  \
    (((x) == ';') || ((x) == '/') || ((x) == '?') || ((x) == ':') ||    \
     ((x) == '@') || ((x) == '&') || ((x) == '=') || ((x) == '+') ||    \
     ((x) == '$') || ((x) == ',') || ((x) == '[') || ((x) == ']'))

/* unreserved = alphanum | mark */
#define IS_UNRESERVED(x) (IS_ALPHANUM(x) || IS_MARK(x))
121 
/*
 * Advance @p to the next character: three bytes forward when it points
 * at a '%' (an escape sequence), one byte otherwise.
 * The macro itself does not check that two more characters follow the
 * '%'; @p must be a modifiable lvalue and is evaluated more than once.
 * The argument is parenthesized so that lvalue expressions such as *pp
 * advance the intended object.
 */

#define NEXT(p) ((*(p) == '%') ? (p) += 3 : (p)++)
127 
128 /*
129  * Productions from the spec.
130  *
131  *    authority     = server | reg_name
132  *    reg_name      = 1*( unreserved | escaped | "$" | "," |
133  *                        ";" | ":" | "@" | "&" | "=" | "+" )
134  *
135  * path          = [ abs_path | opaque_part ]
136  */
137 
/*
 * Duplicate the first @n bytes of @s through xmlStrndup() and cast the
 * result to char *. The whole expansion is parenthesized so the cast
 * cannot be re-associated by a surrounding expression.
 */
#define STRNDUP(s, n) ((char *) xmlStrndup((const xmlChar *)(s), (n)))
139 
140 /************************************************************************
141  *									*
142  *                         RFC 3986 parser				*
143  *									*
144  ************************************************************************/
145 
/*
 * Character classes from the RFC 3986 grammar. Every macro takes a
 * pointer and tests the character(s) it points to; the argument is
 * evaluated several times, so it must be free of side effects.
 */

/* DIGIT */
#define ISA_DIGIT(p) ((*(p) >= '0') && (*(p) <= '9'))

/* ALPHA */
#define ISA_ALPHA(p) (((*(p) >= 'a') && (*(p) <= 'z')) ||		\
                      ((*(p) >= 'A') && (*(p) <= 'Z')))

/* HEXDIG, accepting both upper and lower case letters */
#define ISA_HEXDIG(p)							\
       (ISA_DIGIT(p) || ((*(p) >= 'a') && (*(p) <= 'f')) ||		\
        ((*(p) >= 'A') && (*(p) <= 'F')))

/*
 *    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
 *                     / "*" / "+" / "," / ";" / "="
 */
#define ISA_SUB_DELIM(p)						\
      (((*(p) == '!')) || ((*(p) == '$')) || ((*(p) == '&')) ||		\
       ((*(p) == '(')) || ((*(p) == ')')) || ((*(p) == '*')) ||		\
       ((*(p) == '+')) || ((*(p) == ',')) || ((*(p) == ';')) ||		\
       ((*(p) == '=')) || ((*(p) == '\'')))

/*
 *    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
 */
#define ISA_GEN_DELIM(p)						\
      (((*(p) == ':')) || ((*(p) == '/')) || ((*(p) == '?')) ||         \
       ((*(p) == '#')) || ((*(p) == '[')) || ((*(p) == ']')) ||         \
       ((*(p) == '@')))

/*
 *    reserved      = gen-delims / sub-delims
 */
#define ISA_RESERVED(p) (ISA_GEN_DELIM(p) || (ISA_SUB_DELIM(p)))

/*
 *    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
 */
#define ISA_UNRESERVED(p)						\
      ((ISA_ALPHA(p)) || (ISA_DIGIT(p)) || ((*(p) == '-')) ||		\
       ((*(p) == '.')) || ((*(p) == '_')) || ((*(p) == '~')))

/*
 *    pct-encoded   = "%" HEXDIG HEXDIG
 *
 * The following bytes are only read once the previous one matched, so
 * the test never reads past a terminating NUL. The pointer argument is
 * parenthesized before the offset is added so that any pointer-valued
 * expression can be passed.
 */
#define ISA_PCT_ENCODED(p)						\
     ((*(p) == '%') && (ISA_HEXDIG((p) + 1)) && (ISA_HEXDIG((p) + 2)))

/*
 *    pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
 */
#define ISA_PCHAR(p)							\
     (ISA_UNRESERVED(p) || ISA_PCT_ENCODED(p) || ISA_SUB_DELIM(p) ||	\
      ((*(p) == ':')) || ((*(p) == '@')))
195 
196 /**
197  * xmlParse3986Scheme:
198  * @uri:  pointer to an URI structure
199  * @str:  pointer to the string to analyze
200  *
201  * Parse an URI scheme
202  *
203  * ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
204  *
205  * Returns 0 or the error code
206  */
207 static int
xmlParse3986Scheme(xmlURIPtr uri,const char ** str)208 xmlParse3986Scheme(xmlURIPtr uri, const char **str) {
209     const char *cur;
210 
211     if (str == NULL)
212 	return(-1);
213 
214     cur = *str;
215     if (!ISA_ALPHA(cur))
216 	return(2);
217     cur++;
218     while (ISA_ALPHA(cur) || ISA_DIGIT(cur) ||
219            (*cur == '+') || (*cur == '-') || (*cur == '.')) cur++;
220     if (uri != NULL) {
221 	if (uri->scheme != NULL) xmlFree(uri->scheme);
222 	uri->scheme = STRNDUP(*str, cur - *str);
223     }
224     *str = cur;
225     return(0);
226 }
227 
228 /**
229  * xmlParse3986Fragment:
230  * @uri:  pointer to an URI structure
231  * @str:  pointer to the string to analyze
232  *
233  * Parse the query part of an URI
234  *
235  * fragment      = *( pchar / "/" / "?" )
236  * NOTE: the strict syntax as defined by 3986 does not allow '[' and ']'
237  *       in the fragment identifier but this is used very broadly for
238  *       xpointer scheme selection, so we are allowing it here to not break
239  *       for example all the DocBook processing chains.
240  *
241  * Returns 0 or the error code
242  */
243 static int
xmlParse3986Fragment(xmlURIPtr uri,const char ** str)244 xmlParse3986Fragment(xmlURIPtr uri, const char **str)
245 {
246     const char *cur;
247 
248     if (str == NULL)
249         return (-1);
250 
251     cur = *str;
252 
253     while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') ||
254            (*cur == '[') || (*cur == ']') ||
255            ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur))))
256         NEXT(cur);
257     if (uri != NULL) {
258         if (uri->fragment != NULL)
259             xmlFree(uri->fragment);
260 	if (uri->cleanup & 2)
261 	    uri->fragment = STRNDUP(*str, cur - *str);
262 	else
263 	    uri->fragment = xmlURIUnescapeString(*str, cur - *str, NULL);
264     }
265     *str = cur;
266     return (0);
267 }
268 
269 /**
270  * xmlParse3986Query:
271  * @uri:  pointer to an URI structure
272  * @str:  pointer to the string to analyze
273  *
274  * Parse the query part of an URI
275  *
276  * query = *uric
277  *
278  * Returns 0 or the error code
279  */
280 static int
xmlParse3986Query(xmlURIPtr uri,const char ** str)281 xmlParse3986Query(xmlURIPtr uri, const char **str)
282 {
283     const char *cur;
284 
285     if (str == NULL)
286         return (-1);
287 
288     cur = *str;
289 
290     while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') ||
291            ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur))))
292         NEXT(cur);
293     if (uri != NULL) {
294         if (uri->query != NULL)
295             xmlFree(uri->query);
296 	if (uri->cleanup & 2)
297 	    uri->query = STRNDUP(*str, cur - *str);
298 	else
299 	    uri->query = xmlURIUnescapeString(*str, cur - *str, NULL);
300 
301 	/* Save the raw bytes of the query as well.
302 	 * See: http://mail.gnome.org/archives/xml/2007-April/thread.html#00114
303 	 */
304 	if (uri->query_raw != NULL)
305 	    xmlFree (uri->query_raw);
306 	uri->query_raw = STRNDUP (*str, cur - *str);
307     }
308     *str = cur;
309     return (0);
310 }
311 
312 /**
313  * xmlParse3986Port:
314  * @uri:  pointer to an URI structure
315  * @str:  the string to analyze
316  *
317  * Parse a port  part and fills in the appropriate fields
318  * of the @uri structure
319  *
320  * port          = *DIGIT
321  *
322  * Returns 0 or the error code
323  */
324 static int
xmlParse3986Port(xmlURIPtr uri,const char ** str)325 xmlParse3986Port(xmlURIPtr uri, const char **str)
326 {
327     const char *cur = *str;
328 
329     if (ISA_DIGIT(cur)) {
330 	if (uri != NULL)
331 	    uri->port = 0;
332 	while (ISA_DIGIT(cur)) {
333 	    if (uri != NULL)
334 		uri->port = uri->port * 10 + (*cur - '0');
335 	    cur++;
336 	}
337 	*str = cur;
338 	return(0);
339     }
340     return(1);
341 }
342 
343 /**
344  * xmlParse3986Userinfo:
345  * @uri:  pointer to an URI structure
346  * @str:  the string to analyze
347  *
348  * Parse an user informations part and fills in the appropriate fields
349  * of the @uri structure
350  *
351  * userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
352  *
353  * Returns 0 or the error code
354  */
355 static int
xmlParse3986Userinfo(xmlURIPtr uri,const char ** str)356 xmlParse3986Userinfo(xmlURIPtr uri, const char **str)
357 {
358     const char *cur;
359 
360     cur = *str;
361     while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) ||
362            ISA_SUB_DELIM(cur) || (*cur == ':'))
363 	NEXT(cur);
364     if (*cur == '@') {
365 	if (uri != NULL) {
366 	    if (uri->user != NULL) xmlFree(uri->user);
367 	    if (uri->cleanup & 2)
368 		uri->user = STRNDUP(*str, cur - *str);
369 	    else
370 		uri->user = xmlURIUnescapeString(*str, cur - *str, NULL);
371 	}
372 	*str = cur;
373 	return(0);
374     }
375     return(1);
376 }
377 
/**
 * xmlParse3986DecOctet:
 * @str:  the string to analyze
 *
 *    dec-octet     = DIGIT                 ; 0-9
 *                  / %x31-39 DIGIT         ; 10-99
 *                  / "1" 2DIGIT            ; 100-199
 *                  / "2" %x30-34 DIGIT     ; 200-249
 *                  / "25" %x30-35          ; 250-255
 *
 * Skip a dec-octet. At most three digits are consumed; *@str is only
 * updated on success. Values 256-259 are rejected: the last digit of the
 * "25x" form is range-checked against "0".."5".
 *
 * Returns 0 if found and skipped, 1 otherwise
 */
static int
xmlParse3986DecOctet(const char **str)
{
    const char *cur = *str;
    int second_is_digit;
    int third_is_digit;

    if ((cur[0] < '0') || (cur[0] > '9'))
        return(1);

    /* cur[0] is a digit, hence not the terminator: cur[1] can be read. */
    second_is_digit = (cur[1] >= '0') && (cur[1] <= '9');
    /* cur[2] is only read once cur[1] is known to be a digit. */
    third_is_digit = second_is_digit && (cur[2] >= '0') && (cur[2] <= '9');

    if (!second_is_digit)
        cur += 1;                               /* 0-9 */
    else if ((cur[0] != '0') && !third_is_digit)
        cur += 2;                               /* 10-99 */
    else if ((cur[0] == '1') && third_is_digit)
        cur += 3;                               /* 100-199 */
    else if ((cur[0] == '2') && (cur[1] <= '4') && third_is_digit)
        cur += 3;                               /* 200-249 */
    else if ((cur[0] == '2') && (cur[1] == '5') &&
             third_is_digit && (cur[2] <= '5'))
        cur += 3;                               /* 250-255 */
    else
        return(1);

    *str = cur;
    return(0);
}
415 /**
416  * xmlParse3986Host:
417  * @uri:  pointer to an URI structure
418  * @str:  the string to analyze
419  *
420  * Parse an host part and fills in the appropriate fields
421  * of the @uri structure
422  *
423  * host          = IP-literal / IPv4address / reg-name
424  * IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"
425  * IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
426  * reg-name      = *( unreserved / pct-encoded / sub-delims )
427  *
428  * Returns 0 or the error code
429  */
430 static int
xmlParse3986Host(xmlURIPtr uri,const char ** str)431 xmlParse3986Host(xmlURIPtr uri, const char **str)
432 {
433     const char *cur = *str;
434     const char *host;
435 
436     host = cur;
437     /*
438      * IPv6 and future adressing scheme are enclosed between brackets
439      */
440     if (*cur == '[') {
441         cur++;
442 	while ((*cur != ']') && (*cur != 0))
443 	    cur++;
444 	if (*cur != ']')
445 	    return(1);
446 	cur++;
447 	goto found;
448     }
449     /*
450      * try to parse an IPv4
451      */
452     if (ISA_DIGIT(cur)) {
453         if (xmlParse3986DecOctet(&cur) != 0)
454 	    goto not_ipv4;
455 	if (*cur != '.')
456 	    goto not_ipv4;
457 	cur++;
458         if (xmlParse3986DecOctet(&cur) != 0)
459 	    goto not_ipv4;
460 	if (*cur != '.')
461 	    goto not_ipv4;
462         if (xmlParse3986DecOctet(&cur) != 0)
463 	    goto not_ipv4;
464 	if (*cur != '.')
465 	    goto not_ipv4;
466         if (xmlParse3986DecOctet(&cur) != 0)
467 	    goto not_ipv4;
468 	goto found;
469 not_ipv4:
470         cur = *str;
471     }
472     /*
473      * then this should be a hostname which can be empty
474      */
475     while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) || ISA_SUB_DELIM(cur))
476         NEXT(cur);
477 found:
478     if (uri != NULL) {
479 	if (uri->authority != NULL) xmlFree(uri->authority);
480 	uri->authority = NULL;
481 	if (uri->server != NULL) xmlFree(uri->server);
482 	if (cur != host) {
483 	    if (uri->cleanup & 2)
484 		uri->server = STRNDUP(host, cur - host);
485 	    else
486 		uri->server = xmlURIUnescapeString(host, cur - host, NULL);
487 	} else
488 	    uri->server = NULL;
489     }
490     *str = cur;
491     return(0);
492 }
493 
494 /**
495  * xmlParse3986Authority:
496  * @uri:  pointer to an URI structure
497  * @str:  the string to analyze
498  *
499  * Parse an authority part and fills in the appropriate fields
500  * of the @uri structure
501  *
502  * authority     = [ userinfo "@" ] host [ ":" port ]
503  *
504  * Returns 0 or the error code
505  */
506 static int
xmlParse3986Authority(xmlURIPtr uri,const char ** str)507 xmlParse3986Authority(xmlURIPtr uri, const char **str)
508 {
509     const char *cur;
510     int ret;
511 
512     cur = *str;
513     /*
514      * try to parse an userinfo and check for the trailing @
515      */
516     ret = xmlParse3986Userinfo(uri, &cur);
517     if ((ret != 0) || (*cur != '@'))
518         cur = *str;
519     else
520         cur++;
521     ret = xmlParse3986Host(uri, &cur);
522     if (ret != 0) return(ret);
523     if (*cur == ':') {
524         cur++;
525         ret = xmlParse3986Port(uri, &cur);
526 	if (ret != 0) return(ret);
527     }
528     *str = cur;
529     return(0);
530 }
531 
/**
 * xmlParse3986Segment:
 * @str:  the string to analyze
 * @forbid: an optional forbidden character, 0 for none
 * @empty: allow an empty segment
 *
 * Skip a path segment, moving *@str past it. Nothing is stored.
 *
 * segment       = *pchar
 * segment-nz    = 1*pchar
 * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
 *               ; non-zero-length segment without any colon ":"
 *
 * Scanning stops at the first character which is not a pchar or which
 * equals @forbid.
 *
 * Returns 0 on success, 1 if the first character is not a pchar and
 * @empty is zero
 */
static int
xmlParse3986Segment(const char **str, char forbid, int empty)
{
    const char *cur = *str;

    if (!ISA_PCHAR(cur))
        return empty ? 0 : 1;

    while (ISA_PCHAR(cur)) {
        if (*cur == forbid)
            break;
        NEXT(cur);
    }
    *str = cur;
    return 0;
}
564 
565 /**
566  * xmlParse3986PathAbEmpty:
567  * @uri:  pointer to an URI structure
568  * @str:  the string to analyze
569  *
570  * Parse an path absolute or empty and fills in the appropriate fields
571  * of the @uri structure
572  *
573  * path-abempty  = *( "/" segment )
574  *
575  * Returns 0 or the error code
576  */
577 static int
xmlParse3986PathAbEmpty(xmlURIPtr uri,const char ** str)578 xmlParse3986PathAbEmpty(xmlURIPtr uri, const char **str)
579 {
580     const char *cur;
581     int ret;
582 
583     cur = *str;
584 
585     while (*cur == '/') {
586         cur++;
587 	ret = xmlParse3986Segment(&cur, 0, 1);
588 	if (ret != 0) return(ret);
589     }
590     if (uri != NULL) {
591 	if (uri->path != NULL) xmlFree(uri->path);
592         if (*str != cur) {
593             if (uri->cleanup & 2)
594                 uri->path = STRNDUP(*str, cur - *str);
595             else
596                 uri->path = xmlURIUnescapeString(*str, cur - *str, NULL);
597         } else {
598             uri->path = NULL;
599         }
600     }
601     *str = cur;
602     return (0);
603 }
604 
605 /**
606  * xmlParse3986PathAbsolute:
607  * @uri:  pointer to an URI structure
608  * @str:  the string to analyze
609  *
610  * Parse an path absolute and fills in the appropriate fields
611  * of the @uri structure
612  *
613  * path-absolute = "/" [ segment-nz *( "/" segment ) ]
614  *
615  * Returns 0 or the error code
616  */
617 static int
xmlParse3986PathAbsolute(xmlURIPtr uri,const char ** str)618 xmlParse3986PathAbsolute(xmlURIPtr uri, const char **str)
619 {
620     const char *cur;
621     int ret;
622 
623     cur = *str;
624 
625     if (*cur != '/')
626         return(1);
627     cur++;
628     ret = xmlParse3986Segment(&cur, 0, 0);
629     if (ret == 0) {
630 	while (*cur == '/') {
631 	    cur++;
632 	    ret = xmlParse3986Segment(&cur, 0, 1);
633 	    if (ret != 0) return(ret);
634 	}
635     }
636     if (uri != NULL) {
637 	if (uri->path != NULL) xmlFree(uri->path);
638         if (cur != *str) {
639             if (uri->cleanup & 2)
640                 uri->path = STRNDUP(*str, cur - *str);
641             else
642                 uri->path = xmlURIUnescapeString(*str, cur - *str, NULL);
643         } else {
644             uri->path = NULL;
645         }
646     }
647     *str = cur;
648     return (0);
649 }
650 
651 /**
652  * xmlParse3986PathRootless:
653  * @uri:  pointer to an URI structure
654  * @str:  the string to analyze
655  *
656  * Parse an path without root and fills in the appropriate fields
657  * of the @uri structure
658  *
659  * path-rootless = segment-nz *( "/" segment )
660  *
661  * Returns 0 or the error code
662  */
663 static int
xmlParse3986PathRootless(xmlURIPtr uri,const char ** str)664 xmlParse3986PathRootless(xmlURIPtr uri, const char **str)
665 {
666     const char *cur;
667     int ret;
668 
669     cur = *str;
670 
671     ret = xmlParse3986Segment(&cur, 0, 0);
672     if (ret != 0) return(ret);
673     while (*cur == '/') {
674         cur++;
675 	ret = xmlParse3986Segment(&cur, 0, 1);
676 	if (ret != 0) return(ret);
677     }
678     if (uri != NULL) {
679 	if (uri->path != NULL) xmlFree(uri->path);
680         if (cur != *str) {
681             if (uri->cleanup & 2)
682                 uri->path = STRNDUP(*str, cur - *str);
683             else
684                 uri->path = xmlURIUnescapeString(*str, cur - *str, NULL);
685         } else {
686             uri->path = NULL;
687         }
688     }
689     *str = cur;
690     return (0);
691 }
692 
693 /**
694  * xmlParse3986PathNoScheme:
695  * @uri:  pointer to an URI structure
696  * @str:  the string to analyze
697  *
698  * Parse an path which is not a scheme and fills in the appropriate fields
699  * of the @uri structure
700  *
701  * path-noscheme = segment-nz-nc *( "/" segment )
702  *
703  * Returns 0 or the error code
704  */
705 static int
xmlParse3986PathNoScheme(xmlURIPtr uri,const char ** str)706 xmlParse3986PathNoScheme(xmlURIPtr uri, const char **str)
707 {
708     const char *cur;
709     int ret;
710 
711     cur = *str;
712 
713     ret = xmlParse3986Segment(&cur, ':', 0);
714     if (ret != 0) return(ret);
715     while (*cur == '/') {
716         cur++;
717 	ret = xmlParse3986Segment(&cur, 0, 1);
718 	if (ret != 0) return(ret);
719     }
720     if (uri != NULL) {
721 	if (uri->path != NULL) xmlFree(uri->path);
722         if (cur != *str) {
723             if (uri->cleanup & 2)
724                 uri->path = STRNDUP(*str, cur - *str);
725             else
726                 uri->path = xmlURIUnescapeString(*str, cur - *str, NULL);
727         } else {
728             uri->path = NULL;
729         }
730     }
731     *str = cur;
732     return (0);
733 }
734 
735 /**
736  * xmlParse3986HierPart:
737  * @uri:  pointer to an URI structure
738  * @str:  the string to analyze
739  *
740  * Parse an hierarchical part and fills in the appropriate fields
741  * of the @uri structure
742  *
743  * hier-part     = "//" authority path-abempty
744  *                / path-absolute
745  *                / path-rootless
746  *                / path-empty
747  *
748  * Returns 0 or the error code
749  */
750 static int
xmlParse3986HierPart(xmlURIPtr uri,const char ** str)751 xmlParse3986HierPart(xmlURIPtr uri, const char **str)
752 {
753     const char *cur;
754     int ret;
755 
756     cur = *str;
757 
758     if ((*cur == '/') && (*(cur + 1) == '/')) {
759         cur += 2;
760 	ret = xmlParse3986Authority(uri, &cur);
761 	if (ret != 0) return(ret);
762 	ret = xmlParse3986PathAbEmpty(uri, &cur);
763 	if (ret != 0) return(ret);
764 	*str = cur;
765 	return(0);
766     } else if (*cur == '/') {
767         ret = xmlParse3986PathAbsolute(uri, &cur);
768 	if (ret != 0) return(ret);
769     } else if (ISA_PCHAR(cur)) {
770         ret = xmlParse3986PathRootless(uri, &cur);
771 	if (ret != 0) return(ret);
772     } else {
773 	/* path-empty is effectively empty */
774 	if (uri != NULL) {
775 	    if (uri->path != NULL) xmlFree(uri->path);
776 	    uri->path = NULL;
777 	}
778     }
779     *str = cur;
780     return (0);
781 }
782 
783 /**
784  * xmlParse3986RelativeRef:
785  * @uri:  pointer to an URI structure
786  * @str:  the string to analyze
787  *
788  * Parse an URI string and fills in the appropriate fields
789  * of the @uri structure
790  *
791  * relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
792  * relative-part = "//" authority path-abempty
793  *               / path-absolute
794  *               / path-noscheme
795  *               / path-empty
796  *
797  * Returns 0 or the error code
798  */
799 static int
xmlParse3986RelativeRef(xmlURIPtr uri,const char * str)800 xmlParse3986RelativeRef(xmlURIPtr uri, const char *str) {
801     int ret;
802 
803     if ((*str == '/') && (*(str + 1) == '/')) {
804         str += 2;
805 	ret = xmlParse3986Authority(uri, &str);
806 	if (ret != 0) return(ret);
807 	ret = xmlParse3986PathAbEmpty(uri, &str);
808 	if (ret != 0) return(ret);
809     } else if (*str == '/') {
810 	ret = xmlParse3986PathAbsolute(uri, &str);
811 	if (ret != 0) return(ret);
812     } else if (ISA_PCHAR(str)) {
813         ret = xmlParse3986PathNoScheme(uri, &str);
814 	if (ret != 0) return(ret);
815     } else {
816 	/* path-empty is effectively empty */
817 	if (uri != NULL) {
818 	    if (uri->path != NULL) xmlFree(uri->path);
819 	    uri->path = NULL;
820 	}
821     }
822 
823     if (*str == '?') {
824 	str++;
825 	ret = xmlParse3986Query(uri, &str);
826 	if (ret != 0) return(ret);
827     }
828     if (*str == '#') {
829 	str++;
830 	ret = xmlParse3986Fragment(uri, &str);
831 	if (ret != 0) return(ret);
832     }
833     if (*str != 0) {
834 	xmlCleanURI(uri);
835 	return(1);
836     }
837     return(0);
838 }
839 
840 
841 /**
842  * xmlParse3986URI:
843  * @uri:  pointer to an URI structure
844  * @str:  the string to analyze
845  *
846  * Parse an URI string and fills in the appropriate fields
847  * of the @uri structure
848  *
849  * scheme ":" hier-part [ "?" query ] [ "#" fragment ]
850  *
851  * Returns 0 or the error code
852  */
853 static int
xmlParse3986URI(xmlURIPtr uri,const char * str)854 xmlParse3986URI(xmlURIPtr uri, const char *str) {
855     int ret;
856 
857     ret = xmlParse3986Scheme(uri, &str);
858     if (ret != 0) return(ret);
859     if (*str != ':') {
860 	return(1);
861     }
862     str++;
863     ret = xmlParse3986HierPart(uri, &str);
864     if (ret != 0) return(ret);
865     if (*str == '?') {
866 	str++;
867 	ret = xmlParse3986Query(uri, &str);
868 	if (ret != 0) return(ret);
869     }
870     if (*str == '#') {
871 	str++;
872 	ret = xmlParse3986Fragment(uri, &str);
873 	if (ret != 0) return(ret);
874     }
875     if (*str != 0) {
876 	xmlCleanURI(uri);
877 	return(1);
878     }
879     return(0);
880 }
881 
882 /**
883  * xmlParse3986URIReference:
884  * @uri:  pointer to an URI structure
885  * @str:  the string to analyze
886  *
887  * Parse an URI reference string and fills in the appropriate fields
888  * of the @uri structure
889  *
890  * URI-reference = URI / relative-ref
891  *
892  * Returns 0 or the error code
893  */
894 static int
xmlParse3986URIReference(xmlURIPtr uri,const char * str)895 xmlParse3986URIReference(xmlURIPtr uri, const char *str) {
896     int ret;
897 
898     if (str == NULL)
899 	return(-1);
900     xmlCleanURI(uri);
901 
902     /*
903      * Try first to parse absolute refs, then fallback to relative if
904      * it fails.
905      */
906     ret = xmlParse3986URI(uri, str);
907     if (ret != 0) {
908 	xmlCleanURI(uri);
909         ret = xmlParse3986RelativeRef(uri, str);
910 	if (ret != 0) {
911 	    xmlCleanURI(uri);
912 	    return(ret);
913 	}
914     }
915     return(0);
916 }
917 
918 /**
919  * xmlParseURI:
920  * @str:  the URI string to analyze
921  *
922  * Parse an URI based on RFC 3986
923  *
924  * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
925  *
926  * Returns a newly built xmlURIPtr or NULL in case of error
927  */
928 xmlURIPtr
xmlParseURI(const char * str)929 xmlParseURI(const char *str) {
930     xmlURIPtr uri;
931     int ret;
932 
933     if (str == NULL)
934 	return(NULL);
935     uri = xmlCreateURI();
936     if (uri != NULL) {
937 	ret = xmlParse3986URIReference(uri, str);
938         if (ret) {
939 	    xmlFreeURI(uri);
940 	    return(NULL);
941 	}
942     }
943     return(uri);
944 }
945 
946 /**
947  * xmlParseURIReference:
948  * @uri:  pointer to an URI structure
949  * @str:  the string to analyze
950  *
951  * Parse an URI reference string based on RFC 3986 and fills in the
952  * appropriate fields of the @uri structure
953  *
954  * URI-reference = URI / relative-ref
955  *
956  * Returns 0 or the error code
957  */
958 int
xmlParseURIReference(xmlURIPtr uri,const char * str)959 xmlParseURIReference(xmlURIPtr uri, const char *str) {
960     return(xmlParse3986URIReference(uri, str));
961 }
962 
963 /**
964  * xmlParseURIRaw:
965  * @str:  the URI string to analyze
966  * @raw:  if 1 unescaping of URI pieces are disabled
967  *
968  * Parse an URI but allows to keep intact the original fragments.
969  *
970  * URI-reference = URI / relative-ref
971  *
972  * Returns a newly built xmlURIPtr or NULL in case of error
973  */
974 xmlURIPtr
xmlParseURIRaw(const char * str,int raw)975 xmlParseURIRaw(const char *str, int raw) {
976     xmlURIPtr uri;
977     int ret;
978 
979     if (str == NULL)
980 	return(NULL);
981     uri = xmlCreateURI();
982     if (uri != NULL) {
983         if (raw) {
984 	    uri->cleanup |= 2;
985 	}
986 	ret = xmlParseURIReference(uri, str);
987         if (ret) {
988 	    xmlFreeURI(uri);
989 	    return(NULL);
990 	}
991     }
992     return(uri);
993 }
994 
995 /************************************************************************
996  *									*
997  *			Generic URI structure functions			*
998  *									*
999  ************************************************************************/
1000 
1001 /**
1002  * xmlCreateURI:
1003  *
1004  * Simply creates an empty xmlURI
1005  *
1006  * Returns the new structure or NULL in case of error
1007  */
1008 xmlURIPtr
xmlCreateURI(void)1009 xmlCreateURI(void) {
1010     xmlURIPtr ret;
1011 
1012     ret = (xmlURIPtr) xmlMalloc(sizeof(xmlURI));
1013     if (ret == NULL) {
1014         xmlURIErrMemory("creating URI structure\n");
1015 	return(NULL);
1016     }
1017     memset(ret, 0, sizeof(xmlURI));
1018     return(ret);
1019 }
1020 
1021 /**
1022  * xmlSaveUriRealloc:
1023  *
1024  * Function to handle properly a reallocation when saving an URI
1025  * Also imposes some limit on the length of an URI string output
1026  */
1027 static xmlChar *
xmlSaveUriRealloc(xmlChar * ret,int * max)1028 xmlSaveUriRealloc(xmlChar *ret, int *max) {
1029     xmlChar *temp;
1030     int tmp;
1031 
1032     if (*max > MAX_URI_LENGTH) {
1033         xmlURIErrMemory("reaching arbitrary MAX_URI_LENGTH limit\n");
1034         return(NULL);
1035     }
1036     tmp = *max * 2;
1037     temp = (xmlChar *) xmlRealloc(ret, (tmp + 1));
1038     if (temp == NULL) {
1039         xmlURIErrMemory("saving URI\n");
1040         return(NULL);
1041     }
1042     *max = tmp;
1043     return(temp);
1044 }
1045 
1046 /**
1047  * xmlSaveUri:
1048  * @uri:  pointer to an xmlURI
1049  *
1050  * Save the URI as an escaped string
1051  *
1052  * Returns a new string (to be deallocated by caller)
1053  */
1054 xmlChar *
xmlSaveUri(xmlURIPtr uri)1055 xmlSaveUri(xmlURIPtr uri) {
1056     xmlChar *ret = NULL;
1057     xmlChar *temp;
1058     const char *p;
1059     int len;
1060     int max;
1061 
1062     if (uri == NULL) return(NULL);
1063 
1064 
1065     max = 80;
1066     ret = (xmlChar *) xmlMallocAtomic((max + 1) * sizeof(xmlChar));
1067     if (ret == NULL) {
1068         xmlURIErrMemory("saving URI\n");
1069 	return(NULL);
1070     }
1071     len = 0;
1072 
1073     if (uri->scheme != NULL) {
1074 	p = uri->scheme;
1075 	while (*p != 0) {
1076 	    if (len >= max) {
1077                 temp = xmlSaveUriRealloc(ret, &max);
1078                 if (temp == NULL) goto mem_error;
1079 		ret = temp;
1080 	    }
1081 	    ret[len++] = *p++;
1082 	}
1083 	if (len >= max) {
1084             temp = xmlSaveUriRealloc(ret, &max);
1085             if (temp == NULL) goto mem_error;
1086             ret = temp;
1087 	}
1088 	ret[len++] = ':';
1089     }
1090     if (uri->opaque != NULL) {
1091 	p = uri->opaque;
1092 	while (*p != 0) {
1093 	    if (len + 3 >= max) {
1094                 temp = xmlSaveUriRealloc(ret, &max);
1095                 if (temp == NULL) goto mem_error;
1096                 ret = temp;
1097 	    }
1098 	    if (IS_RESERVED(*(p)) || IS_UNRESERVED(*(p)))
1099 		ret[len++] = *p++;
1100 	    else {
1101 		int val = *(unsigned char *)p++;
1102 		int hi = val / 0x10, lo = val % 0x10;
1103 		ret[len++] = '%';
1104 		ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1105 		ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1106 	    }
1107 	}
1108     } else {
1109 	if (uri->server != NULL) {
1110 	    if (len + 3 >= max) {
1111                 temp = xmlSaveUriRealloc(ret, &max);
1112                 if (temp == NULL) goto mem_error;
1113                 ret = temp;
1114 	    }
1115 	    ret[len++] = '/';
1116 	    ret[len++] = '/';
1117 	    if (uri->user != NULL) {
1118 		p = uri->user;
1119 		while (*p != 0) {
1120 		    if (len + 3 >= max) {
1121                         temp = xmlSaveUriRealloc(ret, &max);
1122                         if (temp == NULL) goto mem_error;
1123                         ret = temp;
1124 		    }
1125 		    if ((IS_UNRESERVED(*(p))) ||
1126 			((*(p) == ';')) || ((*(p) == ':')) ||
1127 			((*(p) == '&')) || ((*(p) == '=')) ||
1128 			((*(p) == '+')) || ((*(p) == '$')) ||
1129 			((*(p) == ',')))
1130 			ret[len++] = *p++;
1131 		    else {
1132 			int val = *(unsigned char *)p++;
1133 			int hi = val / 0x10, lo = val % 0x10;
1134 			ret[len++] = '%';
1135 			ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1136 			ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1137 		    }
1138 		}
1139 		if (len + 3 >= max) {
1140                     temp = xmlSaveUriRealloc(ret, &max);
1141                     if (temp == NULL) goto mem_error;
1142                     ret = temp;
1143 		}
1144 		ret[len++] = '@';
1145 	    }
1146 	    p = uri->server;
1147 	    while (*p != 0) {
1148 		if (len >= max) {
1149                     temp = xmlSaveUriRealloc(ret, &max);
1150                     if (temp == NULL) goto mem_error;
1151                     ret = temp;
1152 		}
1153 		ret[len++] = *p++;
1154 	    }
1155 	    if (uri->port > 0) {
1156 		if (len + 10 >= max) {
1157                     temp = xmlSaveUriRealloc(ret, &max);
1158                     if (temp == NULL) goto mem_error;
1159                     ret = temp;
1160 		}
1161 		len += snprintf((char *) &ret[len], max - len, ":%d", uri->port);
1162 	    }
1163 	} else if (uri->authority != NULL) {
1164 	    if (len + 3 >= max) {
1165                 temp = xmlSaveUriRealloc(ret, &max);
1166                 if (temp == NULL) goto mem_error;
1167                 ret = temp;
1168 	    }
1169 	    ret[len++] = '/';
1170 	    ret[len++] = '/';
1171 	    p = uri->authority;
1172 	    while (*p != 0) {
1173 		if (len + 3 >= max) {
1174                     temp = xmlSaveUriRealloc(ret, &max);
1175                     if (temp == NULL) goto mem_error;
1176                     ret = temp;
1177 		}
1178 		if ((IS_UNRESERVED(*(p))) ||
1179                     ((*(p) == '$')) || ((*(p) == ',')) || ((*(p) == ';')) ||
1180                     ((*(p) == ':')) || ((*(p) == '@')) || ((*(p) == '&')) ||
1181                     ((*(p) == '=')) || ((*(p) == '+')))
1182 		    ret[len++] = *p++;
1183 		else {
1184 		    int val = *(unsigned char *)p++;
1185 		    int hi = val / 0x10, lo = val % 0x10;
1186 		    ret[len++] = '%';
1187 		    ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1188 		    ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1189 		}
1190 	    }
1191 	} else if (uri->scheme != NULL) {
1192 	    if (len + 3 >= max) {
1193                 temp = xmlSaveUriRealloc(ret, &max);
1194                 if (temp == NULL) goto mem_error;
1195                 ret = temp;
1196 	    }
1197 	    ret[len++] = '/';
1198 	    ret[len++] = '/';
1199 	}
1200 	if (uri->path != NULL) {
1201 	    p = uri->path;
1202 	    /*
1203 	     * the colon in file:///d: should not be escaped or
1204 	     * Windows accesses fail later.
1205 	     */
1206 	    if ((uri->scheme != NULL) &&
1207 		(p[0] == '/') &&
1208 		(((p[1] >= 'a') && (p[1] <= 'z')) ||
1209 		 ((p[1] >= 'A') && (p[1] <= 'Z'))) &&
1210 		(p[2] == ':') &&
1211 	        (xmlStrEqual(BAD_CAST uri->scheme, BAD_CAST "file"))) {
1212 		if (len + 3 >= max) {
1213                     temp = xmlSaveUriRealloc(ret, &max);
1214                     if (temp == NULL) goto mem_error;
1215                     ret = temp;
1216 		}
1217 		ret[len++] = *p++;
1218 		ret[len++] = *p++;
1219 		ret[len++] = *p++;
1220 	    }
1221 	    while (*p != 0) {
1222 		if (len + 3 >= max) {
1223                     temp = xmlSaveUriRealloc(ret, &max);
1224                     if (temp == NULL) goto mem_error;
1225                     ret = temp;
1226 		}
1227 		if ((IS_UNRESERVED(*(p))) || ((*(p) == '/')) ||
1228                     ((*(p) == ';')) || ((*(p) == '@')) || ((*(p) == '&')) ||
1229 	            ((*(p) == '=')) || ((*(p) == '+')) || ((*(p) == '$')) ||
1230 	            ((*(p) == ',')))
1231 		    ret[len++] = *p++;
1232 		else {
1233 		    int val = *(unsigned char *)p++;
1234 		    int hi = val / 0x10, lo = val % 0x10;
1235 		    ret[len++] = '%';
1236 		    ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1237 		    ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1238 		}
1239 	    }
1240 	}
1241 	if (uri->query_raw != NULL) {
1242 	    if (len + 1 >= max) {
1243                 temp = xmlSaveUriRealloc(ret, &max);
1244                 if (temp == NULL) goto mem_error;
1245                 ret = temp;
1246 	    }
1247 	    ret[len++] = '?';
1248 	    p = uri->query_raw;
1249 	    while (*p != 0) {
1250 		if (len + 1 >= max) {
1251                     temp = xmlSaveUriRealloc(ret, &max);
1252                     if (temp == NULL) goto mem_error;
1253                     ret = temp;
1254 		}
1255 		ret[len++] = *p++;
1256 	    }
1257 	} else if (uri->query != NULL) {
1258 	    if (len + 3 >= max) {
1259                 temp = xmlSaveUriRealloc(ret, &max);
1260                 if (temp == NULL) goto mem_error;
1261                 ret = temp;
1262 	    }
1263 	    ret[len++] = '?';
1264 	    p = uri->query;
1265 	    while (*p != 0) {
1266 		if (len + 3 >= max) {
1267                     temp = xmlSaveUriRealloc(ret, &max);
1268                     if (temp == NULL) goto mem_error;
1269                     ret = temp;
1270 		}
1271 		if ((IS_UNRESERVED(*(p))) || (IS_RESERVED(*(p))))
1272 		    ret[len++] = *p++;
1273 		else {
1274 		    int val = *(unsigned char *)p++;
1275 		    int hi = val / 0x10, lo = val % 0x10;
1276 		    ret[len++] = '%';
1277 		    ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1278 		    ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1279 		}
1280 	    }
1281 	}
1282     }
1283     if (uri->fragment != NULL) {
1284 	if (len + 3 >= max) {
1285             temp = xmlSaveUriRealloc(ret, &max);
1286             if (temp == NULL) goto mem_error;
1287             ret = temp;
1288 	}
1289 	ret[len++] = '#';
1290 	p = uri->fragment;
1291 	while (*p != 0) {
1292 	    if (len + 3 >= max) {
1293                 temp = xmlSaveUriRealloc(ret, &max);
1294                 if (temp == NULL) goto mem_error;
1295                 ret = temp;
1296 	    }
1297 	    if ((IS_UNRESERVED(*(p))) || (IS_RESERVED(*(p))))
1298 		ret[len++] = *p++;
1299 	    else {
1300 		int val = *(unsigned char *)p++;
1301 		int hi = val / 0x10, lo = val % 0x10;
1302 		ret[len++] = '%';
1303 		ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1304 		ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1305 	    }
1306 	}
1307     }
1308     if (len >= max) {
1309         temp = xmlSaveUriRealloc(ret, &max);
1310         if (temp == NULL) goto mem_error;
1311         ret = temp;
1312     }
1313     ret[len] = 0;
1314     return(ret);
1315 
1316 mem_error:
1317     xmlFree(ret);
1318     return(NULL);
1319 }
1320 
1321 /**
1322  * xmlPrintURI:
1323  * @stream:  a FILE* for the output
1324  * @uri:  pointer to an xmlURI
1325  *
1326  * Prints the URI in the stream @stream.
1327  */
1328 void
xmlPrintURI(FILE * stream,xmlURIPtr uri)1329 xmlPrintURI(FILE *stream, xmlURIPtr uri) {
1330     xmlChar *out;
1331 
1332     out = xmlSaveUri(uri);
1333     if (out != NULL) {
1334 	fprintf(stream, "%s", (char *) out);
1335 	xmlFree(out);
1336     }
1337 }
1338 
1339 /**
1340  * xmlCleanURI:
1341  * @uri:  pointer to an xmlURI
1342  *
1343  * Make sure the xmlURI struct is free of content
1344  */
1345 static void
xmlCleanURI(xmlURIPtr uri)1346 xmlCleanURI(xmlURIPtr uri) {
1347     if (uri == NULL) return;
1348 
1349     if (uri->scheme != NULL) xmlFree(uri->scheme);
1350     uri->scheme = NULL;
1351     if (uri->server != NULL) xmlFree(uri->server);
1352     uri->server = NULL;
1353     if (uri->user != NULL) xmlFree(uri->user);
1354     uri->user = NULL;
1355     if (uri->path != NULL) xmlFree(uri->path);
1356     uri->path = NULL;
1357     if (uri->fragment != NULL) xmlFree(uri->fragment);
1358     uri->fragment = NULL;
1359     if (uri->opaque != NULL) xmlFree(uri->opaque);
1360     uri->opaque = NULL;
1361     if (uri->authority != NULL) xmlFree(uri->authority);
1362     uri->authority = NULL;
1363     if (uri->query != NULL) xmlFree(uri->query);
1364     uri->query = NULL;
1365     if (uri->query_raw != NULL) xmlFree(uri->query_raw);
1366     uri->query_raw = NULL;
1367 }
1368 
1369 /**
1370  * xmlFreeURI:
1371  * @uri:  pointer to an xmlURI
1372  *
1373  * Free up the xmlURI struct
1374  */
1375 void
xmlFreeURI(xmlURIPtr uri)1376 xmlFreeURI(xmlURIPtr uri) {
1377     if (uri == NULL) return;
1378 
1379     if (uri->scheme != NULL) xmlFree(uri->scheme);
1380     if (uri->server != NULL) xmlFree(uri->server);
1381     if (uri->user != NULL) xmlFree(uri->user);
1382     if (uri->path != NULL) xmlFree(uri->path);
1383     if (uri->fragment != NULL) xmlFree(uri->fragment);
1384     if (uri->opaque != NULL) xmlFree(uri->opaque);
1385     if (uri->authority != NULL) xmlFree(uri->authority);
1386     if (uri->query != NULL) xmlFree(uri->query);
1387     if (uri->query_raw != NULL) xmlFree(uri->query_raw);
1388     xmlFree(uri);
1389 }
1390 
1391 /************************************************************************
1392  *									*
1393  *			Helper functions				*
1394  *									*
1395  ************************************************************************/
1396 
/**
 * xmlNormalizeURIPath:
 * @path:  pointer to the path string
 *
 * Applies the 5 normalization steps to a path string--that is, RFC 2396
 * Section 5.2, steps 6.c through 6.g. Runs of '/' inside the path are
 * collapsed to a single '/' as well; leading '/' characters are kept.
 *
 * Normalization occurs directly on the string, no new allocation is done.
 * The result is never longer than the input.
 *
 * Returns 0 on success, -1 if @path is NULL
 */
int
xmlNormalizeURIPath(char *path) {
    char *cur, *out;

    if (path == NULL)
        return(-1);

    /* Skip all initial "/" chars.  We want to get to the beginning of the
     * first non-empty segment.
     */
    cur = path;
    while (cur[0] == '/')
        ++cur;
    if (cur[0] == '\0')
        return(0);

    /* Keep everything we've seen so far.  */
    out = cur;

    /*
     * Analyze each segment in sequence for cases (c) and (d).
     */
    while (cur[0] != '\0') {
        /*
         * c) All occurrences of "./", where "." is a complete path segment,
         *    are removed from the buffer string.
         */
        if ((cur[0] == '.') && (cur[1] == '/')) {
            cur += 2;
            /* '//' normalization should be done at this point too */
            while (cur[0] == '/')
                cur++;
            continue;
        }

        /*
         * d) If the buffer string ends with "." as a complete path segment,
         *    that "." is removed.
         */
        if ((cur[0] == '.') && (cur[1] == '\0'))
            break;

        /* Otherwise keep the segment.  */
        while (cur[0] != '/') {
            if (cur[0] == '\0')
                goto done_cd;
            (out++)[0] = (cur++)[0];
        }
        /* normalize // : skip all but the last '/' of a run */
        while ((cur[0] == '/') && (cur[1] == '/'))
            cur++;

        /* copy the single remaining '/' */
        (out++)[0] = (cur++)[0];
    }
 done_cd:
    out[0] = '\0';

    /* Reset to the beginning of the first segment for the next sequence.  */
    cur = path;
    while (cur[0] == '/')
        ++cur;
    if (cur[0] == '\0')
        return(0);

    /*
     * Analyze each segment in sequence for cases (e) and (f).
     *
     * e) All occurrences of "<segment>/../", where <segment> is a
     *    complete path segment not equal to "..", are removed from the
     *    buffer string.  Removal of these path segments is performed
     *    iteratively, removing the leftmost matching pattern on each
     *    iteration, until no matching pattern remains.
     *
     * f) If the buffer string ends with "<segment>/..", where <segment>
     *    is a complete path segment not equal to "..", that
     *    "<segment>/.." is removed.
     *
     * To satisfy the "iterative" clause in (e), we need to collapse the
     * string every time we find something that needs to be removed.  Thus,
     * we don't need to keep two pointers into the string: we only need a
     * "current position" pointer.
     */
    while (1) {
        char *segp, *tmp;

        /* At the beginning of each iteration of this loop, "cur" points to
         * the first character of the segment we want to examine.
         */

        /* Find the end of the current segment.  */
        segp = cur;
        while ((segp[0] != '/') && (segp[0] != '\0'))
            ++segp;

        /* If this is the last segment, we're done (we need at least two
         * segments to meet the criteria for the (e) and (f) cases).
         */
        if (segp[0] == '\0')
            break;

        /* If the first segment is "..", or if the next segment _isn't_ "..",
         * keep this segment and try the next one.
         */
        ++segp;
        if (((cur[0] == '.') && (cur[1] == '.') && (segp == cur+3))
            || ((segp[0] != '.') || (segp[1] != '.')
                || ((segp[2] != '/') && (segp[2] != '\0')))) {
            cur = segp;
            continue;
        }

        /* If we get here, remove this segment and the next one and back up
         * to the previous segment (if there is one), to implement the
         * "iteratively" clause.  It's pretty much impossible to back up
         * while maintaining two pointers into the buffer, so just compact
         * the whole buffer now.
         */

        /* If this is the end of the buffer, we're done.  */
        if (segp[2] == '\0') {
            cur[0] = '\0';
            break;
        }
        /* The source and destination overlap, so copy by hand instead of
         * using strcpy(cur, segp + 3).
         */
        tmp = cur;
        segp += 3;
        while ((*tmp++ = *segp++) != 0)
            ;

        /* Step back over the '/' separating "cur" from what precedes it.
         * If only slashes (or nothing at all) precede "cur" there is no
         * previous segment, so keep going from here.
         */
        segp = cur;
        while ((segp > path) && (segp[-1] == '/'))
            --segp;
        if (segp == path)
            continue;

        /* segp[-1] is the last character of the previous segment; find
         * its start.  We need to back up to the previous segment and start
         * over with that to handle things like "foo/bar/../..".  If we
         * don't do this, then on the first pass we'll remove the "bar/..",
         * but be pointing at the second ".." so we won't realize we can also
         * remove the "foo/..".  Testing segp[-1] rather than comparing the
         * decremented pointer with "path" makes this work as well when the
         * previous segment is a single character at the very start of a
         * relative path ("a/b/../../c").
         */
        cur = segp - 1;
        while ((cur > path) && (cur[-1] != '/'))
            --cur;
    }

    /*
     * g) If the resulting buffer string still begins with one or more
     *    complete path segments of "..", then the reference is
     *    considered to be in error. Implementations may handle this
     *    error by retaining these components in the resolved path (i.e.,
     *    treating them as part of the final URI), by removing them from
     *    the resolved path (i.e., discarding relative levels above the
     *    root), or by avoiding traversal of the reference.
     *
     * We discard them from the final path (absolute paths only).
     */
    if (path[0] == '/') {
        cur = path;
        while ((cur[0] == '/') && (cur[1] == '.') && (cur[2] == '.')
               && ((cur[3] == '/') || (cur[3] == '\0')))
            cur += 3;

        if (cur != path) {
            out = path;
            while (cur[0] != '\0')
                (out++)[0] = (cur++)[0];
            out[0] = 0;
        }
    }

    return(0);
}
1585 
/*
 * is_hex:
 * @c:  the character to test
 *
 * Returns 1 if @c is one of 0-9, a-f or A-F, 0 otherwise.
 */
static int is_hex(char c) {
    int decimal = (c >= '0') && (c <= '9');
    int lower = (c >= 'a') && (c <= 'f');
    int upper = (c >= 'A') && (c <= 'F');

    return((decimal || lower || upper) ? 1 : 0);
}
1593 
/*
 * xmlURIHexDigitValue:
 * @c:  a character for which is_hex() returned true
 *
 * Returns the numeric value (0-15) of the hexadecimal digit @c.
 */
static int
xmlURIHexDigitValue(char c) {
    if ((c >= '0') && (c <= '9'))
        return(c - '0');
    if ((c >= 'a') && (c <= 'f'))
        return((c - 'a') + 10);
    return((c - 'A') + 10);
}

/**
 * xmlURIUnescapeString:
 * @str:  the string to unescape
 * @len:   the length in bytes to unescape (or <= 0 to indicate full string)
 * @target:  optional destination buffer
 *
 * Unescaping routine, but does not check that the string is an URI. The
 * output is a direct unsigned char translation of %XX values (no encoding)
 * Note that the length of the result can only be smaller or same size as
 * the input string. A '%' that is not followed by two hexadecimal digits
 * within the first @len bytes is copied unchanged.
 *
 * When @target is NULL a new buffer of @len + 1 bytes is allocated for the
 * result, otherwise the result is written to @target, which must provide
 * room for @len + 1 bytes.
 *
 * Returns a copy of the string, but unescaped, will return NULL only in case
 * of error
 */
char *
xmlURIUnescapeString(const char *str, int len, char *target) {
    char *ret, *out;
    const char *in;

    if (str == NULL)
        return(NULL);
    if (len <= 0) len = strlen(str);
    if (len < 0) return(NULL);

    if (target == NULL) {
        ret = (char *) xmlMallocAtomic(len + 1);
        if (ret == NULL) {
            xmlURIErrMemory("unescaping URI value\n");
            return(NULL);
        }
    } else
        ret = target;
    in = str;
    out = ret;
    while (len > 0) {
        if ((len > 2) && (*in == '%') && (is_hex(in[1])) && (is_hex(in[2]))) {
            /*
             * Build the byte value in an int and store it through an
             * unsigned char: accumulating in *out would do arithmetic on
             * a plain char and rely on an implementation-defined
             * narrowing for values >= 0x80 where char is signed.
             */
            int c = (xmlURIHexDigitValue(in[1]) * 16) +
                    xmlURIHexDigitValue(in[2]);

            in += 3;
            len -= 3;
            *((unsigned char *) out) = (unsigned char) c;
            out++;
        } else {
            *out++ = *in++;
            len--;
        }
    }
    *out = 0;
    return(ret);
}
1655 
1656 /**
1657  * xmlURIEscapeStr:
1658  * @str:  string to escape
1659  * @list: exception list string of chars not to escape
1660  *
1661  * This routine escapes a string to hex, ignoring reserved characters (a-z)
1662  * and the characters in the exception list.
1663  *
1664  * Returns a new escaped string or NULL in case of error.
1665  */
1666 xmlChar *
xmlURIEscapeStr(const xmlChar * str,const xmlChar * list)1667 xmlURIEscapeStr(const xmlChar *str, const xmlChar *list) {
1668     xmlChar *ret, ch;
1669     xmlChar *temp;
1670     const xmlChar *in;
1671     int len, out;
1672 
1673     if (str == NULL)
1674 	return(NULL);
1675     if (str[0] == 0)
1676 	return(xmlStrdup(str));
1677     len = xmlStrlen(str);
1678     if (!(len > 0)) return(NULL);
1679 
1680     len += 20;
1681     ret = (xmlChar *) xmlMallocAtomic(len);
1682     if (ret == NULL) {
1683         xmlURIErrMemory("escaping URI value\n");
1684 	return(NULL);
1685     }
1686     in = (const xmlChar *) str;
1687     out = 0;
1688     while(*in != 0) {
1689 	if (len - out <= 3) {
1690             temp = xmlSaveUriRealloc(ret, &len);
1691 	    if (temp == NULL) {
1692                 xmlURIErrMemory("escaping URI value\n");
1693 		xmlFree(ret);
1694 		return(NULL);
1695 	    }
1696 	    ret = temp;
1697 	}
1698 
1699 	ch = *in;
1700 
1701 	if ((ch != '@') && (!IS_UNRESERVED(ch)) && (!xmlStrchr(list, ch))) {
1702 	    unsigned char val;
1703 	    ret[out++] = '%';
1704 	    val = ch >> 4;
1705 	    if (val <= 9)
1706 		ret[out++] = '0' + val;
1707 	    else
1708 		ret[out++] = 'A' + val - 0xA;
1709 	    val = ch & 0xF;
1710 	    if (val <= 9)
1711 		ret[out++] = '0' + val;
1712 	    else
1713 		ret[out++] = 'A' + val - 0xA;
1714 	    in++;
1715 	} else {
1716 	    ret[out++] = *in++;
1717 	}
1718 
1719     }
1720     ret[out] = 0;
1721     return(ret);
1722 }
1723 
1724 /**
1725  * xmlURIEscape:
1726  * @str:  the string of the URI to escape
1727  *
1728  * Escaping routine, does not do validity checks !
1729  * It will try to escape the chars needing this, but this is heuristic
1730  * based it's impossible to be sure.
1731  *
1732  * Returns an copy of the string, but escaped
1733  *
1734  * 25 May 2001
1735  * Uses xmlParseURI and xmlURIEscapeStr to try to escape correctly
1736  * according to RFC2396.
1737  *   - Carl Douglas
1738  */
1739 xmlChar *
xmlURIEscape(const xmlChar * str)1740 xmlURIEscape(const xmlChar * str)
1741 {
1742     xmlChar *ret, *segment = NULL;
1743     xmlURIPtr uri;
1744     int ret2;
1745 
1746 #define NULLCHK(p) if(!p) { \
1747          xmlURIErrMemory("escaping URI value\n"); \
1748          xmlFreeURI(uri); \
1749          return NULL; } \
1750 
1751     if (str == NULL)
1752         return (NULL);
1753 
1754     uri = xmlCreateURI();
1755     if (uri != NULL) {
1756 	/*
1757 	 * Allow escaping errors in the unescaped form
1758 	 */
1759         uri->cleanup = 1;
1760         ret2 = xmlParseURIReference(uri, (const char *)str);
1761         if (ret2) {
1762             xmlFreeURI(uri);
1763             return (NULL);
1764         }
1765     }
1766 
1767     if (!uri)
1768         return NULL;
1769 
1770     ret = NULL;
1771 
1772     if (uri->scheme) {
1773         segment = xmlURIEscapeStr(BAD_CAST uri->scheme, BAD_CAST "+-.");
1774         NULLCHK(segment)
1775         ret = xmlStrcat(ret, segment);
1776         ret = xmlStrcat(ret, BAD_CAST ":");
1777         xmlFree(segment);
1778     }
1779 
1780     if (uri->authority) {
1781         segment =
1782             xmlURIEscapeStr(BAD_CAST uri->authority, BAD_CAST "/?;:@");
1783         NULLCHK(segment)
1784         ret = xmlStrcat(ret, BAD_CAST "//");
1785         ret = xmlStrcat(ret, segment);
1786         xmlFree(segment);
1787     }
1788 
1789     if (uri->user) {
1790         segment = xmlURIEscapeStr(BAD_CAST uri->user, BAD_CAST ";:&=+$,");
1791         NULLCHK(segment)
1792 		ret = xmlStrcat(ret,BAD_CAST "//");
1793         ret = xmlStrcat(ret, segment);
1794         ret = xmlStrcat(ret, BAD_CAST "@");
1795         xmlFree(segment);
1796     }
1797 
1798     if (uri->server) {
1799         segment = xmlURIEscapeStr(BAD_CAST uri->server, BAD_CAST "/?;:@");
1800         NULLCHK(segment)
1801 		if (uri->user == NULL)
1802 		ret = xmlStrcat(ret, BAD_CAST "//");
1803         ret = xmlStrcat(ret, segment);
1804         xmlFree(segment);
1805     }
1806 
1807     if (uri->port) {
1808         xmlChar port[10];
1809 
1810         snprintf((char *) port, 10, "%d", uri->port);
1811         ret = xmlStrcat(ret, BAD_CAST ":");
1812         ret = xmlStrcat(ret, port);
1813     }
1814 
1815     if (uri->path) {
1816         segment =
1817             xmlURIEscapeStr(BAD_CAST uri->path, BAD_CAST ":@&=+$,/?;");
1818         NULLCHK(segment)
1819         ret = xmlStrcat(ret, segment);
1820         xmlFree(segment);
1821     }
1822 
1823     if (uri->query_raw) {
1824         ret = xmlStrcat(ret, BAD_CAST "?");
1825         ret = xmlStrcat(ret, BAD_CAST uri->query_raw);
1826     }
1827     else if (uri->query) {
1828         segment =
1829             xmlURIEscapeStr(BAD_CAST uri->query, BAD_CAST ";/?:@&=+,$");
1830         NULLCHK(segment)
1831         ret = xmlStrcat(ret, BAD_CAST "?");
1832         ret = xmlStrcat(ret, segment);
1833         xmlFree(segment);
1834     }
1835 
1836     if (uri->opaque) {
1837         segment = xmlURIEscapeStr(BAD_CAST uri->opaque, BAD_CAST "");
1838         NULLCHK(segment)
1839         ret = xmlStrcat(ret, segment);
1840         xmlFree(segment);
1841     }
1842 
1843     if (uri->fragment) {
1844         segment = xmlURIEscapeStr(BAD_CAST uri->fragment, BAD_CAST "#");
1845         NULLCHK(segment)
1846         ret = xmlStrcat(ret, BAD_CAST "#");
1847         ret = xmlStrcat(ret, segment);
1848         xmlFree(segment);
1849     }
1850 
1851     xmlFreeURI(uri);
1852 #undef NULLCHK
1853 
1854     return (ret);
1855 }
1856 
1857 /************************************************************************
1858  *									*
1859  *			Public functions				*
1860  *									*
1861  ************************************************************************/
1862 
1863 /**
1864  * xmlBuildURI:
1865  * @URI:  the URI instance found in the document
1866  * @base:  the base value
1867  *
1868  * Computes he final URI of the reference done by checking that
1869  * the given URI is valid, and building the final URI using the
1870  * base URI. This is processed according to section 5.2 of the
1871  * RFC 2396
1872  *
1873  * 5.2. Resolving Relative References to Absolute Form
1874  *
1875  * Returns a new URI string (to be freed by the caller) or NULL in case
1876  *         of error.
1877  */
1878 xmlChar *
xmlBuildURI(const xmlChar * URI,const xmlChar * base)1879 xmlBuildURI(const xmlChar *URI, const xmlChar *base) {
1880     xmlChar *val = NULL;
1881     int ret, len, indx, cur, out;
1882     xmlURIPtr ref = NULL;
1883     xmlURIPtr bas = NULL;
1884     xmlURIPtr res = NULL;
1885 
1886     /*
1887      * 1) The URI reference is parsed into the potential four components and
1888      *    fragment identifier, as described in Section 4.3.
1889      *
1890      *    NOTE that a completely empty URI is treated by modern browsers
1891      *    as a reference to "." rather than as a synonym for the current
1892      *    URI.  Should we do that here?
1893      */
1894     if (URI == NULL)
1895 	ret = -1;
1896     else {
1897 	if (*URI) {
1898 	    ref = xmlCreateURI();
1899 	    if (ref == NULL)
1900 		goto done;
1901 	    ret = xmlParseURIReference(ref, (const char *) URI);
1902 	}
1903 	else
1904 	    ret = 0;
1905     }
1906     if (ret != 0)
1907 	goto done;
1908     if ((ref != NULL) && (ref->scheme != NULL)) {
1909 	/*
1910 	 * The URI is absolute don't modify.
1911 	 */
1912 	val = xmlStrdup(URI);
1913 	goto done;
1914     }
1915     if (base == NULL)
1916 	ret = -1;
1917     else {
1918 	bas = xmlCreateURI();
1919 	if (bas == NULL)
1920 	    goto done;
1921 	ret = xmlParseURIReference(bas, (const char *) base);
1922     }
1923     if (ret != 0) {
1924 	if (ref)
1925 	    val = xmlSaveUri(ref);
1926 	goto done;
1927     }
1928     if (ref == NULL) {
1929 	/*
1930 	 * the base fragment must be ignored
1931 	 */
1932 	if (bas->fragment != NULL) {
1933 	    xmlFree(bas->fragment);
1934 	    bas->fragment = NULL;
1935 	}
1936 	val = xmlSaveUri(bas);
1937 	goto done;
1938     }
1939 
1940     /*
1941      * 2) If the path component is empty and the scheme, authority, and
1942      *    query components are undefined, then it is a reference to the
1943      *    current document and we are done.  Otherwise, the reference URI's
1944      *    query and fragment components are defined as found (or not found)
1945      *    within the URI reference and not inherited from the base URI.
1946      *
1947      *    NOTE that in modern browsers, the parsing differs from the above
1948      *    in the following aspect:  the query component is allowed to be
1949      *    defined while still treating this as a reference to the current
1950      *    document.
1951      */
1952     res = xmlCreateURI();
1953     if (res == NULL)
1954 	goto done;
1955     if ((ref->scheme == NULL) && (ref->path == NULL) &&
1956 	((ref->authority == NULL) && (ref->server == NULL))) {
1957 	if (bas->scheme != NULL)
1958 	    res->scheme = xmlMemStrdup(bas->scheme);
1959 	if (bas->authority != NULL)
1960 	    res->authority = xmlMemStrdup(bas->authority);
1961 	else if (bas->server != NULL) {
1962 	    res->server = xmlMemStrdup(bas->server);
1963 	    if (bas->user != NULL)
1964 		res->user = xmlMemStrdup(bas->user);
1965 	    res->port = bas->port;
1966 	}
1967 	if (bas->path != NULL)
1968 	    res->path = xmlMemStrdup(bas->path);
1969 	if (ref->query_raw != NULL)
1970 	    res->query_raw = xmlMemStrdup (ref->query_raw);
1971 	else if (ref->query != NULL)
1972 	    res->query = xmlMemStrdup(ref->query);
1973 	else if (bas->query_raw != NULL)
1974 	    res->query_raw = xmlMemStrdup(bas->query_raw);
1975 	else if (bas->query != NULL)
1976 	    res->query = xmlMemStrdup(bas->query);
1977 	if (ref->fragment != NULL)
1978 	    res->fragment = xmlMemStrdup(ref->fragment);
1979 	goto step_7;
1980     }
1981 
1982     /*
1983      * 3) If the scheme component is defined, indicating that the reference
1984      *    starts with a scheme name, then the reference is interpreted as an
1985      *    absolute URI and we are done.  Otherwise, the reference URI's
1986      *    scheme is inherited from the base URI's scheme component.
1987      */
1988     if (ref->scheme != NULL) {
1989 	val = xmlSaveUri(ref);
1990 	goto done;
1991     }
1992     if (bas->scheme != NULL)
1993 	res->scheme = xmlMemStrdup(bas->scheme);
1994 
1995     if (ref->query_raw != NULL)
1996 	res->query_raw = xmlMemStrdup(ref->query_raw);
1997     else if (ref->query != NULL)
1998 	res->query = xmlMemStrdup(ref->query);
1999     if (ref->fragment != NULL)
2000 	res->fragment = xmlMemStrdup(ref->fragment);
2001 
2002     /*
2003      * 4) If the authority component is defined, then the reference is a
2004      *    network-path and we skip to step 7.  Otherwise, the reference
2005      *    URI's authority is inherited from the base URI's authority
2006      *    component, which will also be undefined if the URI scheme does not
2007      *    use an authority component.
2008      */
2009     if ((ref->authority != NULL) || (ref->server != NULL)) {
2010 	if (ref->authority != NULL)
2011 	    res->authority = xmlMemStrdup(ref->authority);
2012 	else {
2013 	    res->server = xmlMemStrdup(ref->server);
2014 	    if (ref->user != NULL)
2015 		res->user = xmlMemStrdup(ref->user);
2016             res->port = ref->port;
2017 	}
2018 	if (ref->path != NULL)
2019 	    res->path = xmlMemStrdup(ref->path);
2020 	goto step_7;
2021     }
2022     if (bas->authority != NULL)
2023 	res->authority = xmlMemStrdup(bas->authority);
2024     else if (bas->server != NULL) {
2025 	res->server = xmlMemStrdup(bas->server);
2026 	if (bas->user != NULL)
2027 	    res->user = xmlMemStrdup(bas->user);
2028 	res->port = bas->port;
2029     }
2030 
2031     /*
2032      * 5) If the path component begins with a slash character ("/"), then
2033      *    the reference is an absolute-path and we skip to step 7.
2034      */
2035     if ((ref->path != NULL) && (ref->path[0] == '/')) {
2036 	res->path = xmlMemStrdup(ref->path);
2037 	goto step_7;
2038     }
2039 
2040 
2041     /*
2042      * 6) If this step is reached, then we are resolving a relative-path
2043      *    reference.  The relative path needs to be merged with the base
2044      *    URI's path.  Although there are many ways to do this, we will
2045      *    describe a simple method using a separate string buffer.
2046      *
2047      * Allocate a buffer large enough for the result string.
2048      */
2049     len = 2; /* extra / and 0 */
2050     if (ref->path != NULL)
2051 	len += strlen(ref->path);
2052     if (bas->path != NULL)
2053 	len += strlen(bas->path);
2054     res->path = (char *) xmlMallocAtomic(len);
2055     if (res->path == NULL) {
2056         xmlURIErrMemory("resolving URI against base\n");
2057 	goto done;
2058     }
2059     res->path[0] = 0;
2060 
2061     /*
2062      * a) All but the last segment of the base URI's path component is
2063      *    copied to the buffer.  In other words, any characters after the
2064      *    last (right-most) slash character, if any, are excluded.
2065      */
2066     cur = 0;
2067     out = 0;
2068     if (bas->path != NULL) {
2069 	while (bas->path[cur] != 0) {
2070 	    while ((bas->path[cur] != 0) && (bas->path[cur] != '/'))
2071 		cur++;
2072 	    if (bas->path[cur] == 0)
2073 		break;
2074 
2075 	    cur++;
2076 	    while (out < cur) {
2077 		res->path[out] = bas->path[out];
2078 		out++;
2079 	    }
2080 	}
2081     }
2082     res->path[out] = 0;
2083 
2084     /*
2085      * b) The reference's path component is appended to the buffer
2086      *    string.
2087      */
2088     if (ref->path != NULL && ref->path[0] != 0) {
2089 	indx = 0;
2090 	/*
2091 	 * Ensure the path includes a '/'
2092 	 */
2093 	if ((out == 0) && (bas->server != NULL))
2094 	    res->path[out++] = '/';
2095 	while (ref->path[indx] != 0) {
2096 	    res->path[out++] = ref->path[indx++];
2097 	}
2098     }
2099     res->path[out] = 0;
2100 
2101     /*
2102      * Steps c) to h) are really path normalization steps
2103      */
2104     xmlNormalizeURIPath(res->path);
2105 
2106 step_7:
2107 
2108     /*
2109      * 7) The resulting URI components, including any inherited from the
2110      *    base URI, are recombined to give the absolute form of the URI
2111      *    reference.
2112      */
2113     val = xmlSaveUri(res);
2114 
2115 done:
2116     if (ref != NULL)
2117 	xmlFreeURI(ref);
2118     if (bas != NULL)
2119 	xmlFreeURI(bas);
2120     if (res != NULL)
2121 	xmlFreeURI(res);
2122     return(val);
2123 }
2124 
2125 /**
2126  * xmlBuildRelativeURI:
2127  * @URI:  the URI reference under consideration
2128  * @base:  the base value
2129  *
2130  * Expresses the URI of the reference in terms relative to the
2131  * base.  Some examples of this operation include:
2132  *     base = "http://site1.com/docs/book1.html"
2133  *        URI input                        URI returned
2134  *     docs/pic1.gif                    pic1.gif
2135  *     docs/img/pic1.gif                img/pic1.gif
2136  *     img/pic1.gif                     ../img/pic1.gif
2137  *     http://site1.com/docs/pic1.gif   pic1.gif
2138  *     http://site2.com/docs/pic1.gif   http://site2.com/docs/pic1.gif
2139  *
2140  *     base = "docs/book1.html"
2141  *        URI input                        URI returned
2142  *     docs/pic1.gif                    pic1.gif
2143  *     docs/img/pic1.gif                img/pic1.gif
2144  *     img/pic1.gif                     ../img/pic1.gif
2145  *     http://site1.com/docs/pic1.gif   http://site1.com/docs/pic1.gif
2146  *
2147  *
2148  * Note: if the URI reference is really wierd or complicated, it may be
2149  *       worthwhile to first convert it into a "nice" one by calling
2150  *       xmlBuildURI (using 'base') before calling this routine,
2151  *       since this routine (for reasonable efficiency) assumes URI has
2152  *       already been through some validation.
2153  *
2154  * Returns a new URI string (to be freed by the caller) or NULL in case
2155  * error.
2156  */
2157 xmlChar *
xmlBuildRelativeURI(const xmlChar * URI,const xmlChar * base)2158 xmlBuildRelativeURI (const xmlChar * URI, const xmlChar * base)
2159 {
2160     xmlChar *val = NULL;
2161     int ret;
2162     int ix;
2163     int pos = 0;
2164     int nbslash = 0;
2165     int len;
2166     xmlURIPtr ref = NULL;
2167     xmlURIPtr bas = NULL;
2168     xmlChar *bptr, *uptr, *vptr;
2169     int remove_path = 0;
2170 
2171     if ((URI == NULL) || (*URI == 0))
2172 	return NULL;
2173 
2174     /*
2175      * First parse URI into a standard form
2176      */
2177     ref = xmlCreateURI ();
2178     if (ref == NULL)
2179 	return NULL;
2180     /* If URI not already in "relative" form */
2181     if (URI[0] != '.') {
2182 	ret = xmlParseURIReference (ref, (const char *) URI);
2183 	if (ret != 0)
2184 	    goto done;		/* Error in URI, return NULL */
2185     } else
2186 	ref->path = (char *)xmlStrdup(URI);
2187 
2188     /*
2189      * Next parse base into the same standard form
2190      */
2191     if ((base == NULL) || (*base == 0)) {
2192 	val = xmlStrdup (URI);
2193 	goto done;
2194     }
2195     bas = xmlCreateURI ();
2196     if (bas == NULL)
2197 	goto done;
2198     if (base[0] != '.') {
2199 	ret = xmlParseURIReference (bas, (const char *) base);
2200 	if (ret != 0)
2201 	    goto done;		/* Error in base, return NULL */
2202     } else
2203 	bas->path = (char *)xmlStrdup(base);
2204 
2205     /*
2206      * If the scheme / server on the URI differs from the base,
2207      * just return the URI
2208      */
2209     if ((ref->scheme != NULL) &&
2210 	((bas->scheme == NULL) ||
2211 	 (xmlStrcmp ((xmlChar *)bas->scheme, (xmlChar *)ref->scheme)) ||
2212 	 (xmlStrcmp ((xmlChar *)bas->server, (xmlChar *)ref->server)))) {
2213 	val = xmlStrdup (URI);
2214 	goto done;
2215     }
2216     if (xmlStrEqual((xmlChar *)bas->path, (xmlChar *)ref->path)) {
2217 	val = xmlStrdup(BAD_CAST "");
2218 	goto done;
2219     }
2220     if (bas->path == NULL) {
2221 	val = xmlStrdup((xmlChar *)ref->path);
2222 	goto done;
2223     }
2224     if (ref->path == NULL) {
2225         ref->path = (char *) "/";
2226 	remove_path = 1;
2227     }
2228 
2229     /*
2230      * At this point (at last!) we can compare the two paths
2231      *
2232      * First we take care of the special case where either of the
2233      * two path components may be missing (bug 316224)
2234      */
2235     if (bas->path == NULL) {
2236 	if (ref->path != NULL) {
2237 	    uptr = (xmlChar *) ref->path;
2238 	    if (*uptr == '/')
2239 		uptr++;
2240 	    /* exception characters from xmlSaveUri */
2241 	    val = xmlURIEscapeStr(uptr, BAD_CAST "/;&=+$,");
2242 	}
2243 	goto done;
2244     }
2245     bptr = (xmlChar *)bas->path;
2246     if (ref->path == NULL) {
2247 	for (ix = 0; bptr[ix] != 0; ix++) {
2248 	    if (bptr[ix] == '/')
2249 		nbslash++;
2250 	}
2251 	uptr = NULL;
2252 	len = 1;	/* this is for a string terminator only */
2253     } else {
2254     /*
2255      * Next we compare the two strings and find where they first differ
2256      */
2257 	if ((ref->path[pos] == '.') && (ref->path[pos+1] == '/'))
2258             pos += 2;
2259 	if ((*bptr == '.') && (bptr[1] == '/'))
2260             bptr += 2;
2261 	else if ((*bptr == '/') && (ref->path[pos] != '/'))
2262 	    bptr++;
2263 	while ((bptr[pos] == ref->path[pos]) && (bptr[pos] != 0))
2264 	    pos++;
2265 
2266 	if (bptr[pos] == ref->path[pos]) {
2267 	    val = xmlStrdup(BAD_CAST "");
2268 	    goto done;		/* (I can't imagine why anyone would do this) */
2269 	}
2270 
2271 	/*
2272 	 * In URI, "back up" to the last '/' encountered.  This will be the
2273 	 * beginning of the "unique" suffix of URI
2274 	 */
2275 	ix = pos;
2276 	if ((ref->path[ix] == '/') && (ix > 0))
2277 	    ix--;
2278 	else if ((ref->path[ix] == 0) && (ix > 1) && (ref->path[ix - 1] == '/'))
2279 	    ix -= 2;
2280 	for (; ix > 0; ix--) {
2281 	    if (ref->path[ix] == '/')
2282 		break;
2283 	}
2284 	if (ix == 0) {
2285 	    uptr = (xmlChar *)ref->path;
2286 	} else {
2287 	    ix++;
2288 	    uptr = (xmlChar *)&ref->path[ix];
2289 	}
2290 
2291 	/*
2292 	 * In base, count the number of '/' from the differing point
2293 	 */
2294 	if (bptr[pos] != ref->path[pos]) {/* check for trivial URI == base */
2295 	    for (; bptr[ix] != 0; ix++) {
2296 		if (bptr[ix] == '/')
2297 		    nbslash++;
2298 	    }
2299 	}
2300 	len = xmlStrlen (uptr) + 1;
2301     }
2302 
2303     if (nbslash == 0) {
2304 	if (uptr != NULL)
2305 	    /* exception characters from xmlSaveUri */
2306 	    val = xmlURIEscapeStr(uptr, BAD_CAST "/;&=+$,");
2307 	goto done;
2308     }
2309 
2310     /*
2311      * Allocate just enough space for the returned string -
2312      * length of the remainder of the URI, plus enough space
2313      * for the "../" groups, plus one for the terminator
2314      */
2315     val = (xmlChar *) xmlMalloc (len + 3 * nbslash);
2316     if (val == NULL) {
2317         xmlURIErrMemory("building relative URI\n");
2318 	goto done;
2319     }
2320     vptr = val;
2321     /*
2322      * Put in as many "../" as needed
2323      */
2324     for (; nbslash>0; nbslash--) {
2325 	*vptr++ = '.';
2326 	*vptr++ = '.';
2327 	*vptr++ = '/';
2328     }
2329     /*
2330      * Finish up with the end of the URI
2331      */
2332     if (uptr != NULL) {
2333         if ((vptr > val) && (len > 0) &&
2334 	    (uptr[0] == '/') && (vptr[-1] == '/')) {
2335 	    memcpy (vptr, uptr + 1, len - 1);
2336 	    vptr[len - 2] = 0;
2337 	} else {
2338 	    memcpy (vptr, uptr, len);
2339 	    vptr[len - 1] = 0;
2340 	}
2341     } else {
2342 	vptr[len - 1] = 0;
2343     }
2344 
2345     /* escape the freshly-built path */
2346     vptr = val;
2347 	/* exception characters from xmlSaveUri */
2348     val = xmlURIEscapeStr(vptr, BAD_CAST "/;&=+$,");
2349     xmlFree(vptr);
2350 
2351 done:
2352     /*
2353      * Free the working variables
2354      */
2355     if (remove_path != 0)
2356         ref->path = NULL;
2357     if (ref != NULL)
2358 	xmlFreeURI (ref);
2359     if (bas != NULL)
2360 	xmlFreeURI (bas);
2361 
2362     return val;
2363 }
2364 
2365 /**
2366  * xmlCanonicPath:
2367  * @path:  the resource locator in a filesystem notation
2368  *
2369  * Constructs a canonic path from the specified path.
2370  *
2371  * Returns a new canonic path, or a duplicate of the path parameter if the
2372  * construction fails. The caller is responsible for freeing the memory occupied
2373  * by the returned string. If there is insufficient memory available, or the
2374  * argument is NULL, the function returns NULL.
2375  */
2376 #define IS_WINDOWS_PATH(p)					\
2377 	((p != NULL) &&						\
2378 	 (((p[0] >= 'a') && (p[0] <= 'z')) ||			\
2379 	  ((p[0] >= 'A') && (p[0] <= 'Z'))) &&			\
2380 	 (p[1] == ':') && ((p[2] == '/') || (p[2] == '\\')))
2381 xmlChar *
xmlCanonicPath(const xmlChar * path)2382 xmlCanonicPath(const xmlChar *path)
2383 {
2384 /*
2385  * For Windows implementations, additional work needs to be done to
2386  * replace backslashes in pathnames with "forward slashes"
2387  */
2388 #if defined(_WIN32) && !defined(__CYGWIN__)
2389     int len = 0;
2390     int i = 0;
2391     xmlChar *p = NULL;
2392 #endif
2393     xmlURIPtr uri;
2394     xmlChar *ret;
2395     const xmlChar *absuri;
2396 
2397     if (path == NULL)
2398 	return(NULL);
2399 
2400 #if defined(_WIN32)
2401     /*
2402      * We must not change the backslashes to slashes if the the path
2403      * starts with \\?\
2404      * Those paths can be up to 32k characters long.
2405      * Was added specifically for OpenOffice, those paths can't be converted
2406      * to URIs anyway.
2407      */
2408     if ((path[0] == '\\') && (path[1] == '\\') && (path[2] == '?') &&
2409         (path[3] == '\\') )
2410 	return xmlStrdup((const xmlChar *) path);
2411 #endif
2412 
2413 	/* sanitize filename starting with // so it can be used as URI */
2414     if ((path[0] == '/') && (path[1] == '/') && (path[2] != '/'))
2415         path++;
2416 
2417     if ((uri = xmlParseURI((const char *) path)) != NULL) {
2418 	xmlFreeURI(uri);
2419 	return xmlStrdup(path);
2420     }
2421 
2422     /* Check if this is an "absolute uri" */
2423     absuri = xmlStrstr(path, BAD_CAST "://");
2424     if (absuri != NULL) {
2425         int l, j;
2426 	unsigned char c;
2427 	xmlChar *escURI;
2428 
2429         /*
2430 	 * this looks like an URI where some parts have not been
2431 	 * escaped leading to a parsing problem.  Check that the first
2432 	 * part matches a protocol.
2433 	 */
2434 	l = absuri - path;
2435 	/* Bypass if first part (part before the '://') is > 20 chars */
2436 	if ((l <= 0) || (l > 20))
2437 	    goto path_processing;
2438 	/* Bypass if any non-alpha characters are present in first part */
2439 	for (j = 0;j < l;j++) {
2440 	    c = path[j];
2441 	    if (!(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'))))
2442 	        goto path_processing;
2443 	}
2444 
2445 	/* Escape all except the characters specified in the supplied path */
2446         escURI = xmlURIEscapeStr(path, BAD_CAST ":/?_.#&;=");
2447 	if (escURI != NULL) {
2448 	    /* Try parsing the escaped path */
2449 	    uri = xmlParseURI((const char *) escURI);
2450 	    /* If successful, return the escaped string */
2451 	    if (uri != NULL) {
2452 	        xmlFreeURI(uri);
2453 		return escURI;
2454 	    }
2455 	}
2456     }
2457 
2458 path_processing:
2459 /* For Windows implementations, replace backslashes with 'forward slashes' */
2460 #if defined(_WIN32) && !defined(__CYGWIN__)
2461     /*
2462      * Create a URI structure
2463      */
2464     uri = xmlCreateURI();
2465     if (uri == NULL) {		/* Guard against 'out of memory' */
2466         return(NULL);
2467     }
2468 
2469     len = xmlStrlen(path);
2470     if ((len > 2) && IS_WINDOWS_PATH(path)) {
2471         /* make the scheme 'file' */
2472 	uri->scheme = xmlStrdup(BAD_CAST "file");
2473 	/* allocate space for leading '/' + path + string terminator */
2474 	uri->path = xmlMallocAtomic(len + 2);
2475 	if (uri->path == NULL) {
2476 	    xmlFreeURI(uri);	/* Guard agains 'out of memory' */
2477 	    return(NULL);
2478 	}
2479 	/* Put in leading '/' plus path */
2480 	uri->path[0] = '/';
2481 	p = uri->path + 1;
2482 	strncpy(p, path, len + 1);
2483     } else {
2484 	uri->path = xmlStrdup(path);
2485 	if (uri->path == NULL) {
2486 	    xmlFreeURI(uri);
2487 	    return(NULL);
2488 	}
2489 	p = uri->path;
2490     }
2491     /* Now change all occurences of '\' to '/' */
2492     while (*p != '\0') {
2493 	if (*p == '\\')
2494 	    *p = '/';
2495 	p++;
2496     }
2497 
2498     if (uri->scheme == NULL) {
2499 	ret = xmlStrdup((const xmlChar *) uri->path);
2500     } else {
2501 	ret = xmlSaveUri(uri);
2502     }
2503 
2504     xmlFreeURI(uri);
2505 #else
2506     ret = xmlStrdup((const xmlChar *) path);
2507 #endif
2508     return(ret);
2509 }
2510 
2511 /**
2512  * xmlPathToURI:
2513  * @path:  the resource locator in a filesystem notation
2514  *
2515  * Constructs an URI expressing the existing path
2516  *
2517  * Returns a new URI, or a duplicate of the path parameter if the
2518  * construction fails. The caller is responsible for freeing the memory
2519  * occupied by the returned string. If there is insufficient memory available,
2520  * or the argument is NULL, the function returns NULL.
2521  */
2522 xmlChar *
xmlPathToURI(const xmlChar * path)2523 xmlPathToURI(const xmlChar *path)
2524 {
2525     xmlURIPtr uri;
2526     xmlURI temp;
2527     xmlChar *ret, *cal;
2528 
2529     if (path == NULL)
2530         return(NULL);
2531 
2532     if ((uri = xmlParseURI((const char *) path)) != NULL) {
2533 	xmlFreeURI(uri);
2534 	return xmlStrdup(path);
2535     }
2536     cal = xmlCanonicPath(path);
2537     if (cal == NULL)
2538         return(NULL);
2539 #if defined(_WIN32) && !defined(__CYGWIN__)
2540     /* xmlCanonicPath can return an URI on Windows (is that the intended behaviour?)
2541        If 'cal' is a valid URI allready then we are done here, as continuing would make
2542        it invalid. */
2543     if ((uri = xmlParseURI((const char *) cal)) != NULL) {
2544 	xmlFreeURI(uri);
2545 	return cal;
2546     }
2547     /* 'cal' can contain a relative path with backslashes. If that is processed
2548        by xmlSaveURI, they will be escaped and the external entity loader machinery
2549        will fail. So convert them to slashes. Misuse 'ret' for walking. */
2550     ret = cal;
2551     while (*ret != '\0') {
2552 	if (*ret == '\\')
2553 	    *ret = '/';
2554 	ret++;
2555     }
2556 #endif
2557     memset(&temp, 0, sizeof(temp));
2558     temp.path = (char *) cal;
2559     ret = xmlSaveUri(&temp);
2560     xmlFree(cal);
2561     return(ret);
2562 }
2563 #define bottom_uri
2564 #include "elfgcchack.h"
2565