1"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L.  Masinter, January 2005.
7
8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
9and L.Masinter, December 1999.
10
11RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
22RFC 3986 is considered the current standard and any future changes to
23urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
28"""
29
30import re
31import sys
32import collections
33
34__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
35           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
36           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
37           "unquote", "unquote_plus", "unquote_to_bytes",
38           "DefragResult", "ParseResult", "SplitResult",
39           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
40
# A classification of schemes ('' means apply by default).
# These are plain module-level lists; the functions in this module test
# membership with ``scheme in <list>``.

# Schemes for which urljoin() resolves a reference against the base URL.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']
# Schemes consulted by urlunsplit() and urljoin() when deciding whether a
# "//netloc" part applies.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']
# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names; urlsplit() checks every character that
# precedes the first ':' against this string.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
# urlsplit() empties the caches (via clear_cache()) once _parse_cache holds
# MAX_CACHE_SIZE entries, then stores the new result.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) to the
# SplitResult computed by urlsplit().
_parse_cache = {}
74
def clear_cache():
    """Empty the module caches: parsed URL results and safe-set quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
79
80
81# Helpers for bytes handling
82# For 3.2, we deliberately require applications that
83# handle improperly quoted URLs to do their own
84# decoding and encoding. If valid use cases are
85# presented, we may relax this by using latin-1
86# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'


def _noop(obj):
    """Identity result coercion, used when the caller passed str arguments."""
    return obj


def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Result coercion for bytes callers: encode *obj* via its encode()."""
    return obj.encode(encoding, errors)


def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded to str.

    Falsy items (empty bytes, '' or None) are replaced by ''.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)


def _coerce_args(*args):
    """Normalise the arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
    ``_noop`` when the first argument is a str, ``_encode_result`` otherwise.
    Raises TypeError when a non-empty later argument disagrees with the
    first argument about being a str.
    """
    first_is_str = isinstance(args[0], str)
    # Empty values are exempt from the check so that a "scheme=''" default
    # can be combined with bytes input.
    mixed = any(extra and isinstance(extra, str) != first_is_str
                for extra in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
116
117# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin that lets a str-based result tuple be encoded to its bytes twin"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and rebuild the tuple as _encoded_counterpart."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
124
125
class _ResultMixinBytes(object):
    """Mixin that lets a bytes-based result tuple be decoded to its str twin"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and rebuild the tuple as _decoded_counterpart."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
132
133
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    The properties below read two attributes that are not defined here:
    ``_userinfo``, indexed as (username, password), and ``_hostinfo``,
    indexed as (hostname, port).  Both work for str and bytes values.
    """
    __slots__ = ()

    @property
    def username(self):
        """First item of ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second item of ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty.

        Anything from the first '%' onwards is returned unchanged: in a
        scoped IPv6 address that is the zone identifier, which is
        case-sensitive and must not be lower-cased.
        """
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        separator = '%' if isinstance(hostname, str) else b'%'
        address, percent, zone = hostname.partition(separator)
        return address.lower() + percent + zone

    @property
    def port(self):
        """Port as an int, or None when ``_hostinfo`` carries no port.

        Raises ValueError when the port is not made up solely of ASCII
        digits or lies outside 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() alone would also accept surrounding whitespace, a sign and
        # digit-group underscores, none of which can appear in a port.
        digits = '0123456789' if isinstance(port, str) else b'0123456789'
        if port.strip(digits):
            raise ValueError(
                "Port could not be cast to integer value as %r" % (port,))
        port = int(port, 10)
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
        return port
163
164
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str flavour of the netloc helpers: splits ``self.netloc`` text."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the text before the last '@'.

        Both are None when there is no '@'; password is None when the
        user info contains no ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(':')
        if not colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the text after the last '@'.

        A bracketed host yields the text between '[' and ']'.  The port is
        the text after the separating ':', or None when that text is empty.
        """
        host_and_port = self.netloc.rpartition('@')[2]
        _, open_bracket, bracketed = host_and_port.partition('[')
        if open_bracket:
            hostname, _, trailer = bracketed.partition(']')
            port = trailer.partition(':')[2]
        else:
            hostname, _, port = host_and_port.partition(':')
        return hostname, (port or None)
193
194
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes flavour of the netloc helpers: splits ``self.netloc`` bytes."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the bytes before the last b'@'.

        Both are None when there is no b'@'; password is None when the
        user info contains no b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(b':')
        if not colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the bytes after the last b'@'.

        A bracketed host yields the bytes between b'[' and b']'.  The port
        is the bytes after the separating b':', or None when that is empty.
        """
        host_and_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, bracketed = host_and_port.partition(b'[')
        if open_bracket:
            hostname, _, trailer = bracketed.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            hostname, _, port = host_and_port.partition(b':')
        return hostname, (port or None)
223
224
225from collections import namedtuple
226
# Plain namedtuple bases for the result classes defined further down.
# The typename given to namedtuple() is the public class name, not the
# private base name.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# The assignments below attach docstrings to the namedtuple classes and to
# their per-field attributes.

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params,  query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field docstrings; only
# ``params`` gets a docstring of its own.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
292
293
# Backwards-compatibility alias for _NetlocResultMixinStr.
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle.
ResultBase = _NetlocResultMixinStr
298
299# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result tuple (url, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return url, with '#' and the fragment appended when it is non-empty."""
        return self.url + '#' + self.fragment if self.fragment else self.url
307
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str result tuple (scheme, netloc, path, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this tuple by urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
312
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str result tuple (scheme, netloc, path, params, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this tuple by urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
317
318# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes result tuple (url, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return url, with b'#' and the fragment appended when it is non-empty."""
        return self.url + b'#' + self.fragment if self.fragment else self.url
326
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes result tuple (scheme, netloc, path, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this tuple by urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
331
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes result tuple (scheme, netloc, path, params, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this tuple by urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
336
337# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions.

    Sets ``_encoded_counterpart`` on the str class and
    ``_decoded_counterpart`` on the bytes class; the encode()/decode()
    mixin methods look those attributes up.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
350
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into six parts:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    The return value is a ParseResult 6-tuple
    (scheme, netloc, path, params, query, fragment), encoded back to a
    bytes result when the input was bytes.
    The netloc is left as one string and % escapes are not expanded."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Parameters are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
366
def _splitparams(url):
    """Split ";params" off the last segment of a path.

    Returns (path, params).  Only a ';' located after the last '/' counts;
    when there is none, the path is returned unchanged with params ''.
    """
    # rfind() gives -1 when there is no '/', so the search then starts at 0.
    last_segment_start = url.rfind('/') + 1
    i = url.find(';', last_segment_start)
    if i < 0:
        # Without this guard a path with neither '/' nor ';' was sliced
        # with i == -1, losing its last character.
        return url, ''
    return url[:i], url[i+1:]
375
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, with the netloc beginning at *start*.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of the string when none of them occurs.
    """
    hits = [url.find(delimiter, start) for delimiter in '/?#']
    found = [position for position in hits if position >= 0]
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
383
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL into five parts:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    The return value is a SplitResult 5-tuple
    (scheme, netloc, path, query, fragment), encoded back to a bytes
    result when the input was bytes.
    The netloc is left as one string and % escapes are not expanded.
    Raises ValueError("Invalid IPv6 URL") when the netloc contains only
    one of '[' and ']'."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return _coerce_result(hit)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix, rest = url[:colon], url[colon + 1:]
        if prefix == 'http':
            # Common case: accepted as a scheme without further checks.
            scheme, url = prefix.lower(), rest
        elif all(ch in scheme_chars for ch in prefix):
            # When everything after the ':' is digits, the text is treated
            # as "path:port" and the prefix stays part of the path.
            if not rest or any(ch not in '0123456789' for ch in rest):
                scheme, url = prefix.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        has_open, has_close = '[' in netloc, ']' in netloc
        if has_open != has_close:
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
439
def urlunparse(components):
    """Reassemble a URL from the six components produced by urlparse().

    The result can differ slightly from the URL that was parsed while
    remaining equivalent, for instance when the original carried a
    redundant delimiter such as a '?' with an empty query."""
    (scheme, netloc, path, params,
     query, fragment, _coerce_result) = _coerce_args(*components)
    # Non-empty params go back onto the path behind a ';'.
    path_with_params = "%s;%s" % (path, params) if params else path
    joined = urlunsplit((scheme, netloc, path_with_params, query, fragment))
    return _coerce_result(joined)
450
def urlunsplit(components):
    """Reassemble a URL from the five components produced by urlsplit().

    *components* may be any five-item iterable.  The result can differ
    slightly from the URL that was parsed while remaining equivalent, for
    instance when the original carried an unnecessary delimiter such as a
    '?' with an empty query."""
    (scheme, netloc, path,
     query, fragment, _coerce_result) = _coerce_args(*components)
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if needs_authority:
        # A non-empty path must start with '/' to follow the netloc.
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return _coerce_result(result)
469
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    A falsy *base* returns *url* as passed, and a falsy *url* returns
    *base* as passed.  Otherwise both are parsed with urlparse(), and
    *allow_fragments* is forwarded to it.  *url* is returned unresolved
    when its scheme differs from the base scheme or is not listed in
    uses_relative."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference has its own netloc: nothing is taken from the
            # base path.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Reference without path and params: reuse the base path and
        # params, and the base query when the reference has none.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    # Walk the segments: '..' drops the previous segment, '.' is skipped,
    # and everything else is kept.
    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    # An empty joined path falls back to '/'.
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))
537
538
def urldefrag(url):
    """Strip the fragment, if any, from a URL.

    Returns a DefragResult (encoded back to a bytes result for bytes
    input) holding the URL without its fragment, and the fragment itself.
    The fragment is the empty string when the URL contains no '#'.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, frag = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, frag))
554
_hexdig = '0123456789ABCDEFabcdef'
# Lookup table from two hex-digit bytes to the byte they encode; built on
# the first call that needs it.
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    A str argument is encoded as UTF-8 first.  A '%' that is not followed
    by two hex digits is kept unchanged.
    """
    global _hextobyte
    # Note: str input is encoded as UTF-8. That only matters when it holds
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    # The table is created lazily so that no memory is spent on it when
    # this function is never called.
    if _hextobyte is None:
        table = {}
        for high in _hexdig:
            for low in _hexdig:
                pair = high + low
                table[pair.encode()] = bytes([int(pair, 16)])
        _hextobyte = table
    out = [head]
    for chunk in escaped:
        decoded = _hextobyte.get(chunk[:2])
        if decoded is None:
            # Not a valid escape: put the '%' back and keep the text.
            out.append(b'%')
            out.append(chunk)
        else:
            out.append(decoded)
            out.append(chunk[2:])
    return b''.join(out)
587
# Matches runs of ASCII characters; the capture group makes split() keep
# the runs in its result.
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes in a str with the characters they stand for.

    *encoding* and *errors* say how the percent-encoded bytes are decoded
    into text, exactly as for bytes.decode().  The defaults (also used
    when None is passed) are UTF-8 and 'replace', so invalid sequences
    become a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    # split() alternates: non-ASCII text, ASCII run, non-ASCII text, ...
    pieces = _asciire.split(string)
    out = [pieces[0]]
    for ascii_run, non_ascii in zip(pieces[1::2], pieces[2::2]):
        out.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        out.append(non_ascii)
    return ''.join(out)
614
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping each name to a list of values.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: when true, blank values are kept as empty
            strings.  When false (the default), fields with a blank
            value are dropped as if they were not present.

        strict_parsing: when false (the default), parsing errors are
            silently ignored.  When true, they raise ValueError.

        encoding and errors: how percent-encoded sequences are decoded
            into Unicode characters, as for the bytes.decode() method.

        Values for a repeated name are collected in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        grouped.setdefault(name, []).append(value)
    return grouped
646
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed; fields are separated
        by '&' or ';'

    keep_blank_values: when true, blank values are kept as empty
        strings.  When false (the default), fields with a blank value
        are dropped as if they were not present.

    strict_parsing: when false (the default), parsing errors are
        silently ignored.  When true, a field without '=' (including an
        empty field) raises ValueError.

    encoding and errors: how percent-encoded sequences are decoded into
        Unicode characters, as for the bytes.decode() method.

    In names and values '+' is turned into a space before unquoting.
    """
    qs, _coerce_result = _coerce_args(qs)
    fields = []
    for amp_part in qs.split('&'):
        fields.extend(amp_part.split(';'))
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        raw_name, equals, raw_value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name with no equal sign counts as a blank value.
            if not keep_blank_values:
                continue
        if not (raw_value or keep_blank_values):
            continue
        name = unquote(raw_name.replace('+', ' '),
                       encoding=encoding, errors=errors)
        name = _coerce_result(name)
        value = unquote(raw_value.replace('+', ' '),
                        encoding=encoding, errors=errors)
        value = _coerce_result(value)
        result.append((name, value))
    return result
694
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Same as unquote(), except that '+' is first turned into a space,
    which is what unquoting HTML form values requires.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
703
# Byte values that are never percent-encoded.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-'
)
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of bound Quoter lookups, keyed by the normalised 'safe' bytes.
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from byte values (ints in range(0, 256)) to strings.

    A byte in the safe set (the always-safe set plus the *safe* bytes given
    to the constructor) maps to its own character; any other byte maps to
    its upper-case percent-encoded form.
    """
    # The defaultdict base doubles as the cache: once a key has been
    # stored, looking it up again does not run any Python code.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE | frozenset(safe)

    def __repr__(self):
        # Without this, instances would display as a plain defaultdict.
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
732
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Every part of a URL (path, query, and so on) has its own set of
    reserved characters that need quoting.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some component of a URL, but not
    necessarily in all of them.

    The default *safe* value targets the path section of a URL, so '/' is
    left alone: in a path the existing slashes normally act as the
    reserved separators.

    string and safe may each be a str or a bytes object.  encoding and
    errors must not be given when string is a bytes object; doing so
    raises TypeError.

    For str input, encoding and errors control how non-ASCII characters
    are handled, as for str.encode.  They default to 'utf-8' and 'strict'
    (unsupported characters raise UnicodeEncodeError).  An empty str is
    returned as is.
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    codec = 'utf-8' if encoding is None else encoding
    handler = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, handler), safe)
776
def quote_plus(string, safe='', encoding=None, errors=None):
    """Same as quote(), except that ' ' becomes '+', as needed when quoting
    HTML form values.  A '+' already in the input is escaped unless it is
    listed in safe.  Note that safe does not default to '/' here.
    """
    # string may be str or bytes.  Without any space in it, plain quote()
    # already gives the right answer.
    if isinstance(string, str):
        has_space = ' ' in string
    elif isinstance(string, bytes):
        has_space = b' ' in string
    else:
        # Other types always take the path below.
        has_space = True
    if not has_space:
        return quote(string, safe, encoding, errors)
    # Keep spaces unquoted for now, then swap them for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
793
def quote_from_bytes(bs, safe='/'):
    """Same as quote(), but takes bytes (or bytearray) and therefore skips
    the str-to-bytes encoding step.  The result is always an ASCII str.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalise 'safe' to bytes, dropping anything that is not ASCII.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Nothing to quote when every byte is safe.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))
815
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Build a URL query string from a mapping or a sequence of pairs.

    'query' is either an object with an items() method or a sequence of
    two-element tuples; with a sequence, the output keeps the input order.
    Keys and values may each be str or bytes; anything else is converted
    with str() before quoting.

    When doseq is true, a value that is a sequence (and is not itself a
    str or bytes) produces one 'key=element' parameter per element.

    safe, encoding and errors are handed to quote_via; encoding and
    errors are passed only for components that are not bytes.

    Raises TypeError if 'query' is neither a mapping nor a valid
    non-string sequence.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, which is why
        # the first element is inspected explicitly.
        try:
            # Objects without len() fail here, and so do non-empty strings
            # (their first element is not a tuple).  Empty sequences of
            # any type pass, matching the long-standing acceptance of
            # empty dicts.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    def encode_part(part):
        # bytes are quoted as they are; everything else goes through str().
        if isinstance(part, bytes):
            return quote_via(part, safe)
        return quote_via(str(part), safe, encoding, errors)

    single_valued = not doseq
    pairs = []
    for key, value in query:
        key = encode_part(key)
        if single_valued or isinstance(value, bytes):
            encoded = encode_part(value)
            pairs.append(key + '=' + encoded)
        elif isinstance(value, str):
            encoded = quote_via(value, safe, encoding, errors)
            pairs.append(key + '=' + encoded)
        else:
            try:
                # Having a length is the test used for sequence-ness.
                len(value)
            except TypeError:
                # Not a sequence: treat it as a single value.
                encoded = encode_part(value)
                pairs.append(key + '=' + encoded)
            else:
                # One parameter per element of the sequence.
                for item in value:
                    encoded = encode_part(item)
                    pairs.append(key + '=' + encoded)
    return '&'.join(pairs)
895
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument must be pure ASCII, otherwise UnicodeError is raised.
    Arguments of any other type are returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
908
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str(); surrounding whitespace, one
    pair of enclosing angle brackets and a leading 'URL:' are removed.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
916
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  If 'url' does not start with a
    type followed by ':', the result is (None, url).
    """
    global _typeprog
    pattern = _typeprog
    if pattern is None:
        # Compiled on first use and kept in the module-level cache.
        pattern = _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = pattern.match(url)
    if found is None:
        return None, url
    scheme, rest = found.groups()
    return scheme.lower(), rest
929
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#' after the leading
    '//'.  Whatever follows is returned as the path, with a '/' put in
    front when it does not already start with one.  If 'url' does not
    start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # '#' must end the authority just like '/' and '?' do; otherwise
        # '//127.0.0.1#@evil.com/' would report '127.0.0.1#@evil.com' as
        # the host and later user/host splitting would pick 'evil.com'.
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and path[0] != '/':
            path = '/' + path
        return host_port, path
    return None, url
944
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the last '@'; without one the user part is None.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return userinfo, hostport
949
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'; without one the password is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
954
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  When 'host' does
    not end in ':' followed by digits only, the port is None; a bare
    trailing ':' is dropped from the returned host.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    # fullmatch() rather than match() with a '$' anchor: '$' also matches
    # just before a trailing newline, which let 'host:80\n' pass as port
    # '80' with the newline silently discarded.
    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None
969
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.

    The default port is also returned when nothing follows the last ':'.
    Only ASCII digits count as a valid number."""
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() put the whole input in the last slot.
        host = port
    elif port:
        nport = None
        # int() alone is too lenient: it also accepts signs, surrounding
        # whitespace, underscores and non-ASCII digits.
        if all(ch in '0123456789' for ch in port):
            try:
                nport = int(port)
            except ValueError:
                # Kept as a safety net in case int() still rejects the
                # digit string.
                nport = None
        return host, nport
    return host, defport
985
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'; without one the result is
    (url, None).
    """
    head, question_mark, tail = url.rpartition('?')
    if not question_mark:
        return url, None
    return head, tail
992
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'; without one the result is
    (url, None).
    """
    head, hash_mark, fragment = url.rpartition('#')
    if not hash_mark:
        return url, None
    return head, fragment
999
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when 'url' contains no ';'."""
    path, *attrs = url.split(';')
    return path, attrs
1005
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='; without one the value is None.
    """
    name, equals, val = attr.partition('=')
    if not equals:
        return name, None
    return name, val
1010