1r"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
13
14                        CookieJar____
15                        /     \      \
16            FileCookieJar      \      \
17             /    |   \         \      \
18 MozillaCookieJar | LWPCookieJar \      \
19                  |               |      \
20                  |   ---MSIEBase |       \
21                  |  /      |     |        \
22                  | /   MSIEDBCookieJar BSDDBCookieJar
23                  |/
24               MSIECookieJar
25
26"""
27
# Names exported by ``from <this module> import *``.
__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
           'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
31import copy
32import datetime
33import re
34import time
35import urllib.parse, urllib.request
36import threading as _threading
37import http.client  # only for the default HTTP port
38from calendar import timegm
39
debug = False   # set to True to enable debugging via the logging module
logger = None   # looked up lazily by _debug() the first time it is needed

def _debug(*args):
    """Pass *args* to the module logger's debug() when debugging is enabled.

    While the module-level ``debug`` flag is false this does nothing and
    returns None.  Otherwise the "http.cookiejar" logger is fetched on first
    use, cached in the module-level ``logger`` variable, and its ``debug``
    method is called with *args*.
    """
    global logger
    if debug:
        if not logger:
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
51
52
# Default HTTP port as a string, derived from http.client.HTTP_PORT.
DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
# Error text for a missing filename.  Not used in this part of the file --
# presumably raised by the file-based cookie jars; verify against callers.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
56
def _warn_unhandled_exception():
    """Issue a warning containing the traceback of the current exception.

    There are a few catch-all except: statements in this module, for
    catching input that's bad in unexpected ways; they call this so the
    swallowed exception is still reported.  ``stacklevel=2`` attributes the
    warning to the caller rather than to this helper.
    """
    import traceback
    import warnings
    details = traceback.format_exc()
    warnings.warn("http.cookiejar bug!\n%s" % details, stacklevel=2)
66
67
68# Date/time conversion
69# -----------------------------------------------------------------------------
70
EPOCH_YEAR = 1970
def _timegm(tt):
    """Return seconds since the epoch for UTC time tuple *tt*, or None.

    Only the first six fields (year, month, day, hour, minute, second) are
    range-checked; None is returned when any of them is out of range or the
    year precedes EPOCH_YEAR.  Hour 24 and seconds up to 61 are accepted.
    """
    year, month, mday, hour, minute, sec = tt[:6]
    date_ok = year >= EPOCH_YEAR and 1 <= month <= 12 and 1 <= mday <= 31
    clock_ok = 0 <= hour <= 24 and 0 <= minute <= 59 and 0 <= sec <= 61
    if not (date_ok and clock_ok):
        return None
    return timegm(tt)
79
# Abbreviated English day names, indexed like datetime.weekday() (Monday=0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated English month names; index 0 is January.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased month names for case-insensitive lookup.  Built with a
# comprehension so no loop variable leaks into the module namespace.
MONTHS_LOWER = [month_name.lower() for month_name in MONTHS]
85
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Timezone-aware calls replace the deprecated datetime.utcnow() and
    # datetime.utcfromtimestamp(); the formatted fields are the same.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
104
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Timezone-aware calls replace the deprecated datetime.utcnow() and
    # datetime.utcfromtimestamp(); the formatted fields are the same.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
123
124
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by timezone string *tz*.

    Names listed in UTC_ZONES give 0.  Numeric zones such as "+0100",
    "-08:00" or "5" are converted to seconds, negative when the sign is "-".
    Anything else yields None.
    """
    if tz in UTC_ZONES:
        return 0
    found = TIMEZONE_RE.search(tz)
    if not found:
        return None
    sign, hours, minutes = found.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    return -seconds if sign == '-' else seconds
141
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time fields to seconds since the epoch, or None.

    The fields are the strings (or None for absent clock/zone parts) captured
    by the date regexps in this module.  *mon* may be an English month
    abbreviation or a number.  Years below 1000 are treated as abbreviated and
    mapped to the century that puts them closest to the current year.  None
    is returned for an unknown month, an out-of-range field, a year beyond
    datetime.MAXYEAR, or an unrecognised timezone.
    """
    yr = int(yr)
    if yr > datetime.MAXYEAR:
        return None

    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE captures fractional seconds (e.g. "29.5"), which a plain
    # int() would reject with ValueError; truncate the fraction instead.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
197
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of text is unrecognized, the time is
    outside the representable range, or the timezone string is not
    recognized.  If the string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        day, mon_name, yr, hr, minute, sec = m.groups()
        try:
            mon = MONTHS_LOWER.index(mon_name.lower()) + 1
        except ValueError:
            # The strict pattern only checks the shape of the month (e.g.
            # "Foo" matches); an unknown name is an unrecognized date, not
            # an error to propagate.
            return None
        # int (not float) seconds, so the result is an integer as documented
        return _timegm((int(yr), mon, int(day),
                        int(hr), int(minute), int(sec)))

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, minute, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
275
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)
def iso2time(text):
    """
    Like http2time, but for ISO 8601 style dates:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Leading whitespace is ignored.  None is returned when the text does not
    match ISO_DATE_RE.

    """
    parsed = ISO_DATE_RE.search(text.lstrip())
    if parsed is None:
        return None  # bad format

    # XXX the last group is an extra bit of the timezone that is ignored
    #   here: is this the right thing to do?
    yr, mon, day, hr, minute, sec, tz, _ = parsed.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
320
321
322# Header parsing
323# -----------------------------------------------------------------------------
324
def unmatched(match):
    """Return the searched string with the matched span cut out."""
    begin, finish = match.span()
    source = match.string
    return source[:begin] + source[finish:]
329
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    *header_values* is a sequence of header strings (not a single string).
    Each string is scanned for tokens optionally followed by "=" and a plain
    or double-quoted value; backslash escapes inside quoted values are
    removed.  Tokens separated by ";" or whitespace belong to the same
    header, while a "," starts a new one, so several header strings behave as
    if they had been joined with ",".

    Each header is represented by a list of (key, value) pairs.  The value
    for a lone token (no "=") is None.  Stray "=", ";" and whitespace between
    items are skipped.  Syntactically incorrect headers will not necessarily
    be parsed as you would want.

    Examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            token = HEADER_TOKEN_RE.search(text)
            if token:
                text = unmatched(token)
                name = token.group(1)
                quoted = HEADER_QUOTED_VALUE_RE.search(text)
                if quoted:
                    # quoted value: drop the backslash of each escape
                    text = unmatched(quoted)
                    value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                else:
                    plain = HEADER_VALUE_RE.search(text)
                    if plain:
                        # unquoted value
                        text = unmatched(plain)
                        value = plain.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
                continue

            stripped = text.lstrip()
            if stripped.startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = stripped[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
                continue

            # skip junk
            non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
            assert nr_junk_chars > 0, (
                "split_header_words bug: '%s', '%s', %s" %
                (orig_text, text, pairs))
            text = non_junk
        if pairs:
            result.append(pairs)
    return result
418
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Build one header value from lists of (key, value) pairs.

    This is (almost) the inverse of split_header_words.  Pairs within one
    list are joined with "; " and the lists themselves with ", ".  A pair
    whose value is None is written as the bare key.  Values that are not
    made up purely of word characters are double-quoted, with embedded
    double quotes and backslashes escaped.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    def render(key, value):
        # a bare token has no "=value" part
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            escaped = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)  # escape " and \
            value = '"%s"' % escaped
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        rendered = [render(k, v) for k, v in pairs]
        if rendered:
            headers.append("; ".join(rendered))
    return ", ".join(headers)
444
def strip_quotes(text):
    """Remove one leading and one trailing double quote from *text*, if present.

    The two ends are handled independently, so an unbalanced quote is
    stripped as well.
    """
    begin = 1 if text.startswith('"') else 0
    text = text[begin:]
    return text[:-1] if text.endswith('"') else text
451
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so this ad-hoc parser is used
    instead of split_header_words.  Currently, this is also used for parsing
    RFC 2109 cookies.

    Each header string is split on ";".  The first parameter is the cookie's
    own name/value and is kept verbatim; for the following ones, the names of
    known cookie-attributes are lowercased, "version" values lose their
    quotes, and "expires" values are converted with http2time (None if
    invalid).  A value is None when the parameter had no "=" at all.  If no
    "version" attribute was seen, ("version", "0") is appended.

    Returns a list with one list of (key, value) pairs per header that
    produced any pairs.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        saw_version = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for position, chunk in enumerate(ns_header.split(';')):
            name, equals, raw_value = chunk.strip().partition('=')
            name = name.strip()

            if not name:
                if position == 0:
                    # nameless cookie: give up on this header entirely
                    break
                continue

            # allow for a distinction between present and empty and missing
            # altogether
            value = raw_value.strip() if equals else None

            if position != 0:
                lowered = name.lower()
                if lowered in known_attrs:
                    name = lowered

                if name == "version":
                    # This is an RFC 2109 cookie.
                    if value is not None:
                        value = strip_quotes(value)
                    saw_version = True
                elif name == "expires" and value is not None:
                    # convert expires date to seconds since epoch
                    value = http2time(strip_quotes(value))  # None if invalid
            pairs.append((name, value))

        if pairs:
            if not saw_version:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
518
519
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name.

    Text that ends in a dot followed by digits (taken to be an IP address),
    the empty string, and text that starts or ends with a dot are rejected.
    """
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means B must be a proper suffix of A.  Merely
    # finding B somewhere inside A is not enough: "www.acme.com.evil.org"
    # must not match ".acme.com".  Since A != B here, a suffix match also
    # guarantees that N is non-empty.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
574
def liberal_is_HDN(text):
    """Return True if text is sort-of-like a host domain name.

    Used for accepting/blocking domains: anything that does not end in a dot
    followed by digits (i.e. does not look like an IP address) qualifies.

    """
    return IPV4_RE.search(text) is None
584
def user_domain_match(A, B):
    """Domain comparison used for blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If either one looks like an IP address, or B has no
    initial dot, only exact equality matches; if B starts with a dot, A
    matches when it ends with B.

    """
    A = A.lower()
    B = B.lower()
    if not liberal_is_HDN(A) or not liberal_is_HDN(B):
        # at least one IP address: only equal addresses match
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
604
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the request's full URL, falling back to its
    "Host" header when the URL has no network location.  Any trailing
    ":port" is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url())[1]
    host = request.get_header("Host", "") if netloc == "" else netloc
    # remove port, if present
    return cut_port_re.sub("", host, 1).lower()
621
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    gets ".local" appended when the request-host contains no dot and does
    not match IPV4_RE.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
632
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path of the request's full URL is passed through escape_path and is
    given a leading "/" if it lacks one.
    """
    full_url = request.get_full_url()
    path = escape_path(urllib.parse.urlsplit(full_url).path)
    # fix bad RFC 2396 absoluteURI
    return path if path.startswith("/") else "/" + path
642
def request_port(request):
    """Return the port of request.host as a string.

    DEFAULT_HTTP_PORT is returned when the host contains no ":".  If the text
    after the first ":" is not an integer, a debug message is emitted and
    None is returned.
    """
    _, colon, port = request.host.partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
656
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %XX escape captured by *match* with upper-case hex digits."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
676
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: everything before it is A, the rest is B.
    _, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
711
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.origin_req_host)
    return not domain_match(req_host, origin_reach)
727
728
class Cookie:
    """HTTP Cookie.

    One class covers both Netscape and RFC 2965 cookies.

    It is deliberately simple: it only stores attributes, so instances that
    violate the cookie standards can be constructed.  CookieJar.make_cookies
    is the factory function for Cookie objects -- it deals with cookie
    parsing, supplying defaults, and normalising to the representation used
    here.  CookiePolicy is responsible for deciding whether cookies are
    accepted from and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        *version* is converted to int and *expires* to whole seconds unless
        they are None; *domain* is lowercased; *rest* (a mapping of
        non-standard attributes) is shallow-copied.  Raises ValueError if
        *port* is None while *port_specified* is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(float(expires))
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the non-standard attribute *name* is present."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard attribute *name*, or *default* if absent."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard attribute *name* to *value*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= *now*.

        *now* defaults to the current time; a cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        if self.expires is None:
            return False
        if self.expires <= now:
            return True
        return False

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        field_names = ("version", "name", "value",
                       "port", "port_specified",
                       "domain", "domain_specified", "domain_initial_dot",
                       "path", "path_specified",
                       "secure", "expires", "discard", "comment", "comment_url",
                       )
        args = ["%s=%s" % (field, repr(getattr(self, field)))
                for field in field_names]
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(args))
825
826
class CookiePolicy:
    """Interface deciding which cookies are accepted from and returned to servers.

    A policy may also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customized policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Subclasses must override this; the base version raises
        NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Subclasses must override this; the base version raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        The base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        The base implementation always returns True.
        """
        return True
858
859
860class DefaultCookiePolicy(CookiePolicy):
861    """Implements the standard rules for accepting and returning cookies."""
862
863    DomainStrictNoDots = 1
864    DomainStrictNonDomain = 2
865    DomainRFC2965Match = 4
866
867    DomainLiberal = 0
868    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
869
870    def __init__(self,
871                 blocked_domains=None, allowed_domains=None,
872                 netscape=True, rfc2965=False,
873                 rfc2109_as_netscape=None,
874                 hide_cookie2=False,
875                 strict_domain=False,
876                 strict_rfc2965_unverifiable=True,
877                 strict_ns_unverifiable=False,
878                 strict_ns_domain=DomainLiberal,
879                 strict_ns_set_initial_dollar=False,
880                 strict_ns_set_path=False,
881                 ):
882        """Constructor arguments should be passed as keyword arguments only."""
883        self.netscape = netscape
884        self.rfc2965 = rfc2965
885        self.rfc2109_as_netscape = rfc2109_as_netscape
886        self.hide_cookie2 = hide_cookie2
887        self.strict_domain = strict_domain
888        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
889        self.strict_ns_unverifiable = strict_ns_unverifiable
890        self.strict_ns_domain = strict_ns_domain
891        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
892        self.strict_ns_set_path = strict_ns_set_path
893
894        if blocked_domains is not None:
895            self._blocked_domains = tuple(blocked_domains)
896        else:
897            self._blocked_domains = ()
898
899        if allowed_domains is not None:
900            allowed_domains = tuple(allowed_domains)
901        self._allowed_domains = allowed_domains
902
903    def blocked_domains(self):
904        """Return the sequence of blocked domains (as a tuple)."""
905        return self._blocked_domains
906    def set_blocked_domains(self, blocked_domains):
907        """Set the sequence of blocked domains."""
908        self._blocked_domains = tuple(blocked_domains)
909
910    def is_blocked(self, domain):
911        for blocked_domain in self._blocked_domains:
912            if user_domain_match(domain, blocked_domain):
913                return True
914        return False
915
916    def allowed_domains(self):
917        """Return None, or the sequence of allowed domains (as a tuple)."""
918        return self._allowed_domains
919    def set_allowed_domains(self, allowed_domains):
920        """Set the sequence of allowed domains, or None."""
921        if allowed_domains is not None:
922            allowed_domains = tuple(allowed_domains)
923        self._allowed_domains = allowed_domains
924
925    def is_not_allowed(self, domain):
926        if self._allowed_domains is None:
927            return False
928        for allowed_domain in self._allowed_domains:
929            if user_domain_match(domain, allowed_domain):
930                return False
931        return True
932
933    def set_ok(self, cookie, request):
934        """
935        If you override .set_ok(), be sure to call this method.  If it returns
936        false, so should your subclass (assuming your subclass wants to be more
937        strict about which cookies to accept).
938
939        """
940        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
941
942        assert cookie.name is not None
943
944        for n in "version", "verifiability", "name", "path", "domain", "port":
945            fn_name = "set_ok_"+n
946            fn = getattr(self, fn_name)
947            if not fn(cookie, request):
948                return False
949
950        return True
951
952    def set_ok_version(self, cookie, request):
953        if cookie.version is None:
954            # Version is always set to 0 by parse_ns_headers if it's a Netscape
955            # cookie, so this must be an invalid RFC 2965 cookie.
956            _debug("   Set-Cookie2 without version attribute (%s=%s)",
957                   cookie.name, cookie.value)
958            return False
959        if cookie.version > 0 and not self.rfc2965:
960            _debug("   RFC 2965 cookies are switched off")
961            return False
962        elif cookie.version == 0 and not self.netscape:
963            _debug("   Netscape cookies are switched off")
964            return False
965        return True
966
967    def set_ok_verifiability(self, cookie, request):
968        if request.unverifiable and is_third_party(request):
969            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
970                _debug("   third-party RFC 2965 cookie during "
971                             "unverifiable transaction")
972                return False
973            elif cookie.version == 0 and self.strict_ns_unverifiable:
974                _debug("   third-party Netscape cookie during "
975                             "unverifiable transaction")
976                return False
977        return True
978
979    def set_ok_name(self, cookie, request):
980        # Try and stop servers setting V0 cookies designed to hack other
981        # servers that know both V0 and V1 protocols.
982        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
983            cookie.name.startswith("$")):
984            _debug("   illegal name (starts with '$'): '%s'", cookie.name)
985            return False
986        return True
987
988    def set_ok_path(self, cookie, request):
989        if cookie.path_specified:
990            req_path = request_path(request)
991            if ((cookie.version > 0 or
992                 (cookie.version == 0 and self.strict_ns_set_path)) and
993                not self.path_return_ok(cookie.path, request)):
994                _debug("   path attribute %s is not a prefix of request "
995                       "path %s", cookie.path, req_path)
996                return False
997        return True
998
    def set_ok_domain(self, cookie, request):
        """Return True if the cookie's domain is acceptable for this request.

        The domain is rejected outright when it matches the user block-list
        or, if an allow-list is configured, fails to match it.  The remaining
        checks apply only when the server explicitly supplied a Domain
        cookie-attribute (cookie.domain_specified):

        * with strict_domain set, domains of the form ".sld.tld" whose
          second-level label is one of a fixed list of generic names and
          whose top-level label is two characters long (like ".co.uk") are
          refused;
        * the domain must contain an embedded dot, unless it is ".local";
        * for version 0 cookies the effective request-host must end with the
          domain, either as it stands or with a dot prepended (a host that
          itself starts with a dot is not rejected by this check);
        * for version > 0 cookies, or when strict_ns_domain includes
          DomainRFC2965Match, the effective request-host must domain-match
          the domain (see domain_match());
        * for version > 0 cookies, or when strict_ns_domain includes
          DomainStrictNoDots, what is left of the request host after
          dropping len(domain) trailing characters must not contain a dot,
          unless the request host matches IPV4_RE.

        """
        # The user's block-list and allow-list apply whether or not the
        # server specified the domain.
        if self.is_blocked(cookie.domain):
            _debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                # i is the position of the last dot, j of the one before it.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug("   country-code second level domain %s", domain)
                        return False
            # Ignore a single leading dot when looking for embedded dots.
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug("   non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                # Netscape rule: plain suffix comparison against the
                # effective request-host, also trying it with a dot prepended.
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug("   effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug("   effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                # The slice simply drops len(domain) trailing characters; it
                # does not itself verify that those characters equal domain.
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug("   host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True
1057
1058    def set_ok_port(self, cookie, request):
1059        if cookie.port_specified:
1060            req_port = request_port(request)
1061            if req_port is None:
1062                req_port = "80"
1063            else:
1064                req_port = str(req_port)
1065            for p in cookie.port.split(","):
1066                try:
1067                    int(p)
1068                except ValueError:
1069                    _debug("   bad port %s (not numeric)", p)
1070                    return False
1071                if p == req_port:
1072                    break
1073            else:
1074                _debug("   request port (%s) not found in %s",
1075                       req_port, cookie.port)
1076                return False
1077        return True
1078
1079    def return_ok(self, cookie, request):
1080        """
1081        If you override .return_ok(), be sure to call this method.  If it
1082        returns false, so should your subclass (assuming your subclass wants to
1083        be more strict about which cookies to return).
1084
1085        """
1086        # Path has already been checked by .path_return_ok(), and domain
1087        # blocking done by .domain_return_ok().
1088        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
1089
1090        for n in "version", "verifiability", "secure", "expires", "port", "domain":
1091            fn_name = "return_ok_"+n
1092            fn = getattr(self, fn_name)
1093            if not fn(cookie, request):
1094                return False
1095        return True
1096
1097    def return_ok_version(self, cookie, request):
1098        if cookie.version > 0 and not self.rfc2965:
1099            _debug("   RFC 2965 cookies are switched off")
1100            return False
1101        elif cookie.version == 0 and not self.netscape:
1102            _debug("   Netscape cookies are switched off")
1103            return False
1104        return True
1105
1106    def return_ok_verifiability(self, cookie, request):
1107        if request.unverifiable and is_third_party(request):
1108            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
1109                _debug("   third-party RFC 2965 cookie during unverifiable "
1110                       "transaction")
1111                return False
1112            elif cookie.version == 0 and self.strict_ns_unverifiable:
1113                _debug("   third-party Netscape cookie during unverifiable "
1114                       "transaction")
1115                return False
1116        return True
1117
1118    def return_ok_secure(self, cookie, request):
1119        if cookie.secure and request.type != "https":
1120            _debug("   secure cookie with non-secure request")
1121            return False
1122        return True
1123
1124    def return_ok_expires(self, cookie, request):
1125        if cookie.is_expired(self._now):
1126            _debug("   cookie expired")
1127            return False
1128        return True
1129
1130    def return_ok_port(self, cookie, request):
1131        if cookie.port:
1132            req_port = request_port(request)
1133            if req_port is None:
1134                req_port = "80"
1135            for p in cookie.port.split(","):
1136                if p == req_port:
1137                    break
1138            else:
1139                _debug("   request port %s does not match cookie port %s",
1140                       req_port, cookie.port)
1141                return False
1142        return True
1143
1144    def return_ok_domain(self, cookie, request):
1145        req_host, erhn = eff_request_host(request)
1146        domain = cookie.domain
1147
1148        if domain and not domain.startswith("."):
1149            dotdomain = "." + domain
1150        else:
1151            dotdomain = domain
1152
1153        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
1154        if (cookie.version == 0 and
1155            (self.strict_ns_domain & self.DomainStrictNonDomain) and
1156            not cookie.domain_specified and domain != erhn):
1157            _debug("   cookie with unspecified domain does not string-compare "
1158                   "equal to request domain")
1159            return False
1160
1161        if cookie.version > 0 and not domain_match(erhn, domain):
1162            _debug("   effective request-host name %s does not domain-match "
1163                   "RFC 2965 cookie domain %s", erhn, domain)
1164            return False
1165        if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
1166            _debug("   request-host %s does not match Netscape cookie domain "
1167                   "%s", req_host, domain)
1168            return False
1169        return True
1170
1171    def domain_return_ok(self, domain, request):
1172        # Liberal check of.  This is here as an optimization to avoid
1173        # having to load lots of MSIE cookie files unless necessary.
1174        req_host, erhn = eff_request_host(request)
1175        if not req_host.startswith("."):
1176            req_host = "."+req_host
1177        if not erhn.startswith("."):
1178            erhn = "."+erhn
1179        if domain and not domain.startswith("."):
1180            dotdomain = "." + domain
1181        else:
1182            dotdomain = domain
1183        if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
1184            #_debug("   request domain %s does not match cookie domain %s",
1185            #       req_host, domain)
1186            return False
1187
1188        if self.is_blocked(domain):
1189            _debug("   domain %s is in user block-list", domain)
1190            return False
1191        if self.is_not_allowed(domain):
1192            _debug("   domain %s is not in user allow-list", domain)
1193            return False
1194
1195        return True
1196
1197    def path_return_ok(self, path, request):
1198        _debug("- checking cookie path=%s", path)
1199        req_path = request_path(request)
1200        pathlen = len(path)
1201        if req_path == path:
1202            return True
1203        elif (req_path.startswith(path) and
1204              (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
1205            return True
1206
1207        _debug("  %s does not path-match %s", req_path, path)
1208        return False
1209
def vals_sorted_by_key(adict):
    """Return an iterator over the values of adict, ordered by sorted key."""
    ordered_keys = list(adict.keys())
    ordered_keys.sort()
    return map(adict.get, ordered_keys)
1213
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an "items" attribute is treated as a nested mapping
    and descended into; every other value is yielded as a leaf.
    """
    for item in vals_sorted_by_key(mapping):
        if hasattr(item, "items"):
            yield from deepvalues(item)
        else:
            yield item
1228
1229
1230# Used as second parameter to dict.get() method, to distinguish absent
1231# dict key from one with a None value.
class Absent:
    """Sentinel meaning "key not present" (distinct from a None value)."""
1233
1234class CookieJar:
1235    """Collection of HTTP cookies.
1236
1237    You may not need to know about this class: try
1238    urllib.request.build_opener(HTTPCookieProcessor).open(url).
1239    """
1240
1241    non_word_re = re.compile(r"\W")
1242    quote_re = re.compile(r"([\"\\])")
1243    strict_domain_re = re.compile(r"\.?[^.]*")
1244    domain_re = re.compile(r"[^.]*")
1245    dots_re = re.compile(r"^\.+")
1246
1247    magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
1248
1249    def __init__(self, policy=None):
1250        if policy is None:
1251            policy = DefaultCookiePolicy()
1252        self._policy = policy
1253
1254        self._cookies_lock = _threading.RLock()
1255        self._cookies = {}
1256
1257    def set_policy(self, policy):
1258        self._policy = policy
1259
1260    def _cookies_for_domain(self, domain, request):
1261        cookies = []
1262        if not self._policy.domain_return_ok(domain, request):
1263            return []
1264        _debug("Checking %s for cookies to return", domain)
1265        cookies_by_path = self._cookies[domain]
1266        for path in cookies_by_path.keys():
1267            if not self._policy.path_return_ok(path, request):
1268                continue
1269            cookies_by_name = cookies_by_path[path]
1270            for cookie in cookies_by_name.values():
1271                if not self._policy.return_ok(cookie, request):
1272                    _debug("   not returning cookie")
1273                    continue
1274                _debug("   it's a match")
1275                cookies.append(cookie)
1276        return cookies
1277
1278    def _cookies_for_request(self, request):
1279        """Return a list of cookies to be returned to server."""
1280        cookies = []
1281        for domain in self._cookies.keys():
1282            cookies.extend(self._cookies_for_domain(domain, request))
1283        return cookies
1284
1285    def _cookie_attrs(self, cookies):
1286        """Return a list of cookie-attributes to be returned to server.
1287
1288        like ['foo="bar"; $Path="/"', ...]
1289
1290        The $Version attribute is also added when appropriate (currently only
1291        once per request).
1292
1293        """
1294        # add cookies in order of most specific (ie. longest) path first
1295        cookies.sort(key=lambda a: len(a.path), reverse=True)
1296
1297        version_set = False
1298
1299        attrs = []
1300        for cookie in cookies:
1301            # set version of Cookie header
1302            # XXX
1303            # What should it be if multiple matching Set-Cookie headers have
1304            #  different versions themselves?
1305            # Answer: there is no answer; was supposed to be settled by
1306            #  RFC 2965 errata, but that may never appear...
1307            version = cookie.version
1308            if not version_set:
1309                version_set = True
1310                if version > 0:
1311                    attrs.append("$Version=%s" % version)
1312
1313            # quote cookie value if necessary
1314            # (not for Netscape protocol, which already has any quotes
1315            #  intact, due to the poorly-specified Netscape Cookie: syntax)
1316            if ((cookie.value is not None) and
1317                self.non_word_re.search(cookie.value) and version > 0):
1318                value = self.quote_re.sub(r"\\\1", cookie.value)
1319            else:
1320                value = cookie.value
1321
1322            # add cookie-attributes to be returned in Cookie header
1323            if cookie.value is None:
1324                attrs.append(cookie.name)
1325            else:
1326                attrs.append("%s=%s" % (cookie.name, value))
1327            if version > 0:
1328                if cookie.path_specified:
1329                    attrs.append('$Path="%s"' % cookie.path)
1330                if cookie.domain.startswith("."):
1331                    domain = cookie.domain
1332                    if (not cookie.domain_initial_dot and
1333                        domain.startswith(".")):
1334                        domain = domain[1:]
1335                    attrs.append('$Domain="%s"' % domain)
1336                if cookie.port is not None:
1337                    p = "$Port"
1338                    if cookie.port_specified:
1339                        p = p + ('="%s"' % cookie.port)
1340                    attrs.append(p)
1341
1342        return attrs
1343
1344    def add_cookie_header(self, request):
1345        """Add correct Cookie: header to request (urllib.request.Request object).
1346
1347        The Cookie2 header is also added unless policy.hide_cookie2 is true.
1348
1349        """
1350        _debug("add_cookie_header")
1351        self._cookies_lock.acquire()
1352        try:
1353
1354            self._policy._now = self._now = int(time.time())
1355
1356            cookies = self._cookies_for_request(request)
1357
1358            attrs = self._cookie_attrs(cookies)
1359            if attrs:
1360                if not request.has_header("Cookie"):
1361                    request.add_unredirected_header(
1362                        "Cookie", "; ".join(attrs))
1363
1364            # if necessary, advertise that we know RFC 2965
1365            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1366                not request.has_header("Cookie2")):
1367                for cookie in cookies:
1368                    if cookie.version != 1:
1369                        request.add_unredirected_header("Cookie2", '$Version="1"')
1370                        break
1371
1372        finally:
1373            self._cookies_lock.release()
1374
1375        self.clear_expired_cookies()
1376
1377    def _normalized_cookie_tuples(self, attrs_set):
1378        """Return list of tuples containing normalised cookie information.
1379
1380        attrs_set is the list of lists of key,value pairs extracted from
1381        the Set-Cookie or Set-Cookie2 headers.
1382
1383        Tuples are name, value, standard, rest, where name and value are the
1384        cookie name and value, standard is a dictionary containing the standard
1385        cookie-attributes (discard, secure, version, expires or max-age,
1386        domain, path and port) and rest is a dictionary containing the rest of
1387        the cookie-attributes.
1388
1389        """
1390        cookie_tuples = []
1391
1392        boolean_attrs = "discard", "secure"
1393        value_attrs = ("version",
1394                       "expires", "max-age",
1395                       "domain", "path", "port",
1396                       "comment", "commenturl")
1397
1398        for cookie_attrs in attrs_set:
1399            name, value = cookie_attrs[0]
1400
1401            # Build dictionary of standard cookie-attributes (standard) and
1402            # dictionary of other cookie-attributes (rest).
1403
1404            # Note: expiry time is normalised to seconds since epoch.  V0
1405            # cookies should have the Expires cookie-attribute, and V1 cookies
1406            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1407            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1408            # accept either (but prefer Max-Age).
1409            max_age_set = False
1410
1411            bad_cookie = False
1412
1413            standard = {}
1414            rest = {}
1415            for k, v in cookie_attrs[1:]:
1416                lc = k.lower()
1417                # don't lose case distinction for unknown fields
1418                if lc in value_attrs or lc in boolean_attrs:
1419                    k = lc
1420                if k in boolean_attrs and v is None:
1421                    # boolean cookie-attribute is present, but has no value
1422                    # (like "discard", rather than "port=80")
1423                    v = True
1424                if k in standard:
1425                    # only first value is significant
1426                    continue
1427                if k == "domain":
1428                    if v is None:
1429                        _debug("   missing value for domain attribute")
1430                        bad_cookie = True
1431                        break
1432                    # RFC 2965 section 3.3.3
1433                    v = v.lower()
1434                if k == "expires":
1435                    if max_age_set:
1436                        # Prefer max-age to expires (like Mozilla)
1437                        continue
1438                    if v is None:
1439                        _debug("   missing or invalid value for expires "
1440                              "attribute: treating as session cookie")
1441                        continue
1442                if k == "max-age":
1443                    max_age_set = True
1444                    try:
1445                        v = int(v)
1446                    except ValueError:
1447                        _debug("   missing or invalid (non-numeric) value for "
1448                              "max-age attribute")
1449                        bad_cookie = True
1450                        break
1451                    # convert RFC 2965 Max-Age to seconds since epoch
1452                    # XXX Strictly you're supposed to follow RFC 2616
1453                    #   age-calculation rules.  Remember that zero Max-Age
1454                    #   is a request to discard (old and new) cookie, though.
1455                    k = "expires"
1456                    v = self._now + v
1457                if (k in value_attrs) or (k in boolean_attrs):
1458                    if (v is None and
1459                        k not in ("port", "comment", "commenturl")):
1460                        _debug("   missing value for %s attribute" % k)
1461                        bad_cookie = True
1462                        break
1463                    standard[k] = v
1464                else:
1465                    rest[k] = v
1466
1467            if bad_cookie:
1468                continue
1469
1470            cookie_tuples.append((name, value, standard, rest))
1471
1472        return cookie_tuples
1473
1474    def _cookie_from_cookie_tuple(self, tup, request):
1475        # standard is dict of standard cookie-attributes, rest is dict of the
1476        # rest of them
1477        name, value, standard, rest = tup
1478
1479        domain = standard.get("domain", Absent)
1480        path = standard.get("path", Absent)
1481        port = standard.get("port", Absent)
1482        expires = standard.get("expires", Absent)
1483
1484        # set the easy defaults
1485        version = standard.get("version", None)
1486        if version is not None:
1487            try:
1488                version = int(version)
1489            except ValueError:
1490                return None  # invalid version, ignore cookie
1491        secure = standard.get("secure", False)
1492        # (discard is also set if expires is Absent)
1493        discard = standard.get("discard", False)
1494        comment = standard.get("comment", None)
1495        comment_url = standard.get("commenturl", None)
1496
1497        # set default path
1498        if path is not Absent and path != "":
1499            path_specified = True
1500            path = escape_path(path)
1501        else:
1502            path_specified = False
1503            path = request_path(request)
1504            i = path.rfind("/")
1505            if i != -1:
1506                if version == 0:
1507                    # Netscape spec parts company from reality here
1508                    path = path[:i]
1509                else:
1510                    path = path[:i+1]
1511            if len(path) == 0: path = "/"
1512
1513        # set default domain
1514        domain_specified = domain is not Absent
1515        # but first we have to remember whether it starts with a dot
1516        domain_initial_dot = False
1517        if domain_specified:
1518            domain_initial_dot = bool(domain.startswith("."))
1519        if domain is Absent:
1520            req_host, erhn = eff_request_host(request)
1521            domain = erhn
1522        elif not domain.startswith("."):
1523            domain = "."+domain
1524
1525        # set default port
1526        port_specified = False
1527        if port is not Absent:
1528            if port is None:
1529                # Port attr present, but has no value: default to request port.
1530                # Cookie should then only be sent back on that port.
1531                port = request_port(request)
1532            else:
1533                port_specified = True
1534                port = re.sub(r"\s+", "", port)
1535        else:
1536            # No port attr present.  Cookie can be sent back on any port.
1537            port = None
1538
1539        # set default expires and discard
1540        if expires is Absent:
1541            expires = None
1542            discard = True
1543        elif expires <= self._now:
1544            # Expiry date in past is request to delete cookie.  This can't be
1545            # in DefaultCookiePolicy, because can't delete cookies there.
1546            try:
1547                self.clear(domain, path, name)
1548            except KeyError:
1549                pass
1550            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1551                   domain, path, name)
1552            return None
1553
1554        return Cookie(version,
1555                      name, value,
1556                      port, port_specified,
1557                      domain, domain_specified, domain_initial_dot,
1558                      path, path_specified,
1559                      secure,
1560                      expires,
1561                      discard,
1562                      comment,
1563                      comment_url,
1564                      rest)
1565
1566    def _cookies_from_attrs_set(self, attrs_set, request):
1567        cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1568
1569        cookies = []
1570        for tup in cookie_tuples:
1571            cookie = self._cookie_from_cookie_tuple(tup, request)
1572            if cookie: cookies.append(cookie)
1573        return cookies
1574
1575    def _process_rfc2109_cookies(self, cookies):
1576        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1577        if rfc2109_as_ns is None:
1578            rfc2109_as_ns = not self._policy.rfc2965
1579        for cookie in cookies:
1580            if cookie.version == 1:
1581                cookie.rfc2109 = True
1582                if rfc2109_as_ns:
1583                    # treat 2109 cookies as Netscape cookies rather than
1584                    # as RFC2965 cookies
1585                    cookie.version = 0
1586
1587    def make_cookies(self, response, request):
1588        """Return sequence of Cookie objects extracted from response object."""
1589        # get cookie-attributes for RFC 2965 and Netscape protocols
1590        headers = response.info()
1591        rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1592        ns_hdrs = headers.get_all("Set-Cookie", [])
1593
1594        rfc2965 = self._policy.rfc2965
1595        netscape = self._policy.netscape
1596
1597        if ((not rfc2965_hdrs and not ns_hdrs) or
1598            (not ns_hdrs and not rfc2965) or
1599            (not rfc2965_hdrs and not netscape) or
1600            (not netscape and not rfc2965)):
1601            return []  # no relevant cookie headers: quick exit
1602
1603        try:
1604            cookies = self._cookies_from_attrs_set(
1605                split_header_words(rfc2965_hdrs), request)
1606        except Exception:
1607            _warn_unhandled_exception()
1608            cookies = []
1609
1610        if ns_hdrs and netscape:
1611            try:
1612                # RFC 2109 and Netscape cookies
1613                ns_cookies = self._cookies_from_attrs_set(
1614                    parse_ns_headers(ns_hdrs), request)
1615            except Exception:
1616                _warn_unhandled_exception()
1617                ns_cookies = []
1618            self._process_rfc2109_cookies(ns_cookies)
1619
1620            # Look for Netscape cookies (from Set-Cookie headers) that match
1621            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1622            # For each match, keep the RFC 2965 cookie and ignore the Netscape
1623            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
1624            # bundled in with the Netscape cookies for this purpose, which is
1625            # reasonable behaviour.
1626            if rfc2965:
1627                lookup = {}
1628                for cookie in cookies:
1629                    lookup[(cookie.domain, cookie.path, cookie.name)] = None
1630
1631                def no_matching_rfc2965(ns_cookie, lookup=lookup):
1632                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1633                    return key not in lookup
1634                ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1635
1636            if ns_cookies:
1637                cookies.extend(ns_cookies)
1638
1639        return cookies
1640
1641    def set_cookie_if_ok(self, cookie, request):
1642        """Set a cookie if policy says it's OK to do so."""
1643        self._cookies_lock.acquire()
1644        try:
1645            self._policy._now = self._now = int(time.time())
1646
1647            if self._policy.set_ok(cookie, request):
1648                self.set_cookie(cookie)
1649
1650
1651        finally:
1652            self._cookies_lock.release()
1653
1654    def set_cookie(self, cookie):
1655        """Set a cookie, without checking whether or not it should be set."""
1656        c = self._cookies
1657        self._cookies_lock.acquire()
1658        try:
1659            if cookie.domain not in c: c[cookie.domain] = {}
1660            c2 = c[cookie.domain]
1661            if cookie.path not in c2: c2[cookie.path] = {}
1662            c3 = c2[cookie.path]
1663            c3[cookie.name] = cookie
1664        finally:
1665            self._cookies_lock.release()
1666
1667    def extract_cookies(self, response, request):
1668        """Extract cookies from response, where allowable given the request."""
1669        _debug("extract_cookies: %s", response.info())
1670        self._cookies_lock.acquire()
1671        try:
1672            self._policy._now = self._now = int(time.time())
1673
1674            for cookie in self.make_cookies(response, request):
1675                if self._policy.set_ok(cookie, request):
1676                    _debug(" setting cookie: %s", cookie)
1677                    self.set_cookie(cookie)
1678        finally:
1679            self._cookies_lock.release()
1680
1681    def clear(self, domain=None, path=None, name=None):
1682        """Clear some cookies.
1683
1684        Invoking this method without arguments will clear all cookies.  If
1685        given a single argument, only cookies belonging to that domain will be
1686        removed.  If given two arguments, cookies belonging to the specified
1687        path within that domain are removed.  If given three arguments, then
1688        the cookie with the specified name, path and domain is removed.
1689
1690        Raises KeyError if no matching cookie exists.
1691
1692        """
1693        if name is not None:
1694            if (domain is None) or (path is None):
1695                raise ValueError(
1696                    "domain and path must be given to remove a cookie by name")
1697            del self._cookies[domain][path][name]
1698        elif path is not None:
1699            if domain is None:
1700                raise ValueError(
1701                    "domain must be given to remove cookies by path")
1702            del self._cookies[domain][path]
1703        elif domain is not None:
1704            del self._cookies[domain]
1705        else:
1706            self._cookies = {}
1707
1708    def clear_session_cookies(self):
1709        """Discard all session cookies.
1710
1711        Note that the .save() method won't save session cookies anyway, unless
1712        you ask otherwise by passing a true ignore_discard argument.
1713
1714        """
1715        self._cookies_lock.acquire()
1716        try:
1717            for cookie in self:
1718                if cookie.discard:
1719                    self.clear(cookie.domain, cookie.path, cookie.name)
1720        finally:
1721            self._cookies_lock.release()
1722
1723    def clear_expired_cookies(self):
1724        """Discard all expired cookies.
1725
1726        You probably don't need to call this method: expired cookies are never
1727        sent back to the server (provided you're using DefaultCookiePolicy),
1728        this method is called by CookieJar itself every so often, and the
1729        .save() method won't save expired cookies anyway (unless you ask
1730        otherwise by passing a true ignore_expires argument).
1731
1732        """
1733        self._cookies_lock.acquire()
1734        try:
1735            now = time.time()
1736            for cookie in self:
1737                if cookie.is_expired(now):
1738                    self.clear(cookie.domain, cookie.path, cookie.name)
1739        finally:
1740            self._cookies_lock.release()
1741
1742    def __iter__(self):
1743        return deepvalues(self._cookies)
1744
1745    def __len__(self):
1746        """Return number of contained cookies."""
1747        i = 0
1748        for cookie in self: i = i + 1
1749        return i
1750
1751    def __repr__(self):
1752        r = []
1753        for cookie in self: r.append(repr(cookie))
1754        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r))
1755
1756    def __str__(self):
1757        r = []
1758        for cookie in self: r.append(str(cookie))
1759        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r))
1760
1761
# Subclasses OSError to stay backwards-compatible with Python 2.4.0.
class LoadError(OSError):
    """Raised when a cookies file is not in the expected format."""
1764
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class does not define a file format itself: save() raises
    NotImplementedError and load() delegates the parsing to
    self._really_load(), which is not defined here.
    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename: default file name used by load() and revert() when they
            are called without one; must be None or string-like
        delayload: stored, converted to bool, as self.delayload
        policy: passed on to CookieJar.__init__

        Raises ValueError if filename is neither None nor string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-type check: anything that can be concatenated with a str
            # is accepted.
            try:
                filename+""
            except Exception:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented in this class; always raises NotImplementedError.
        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The file is opened in text mode and handed, together with
        the two ignore_* flags, to self._really_load().
        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.

        Raises LoadError (or OSError) if reversion is not successful; the
        object's state will not be altered if this happens.  The same holds
        for any other exception escaping load(): the previous cookies are
        put back before the exception is re-raised.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except BaseException:
                # Restore on *any* failure, not just OSError: errors such as
                # a decoding error while reading the file would otherwise
                # leave the jar empty or half-loaded.
                self._cookies = old_state
                raise
1821
1822
def lwp_cookie_str(cookie):
    """Render a Cookie as one header value in the LWP cookie file format.

    The format is actually a slightly extended one -- see module docstring.

    """
    pairs = [(cookie.name, cookie.value),
             ("path", cookie.path),
             ("domain", cookie.domain)]
    if cookie.port is not None:
        pairs.append(("port", cookie.port))

    # Valueless flags, written only when set.
    for flag, is_set in (("path_spec", cookie.path_specified),
                         ("port_spec", cookie.port_specified),
                         ("domain_dot", cookie.domain_initial_dot),
                         ("secure", cookie.secure)):
        if is_set:
            pairs.append((flag, None))

    if cookie.expires:
        pairs.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        pairs.append(("discard", None))

    for label, text in (("comment", cookie.comment),
                        ("commenturl", cookie.comment_url)):
        if text:
            pairs.append((label, text))

    # Non-standard attributes follow in sorted key order, as strings.
    extras = cookie._rest
    pairs.extend((key, str(extras[key])) for key in sorted(extras.keys()))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1850
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        When ignore_discard is false, cookies whose discard attribute is true
        are left out; when ignore_expires is false, cookies that report
        themselves expired at the time of the call are left out.  Every
        header line, including the last, is terminated by a newline.

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename as a "#LWP-Cookies-2.0" file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  ignore_discard and ignore_expires are passed to
        as_lwp_str().  A file created by this call gets mode 0o600, since
        cookies commonly hold credentials; the mode of a file that already
        exists is not changed.
        """
        # Function-scope import, so this method does not rely on the
        # module's top-level import list.
        import os

        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        fd = os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read "Set-Cookie3" lines from the open text file f into the jar.

        f: file object; its first line must match self.magic_re
        filename: used only in error messages
        ignore_discard, ignore_expires: when false, cookies marked discard /
            already expired are skipped instead of being stored

        Lines that do not start with "Set-Cookie3:" are ignored.

        Raises LoadError if the first line does not match self.magic_re or
        if processing a cookie line fails; OSError is passed through.
        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Pre-bind so the error handler below can always format `line`, even
        # if the very first read inside the try block fails.
        line = ""
        try:
            for line in iter(f.readline, ""):
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # a flag written without a value means "set"
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # no usable expiry time: treat as a session cookie
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1969
1970
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    # Pattern the first line of a file must match to be accepted by load.
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    # Text written at the top of every saved file.
    header = """\
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file!  Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read tab-separated cookie lines from the open text file f.

        f: file object; its first line must match self.magic_re
        filename: used only in error messages
        ignore_discard, ignore_expires: when false, cookies marked discard /
            already expired are skipped instead of being stored

        Each data line holds seven tab-separated fields: domain, domain
        flag, path, secure flag, expiry, name and value.  Blank lines and
        lines starting with "#" or "$" are skipped.

        Raises LoadError if the first line does not match self.magic_re or
        if processing a line fails; OSError is passed through.
        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Pre-bind so the error handler below can always format `line`, even
        # if the very first read inside the try block fails.
        line = ""
        try:
            for line in iter(f.readline, ""):
                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"): line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                if (line.strip().startswith(("#", "$")) or
                    line.strip() == ""):
                    continue

                domain, domain_specified, path, secure, expires, name, value = \
                        line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # Explicit check rather than `assert`, so that the file is
                # validated the same way when Python runs with -O.  The
                # handler below turns this into a LoadError.
                if domain_specified != initial_dot:
                    raise ValueError(
                        "domain flag does not agree with the leading dot "
                        "of the domain")

                discard = False
                if expires == "":
                    # no expiry time: treat as a session cookie
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename in the Netscape cookies.txt format.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  When ignore_discard is false, cookies whose discard
        attribute is true are not written; when ignore_expires is false,
        cookies that report themselves expired are not written.  A file
        created by this call gets mode 0o600, since cookies commonly hold
        credentials; the mode of a file that already exists is not changed.
        """
        # Function-scope import, so this method does not rely on the
        # module's top-level import list.
        import os

        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        fd = os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure: secure = "TRUE"
                else: secure = "FALSE"
                if cookie.domain.startswith("."): initial_dot = "TRUE"
                else: initial_dot = "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")
2109