1r"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
13
14                        CookieJar____
15                        /     \      \
16            FileCookieJar      \      \
17             /    |   \         \      \
18 MozillaCookieJar | LWPCookieJar \      \
19                  |               |      \
20                  |   ---MSIEBase |       \
21                  |  /      |     |        \
22                  | /   MSIEDBCookieJar BSDDBCookieJar
23                  |/
24               MSIECookieJar
25
26"""
27
# Public names exported by a star-import of this module.
__all__ = [
    'Cookie',
    'CookieJar',
    'CookiePolicy',
    'DefaultCookiePolicy',
    'FileCookieJar',
    'LWPCookieJar',
    'LoadError',
    'MozillaCookieJar',
]
30
31import copy
32import datetime
33import re
34import time
35import urllib.parse, urllib.request
36try:
37    import threading as _threading
38except ImportError:
39    import dummy_threading as _threading
40import http.client  # only for the default HTTP port
41from calendar import timegm
42
debug = False   # set to True to enable debugging via the logging module
logger = None   # created lazily by _debug() the first time it is needed

def _debug(*args):
    """Pass *args* on to ``logger.debug`` if the module-level debug flag is set.

    The "http.cookiejar" logger is looked up (and the logging module
    imported) only on the first call made while debugging is enabled.
    Returns None when debugging is disabled, otherwise whatever
    ``logger.debug`` returns.
    """
    global logger
    if debug:
        if not logger:
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
54
55
# The default HTTP port, kept as a string.
DEFAULT_HTTP_PORT = "%d" % http.client.HTTP_PORT
# Message text for the case where no filename is available.
MISSING_FILENAME_TEXT = (
    "a filename was not supplied "
    "(nor was the CookieJar instance initialised with one)"
)
59
def _warn_unhandled_exception():
    """Issue a warning carrying the traceback of the exception being handled.

    Intended for the catch-all ``except`` clauses in this module, which
    guard against input that is bad in unexpected ways: anything caught
    there is reported as a probable bug rather than silently dropped.
    The warning is attributed to the caller (stacklevel=2).
    """
    # Imported here because this is only needed on a rare error path.
    import io
    import traceback
    import warnings

    with io.StringIO() as buffer:
        traceback.print_exc(file=buffer)
        report = buffer.getvalue()
    warnings.warn("http.cookiejar bug!\n%s" % report, stacklevel=2)
69
70
71# Date/time conversion
72# -----------------------------------------------------------------------------
73
EPOCH_YEAR = 1970
def _timegm(tt):
    """Return seconds since the epoch for the UTC time tuple *tt*, or None.

    Only the first six items of *tt* (year, month, day of month, hour,
    minute, second) are range-checked.  None is returned when the year
    is before EPOCH_YEAR or any field is outside the accepted range;
    otherwise the whole tuple is handed to calendar.timegm().
    """
    year, month, mday, hour, minute, second = tt[:6]
    in_range = (
        year >= EPOCH_YEAR
        and 1 <= month <= 12
        and 1 <= mday <= 31
        and 0 <= hour <= 24
        and 0 <= minute <= 59
        and 0 <= second <= 61
    )
    if not in_range:
        return None
    return timegm(tt)
82
# Weekday abbreviations, indexed by datetime.weekday() (Monday is 0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Month abbreviations; index 0 is January.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased month abbreviations, for case-insensitive month-name lookup.
# Built with a comprehension so that no loop variable is left behind in the
# module namespace.
MONTHS_LOWER = [month.lower() for month in MONTHS]
88
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Timezone-aware constructors are used instead of the deprecated
    # utcnow() / utcfromtimestamp(); the formatted fields are the same
    # UTC values.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
107
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Timezone-aware constructors are used instead of the deprecated
    # utcnow() / utcfromtimestamp(); the formatted fields are the same
    # UTC values.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
126
127
# Timezone names that mean "no offset from UTC" (only the keys are used).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric timezone: optional sign, one or two hour digits, optional colon,
# optional two minute digits.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
    """Return the offset from UTC, in seconds, described by the string *tz*.

    *tz* is either one of the names in UTC_ZONES (offset 0) or a numeric
    zone such as "-0800", "+01:30" or "5".  None is returned when *tz* is
    neither.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if match is None:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    return -seconds if sign == '-' else seconds
144
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time fields to seconds since the epoch.

    The arguments are the field strings captured by the date regular
    expressions in this module.  hr, min, sec and tz may be None when the
    corresponding part was absent: missing clock fields count as zero and
    a missing tz is treated as UTC.  mon may be a month-name abbreviation
    (in any case) or a month number.  A fractional part on sec is
    truncated.

    Returns None if the year is greater than datetime.MAXYEAR, the month
    is not recognised, the fields are rejected by _timegm(), or the
    timezone string is not understood by offset_from_tz_string().
    """
    yr = int(yr)
    if yr > datetime.MAXYEAR:
        return None

    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if not 1 <= imon <= 12:
            return None
        mon = imon

    # a missing clock element counts as zero
    day = int(day)
    hr = int(hr) if hr is not None else 0
    min = int(min) if min is not None else 0
    # ISO_DATE_RE captures seconds together with an optional fractional
    # part (e.g. "29.5"), which int() alone rejects with ValueError; go
    # through float() and truncate instead.
    sec = int(float(sec)) if sec is not None else 0

    if yr < 1000:
        # find "obvious" year: place the short year in the current century,
        # then shift by a century if that lands more than 50 years away
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))
    if t is None:
        return None

    # adjust time using timezone string, to get absolute time since epoch
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
200
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of text is unrecognized, the time is
    outside the representable range, or the timezone string is not
    recognized.  If the string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        day, mon, yr, hr, minute, sec = m.groups()
        try:
            mon = MONTHS_LOWER.index(mon.lower()) + 1
        except ValueError:
            # The strict pattern only checks the *shape* of the month
            # ("Jxx" passes); an unknown name is an unrecognized format,
            # not an error.
            return None
        # All fields are plain digit runs, so int() keeps the documented
        # integer return type.
        return _timegm((int(yr), mon, int(day),
                        int(hr), int(minute), int(sec)))

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, minute, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
278
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Leading whitespace is skipped.  None is returned when the text does
    not match the ISO pattern.
    """
    match = ISO_DATE_RE.search(text.lstrip())
    if match is None:
        return None  # bad format

    # The pattern has eight groups; the last one is a nested piece of the
    # timezone group and is deliberately ignored here.
    # XXX is ignoring it the right thing to do?
    yr, mon, day, hr, minute, sec, tz, _ = match.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
323
324
325# Header parsing
326# -----------------------------------------------------------------------------
327
def unmatched(match):
    """Return the searched string with the span matched by *match* removed."""
    begin, finish = match.span()
    subject = match.string
    return subject[:begin] + subject[finish:]
332
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    header_values is a sequence of header value strings (not a single
    string).  "," separates headers, ";" or whitespace separates the
    items of one header, and "=" joins a key to its value; a value may be
    a quoted string, in which case backslash escapes are removed.
    Several header values are treated as if they had been joined by ",".

    The syntax handled is (BNF as from the HTTP/1.1 specification, but with
    the requirement for tokens relaxed):

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header becomes one list of (key, value) pairs; a lone token (no
    "=") gets the value None.  Headers that yield no pairs are dropped.
    Syntactically incorrect headers will not necessarily be parsed as you
    would want.

    Examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            token = HEADER_TOKEN_RE.search(text)
            if token is None:
                stripped = text.lstrip()
                if stripped.startswith(","):
                    # concatenated headers, as per RFC 2616 section 4.2:
                    # close the current header and start a new one
                    text = stripped[1:]
                    if pairs:
                        result.append(pairs)
                    pairs = []
                else:
                    # skip junk
                    non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                    assert nr_junk_chars > 0, (
                        "split_header_words bug: '%s', '%s', %s" %
                        (orig_text, text, pairs))
                    text = non_junk
                continue

            # consume the token, then look for a value following it
            name = token.group(1)
            text = text[:token.start()] + text[token.end():]

            quoted = HEADER_QUOTED_VALUE_RE.search(text)
            if quoted:
                # quoted value: drop the quotes and undo backslash escapes
                text = text[:quoted.start()] + text[quoted.end():]
                value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
            else:
                plain = HEADER_VALUE_RE.search(text)
                if plain:
                    # unquoted value
                    text = text[:plain.start()] + text[plain.end():]
                    value = plain.group(1).rstrip()
                else:
                    # no value, a lone token
                    value = None
            pairs.append((name, value))
        if pairs:
            result.append(pairs)
    return result
421
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    *lists* is a list of lists of (key, value) pairs.  The pairs of one
    inner list are joined with "; ", the resulting headers with ", ".
    A value of None yields the bare key; any other value is written as
    key=value, and is double-quoted (with embedded quotes and backslashes
    escaped) unless it consists solely of word characters.  Empty inner
    lists contribute nothing.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    def render(key, value):
        # One "key" or "key=value" item, quoting the value if needed.
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            escaped = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)  # escape " and \
            value = '"%s"' % escaped
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        items = [render(key, value) for key, value in pairs]
        if items:
            headers.append("; ".join(items))
    return ", ".join(headers)
447
def strip_quotes(text):
    """Return *text* without one leading and/or one trailing double quote.

    The two ends are handled independently, so an unbalanced quote is
    removed as well.
    """
    start = 1 if text.startswith('"') else 0
    text = text[start:]
    end = len(text) - 1 if text.endswith('"') else len(text)
    return text[:end]
454
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so this ad-hoc parser is used
    instead of split_header_words.  It is also used for RFC 2109 cookies.

    Each header string is split on ";" and each piece on its first "=".
    The result is a list with one list of (key, value) pairs per header:

    * the first pair is the cookie name and value, with the key's case
      preserved; a header whose first key is empty is dropped entirely
    * later pieces with an empty key are skipped
    * later keys that are known attribute names are lower-cased
    * a value is None when the piece had no "=", and "" when it had
      nothing after the "="
    * a "version" value has its quotes stripped
    * an "expires" value is converted to seconds since the epoch with
      http2time (None if it cannot be parsed)
    * ("version", "0") is appended when no version attribute was seen

    XXX This may not make the best possible effort to parse everything
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, and could be followed if this ever gives
    any trouble.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for index, param in enumerate(ns_header.split(';')):
            key, sep, val = param.strip().partition('=')
            key = key.strip()

            if not key:
                if index == 0:
                    # no cookie name: ignore the whole header
                    break
                continue

            # allow for a distinction between present and empty and missing
            # altogether
            val = val.strip() if sep else None

            if index > 0:
                lowered = key.lower()
                if lowered in known_attrs:
                    key = lowered

                if key == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                    if val is not None:
                        val = strip_quotes(val)
                elif key == "expires" and val is not None:
                    # convert expires date to seconds since epoch
                    val = http2time(strip_quotes(val))  # None if invalid
            pairs.append((key, val))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
521
522
# Matches text ending in a dot followed by digits, which is taken as the
# sign of a dotted IP address rather than a host domain name.
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # A must have the form NB, i.e. B must be a *suffix* of A.  Merely
    # containing B somewhere (as in "x.y.com.evil.org" vs ".y.com") is not
    # a match.  N is necessarily non-empty here because A != B.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
577
def liberal_is_HDN(text):
    """Return True if text is sort-of-like a host domain name.

    Used when accepting/blocking domains: anything that does not look
    like a dotted IP address (per IPV4_RE) is accepted.

    """
    return not IPV4_RE.search(text)
587
def user_domain_match(A, B):
    """Return True if A matches the user-supplied domain B.

    For blocking/accepting domains.  A and B may be host domain names or
    IP addresses; the comparison is case-insensitive.  If B starts with a
    dot, A matches when it ends with B; otherwise (and whenever either
    side looks like an IP address) the two must be equal.

    """
    A = A.lower()
    B = B.lower()
    if liberal_is_HDN(A) and liberal_is_HDN(B):
        if B.startswith("."):
            return A.endswith(B)
        return A == B
    # at least one side looks like an IP address: only equal
    # addresses match
    return A == B
607
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of the request's
    full URL, falling back to the request's "Host" header when the URL
    has none.  A trailing ":port" is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url()).netloc
    host = netloc if netloc != "" else request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", host, 1).lower()
624
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective
    name is the request-host with ".local" appended when the request-host
    contains no dot; otherwise the two are the same.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
635
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path of the request's full URL is passed through escape_path()
    and given a leading "/" if it lacks one.
    """
    raw_path = urllib.parse.urlsplit(request.get_full_url()).path
    path = escape_path(raw_path)
    # fix bad RFC 2396 absoluteURI
    return path if path.startswith("/") else "/" + path
645
def request_port(request):
    """Return the port of request.host as a string.

    The port is whatever follows the first ":" in request.host.  If there
    is no ":", DEFAULT_HTTP_PORT is returned.  If the text after the ":"
    is not an integer, the problem is logged through _debug() and None is
    returned.
    """
    _, colon, port = request.host.partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
659
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %XX escape; the group captures the two hex digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %XX escape captured by *match* with upper-case hex digits."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
679
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: everything before it is A, the rest is B.
    _, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
714
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    Here U is request_host(request) and O is request.origin_req_host.

    """
    req_host = request_host(request)
    return not domain_match(req_host, reach(request.origin_req_host))
730
731
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    It is deliberately a very simple class that just holds attributes, so
    it is possible to construct Cookie instances that don't comply with
    the cookie standards.  CookieJar.make_cookies is the factory function
    for Cookie objects -- it deals with cookie parsing, supplying
    defaults, and normalising to the representation used in this class.
    CookiePolicy is responsible for checking them to see whether they
    should be accepted from and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version is converted with int() and expires with int(float())
        unless they are None; domain is lower-cased; rest (a mapping of
        nonstandard cookie-attributes) is shallow-copied.

        Raises ValueError if port is None while port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(float(expires))
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # private copy, so later changes don't touch the caller's mapping
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the nonstandard cookie-attribute *name* is present."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the nonstandard cookie-attribute *name*, or *default*."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the nonstandard cookie-attribute *name* to *value*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= *now*.

        *now* defaults to the current time; a cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        return self.expires is not None and self.expires <= now

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        # Constructor arguments, in signature order; "rest" is stored under
        # a different attribute name, so it is added separately.
        names = ("version", "name", "value",
                 "port", "port_specified",
                 "domain", "domain_specified", "domain_initial_dot",
                 "path", "path_specified",
                 "secure", "expires", "discard", "comment", "comment_url",
                 )
        parts = ["%s=%r" % (name, getattr(self, name)) for name in names]
        parts.append("rest=%r" % (self._rest,))
        parts.append("rfc2109=%r" % (self.rfc2109,))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(parts))
828
829
class CookiePolicy:
    """Interface deciding which cookies are accepted from and returned to a server.

    A policy may also modify cookies, though this is probably a bad idea.

    set_ok() and return_ok() must be provided by subclasses.  The
    subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customized policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation accepts every domain.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation accepts every path.
        """
        return True
861
862
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    Both Netscape (version 0) and RFC 2965 (version > 0) cookies are
    handled.  Which protocols are switched on, and how strictly each rule is
    applied, is controlled by the constructor arguments; each one is stored
    as an instance attribute of the same name, except for the domain lists,
    which are reached through blocked_domains() / allowed_domains().

    The Domain* class attributes are bit flags for strict_ns_domain.
    """

    # Reject a Netscape cookie whose host prefix (the request host with the
    # cookie domain stripped off) contains a dot -- see set_ok_domain().
    DomainStrictNoDots = 1
    # Only return a Netscape cookie that had no explicit domain to a host
    # that string-compares equal to its domain -- see return_ok_domain().
    DomainStrictNonDomain = 2
    # Apply RFC 2965 domain-matching when setting Netscape cookies too.
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only.

        blocked_domains: sequence of domains never to accept cookies from
            nor return cookies to; None means no domain is blocked.
        allowed_domains: None (no allow-list restriction) or a sequence of
            the only domains that are acceptable.
        netscape: if false, version 0 cookies are neither set nor returned.
        rfc2965: if false, cookies with version > 0 are neither set nor
            returned.
        rfc2109_as_netscape, hide_cookie2: only stored here; they are not
            consulted by any method of this class.
        strict_domain: refuse cookies for domains that look like a
            country-code second level domain (e.g. ".co.uk").
        strict_rfc2965_unverifiable, strict_ns_unverifiable: refuse
            third-party cookies during unverifiable transactions, for
            RFC 2965 and Netscape cookies respectively.
        strict_ns_domain: combination of the Domain* flags of this class.
        strict_ns_set_initial_dollar: refuse Netscape cookies whose name
            starts with "$".
        strict_ns_set_path: refuse Netscape cookies whose path does not
            path-match the request URI.
        """
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain matches an entry of the block-list."""
        return any(user_domain_match(domain, blocked_domain)
                   for blocked_domain in self._blocked_domains)

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains
    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return True if an allow-list is set and domain matches none of it.

        With no allow-list (None) every domain is allowed, so the result is
        False.
        """
        if self._allowed_domains is None:
            return False
        return not any(user_domain_match(domain, allowed_domain)
                       for allowed_domain in self._allowed_domains)

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        The cookie is accepted only if every set_ok_<check>() method, tried
        in a fixed order, returns true.

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Return False if the cookie's protocol version is missing or off."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug("   Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Return False for a third-party cookie the strict flags forbid."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug("   third-party RFC 2965 cookie during "
                             "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug("   third-party Netscape cookie during "
                             "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Return False for a "$"-prefixed Netscape name in strict mode."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug("   illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Return False if an explicit cookie path must match and does not.

        The check applies to RFC 2965 cookies always, and to Netscape
        cookies only when strict_ns_set_path is set.
        """
        if cookie.path_specified:
            req_path = request_path(request)
            # Delegate to path_return_ok() so that setting and returning use
            # the same path-match rule (a bare prefix test would let "/foo"
            # cover "/foobar").
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not self.path_return_ok(cookie.path, request)):
                _debug("   path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Return False if the cookie's domain is unacceptable for request.

        Block-list and allow-list are always applied; the remaining checks
        only apply when the server specified the domain explicitly.
        """
        if self.is_blocked(cookie.domain):
            _debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug("   country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug("   non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug("   effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug("   effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug("   host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """Return False if an explicit port list is bad or lacks our port.

        Every entry of the comma-separated cookie port list up to the
        matching one must be numeric, and one entry must equal the request
        port (taken to be "80" when the request port is unknown).
        """
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug("   bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                _debug("   request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        The cookie is returned only if every return_ok_<check>() method,
        tried in a fixed order, returns true.

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Return False if the cookie's protocol is switched off."""
        if cookie.version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Return False for a third-party cookie the strict flags forbid."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug("   third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug("   third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Return False for a secure cookie on a request that isn't https."""
        if cookie.secure and request.type != "https":
            _debug("   secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Return False if the cookie has expired as of self._now.

        self._now is not set by this class; the caller must have assigned
        it before return_ok() is consulted.
        """
        if cookie.is_expired(self._now):
            _debug("   cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Return False if the cookie has a port list lacking our port."""
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug("   request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Return False if the cookie's domain does not match the request."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug("   cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug("   effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(domain):
            _debug("   request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Return False if no cookie stored under domain may be returned.

        This is a cheap pre-filter applied per cookie domain, before any
        individual cookie is examined: the request host must end with the
        domain (compared on a label boundary), and the domain must pass the
        block-list and allow-list.
        """
        # Liberal check of domain.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        # Compare against the dotted form of the cookie domain, so that a
        # cookie stored for "ample.com" is not treated as matching a request
        # to "example.com" merely because it is a string suffix.
        if domain and not domain.startswith("."):
            dotdomain = "." + domain
        else:
            dotdomain = domain
        if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
            return False

        if self.is_blocked(domain):
            _debug("   domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug("   domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return True if the request path path-matches the cookie path.

        The paths match when they are equal, or when the cookie path is a
        prefix of the request path that ends on a "/" boundary -- so "/foo"
        covers "/foo/bar" but not "/foobar".
        """
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        pathlen = len(path)
        if req_path == path:
            return True
        elif (req_path.startswith(path) and
              (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
            return True

        _debug("  %s does not path-match %s", req_path, path)
        return False
1198
1199
def vals_sorted_by_key(adict):
    """Return an iterator over adict's values, in sorted order of their keys."""
    ordered_keys = list(adict.keys())
    ordered_keys.sort()
    return map(adict.get, ordered_keys)
1203
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    A value counts as a nested mapping when it has an ``items`` attribute;
    such values are descended into rather than yielded themselves.
    """
    for obj in vals_sorted_by_key(mapping):
        if hasattr(obj, "items"):
            yield from deepvalues(obj)
        else:
            yield obj
1218
1219
# Sentinel passed as the default argument of dict.get(), so that a missing
# key can be told apart from a key that is present with a None value.
class Absent:
    pass
1223
1224class CookieJar:
1225    """Collection of HTTP cookies.
1226
1227    You may not need to know about this class: try
1228    urllib.request.build_opener(HTTPCookieProcessor).open(url).
1229    """
1230
1231    non_word_re = re.compile(r"\W")
1232    quote_re = re.compile(r"([\"\\])")
1233    strict_domain_re = re.compile(r"\.?[^.]*")
1234    domain_re = re.compile(r"[^.]*")
1235    dots_re = re.compile(r"^\.+")
1236
1237    magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
1238
1239    def __init__(self, policy=None):
1240        if policy is None:
1241            policy = DefaultCookiePolicy()
1242        self._policy = policy
1243
1244        self._cookies_lock = _threading.RLock()
1245        self._cookies = {}
1246
1247    def set_policy(self, policy):
1248        self._policy = policy
1249
1250    def _cookies_for_domain(self, domain, request):
1251        cookies = []
1252        if not self._policy.domain_return_ok(domain, request):
1253            return []
1254        _debug("Checking %s for cookies to return", domain)
1255        cookies_by_path = self._cookies[domain]
1256        for path in cookies_by_path.keys():
1257            if not self._policy.path_return_ok(path, request):
1258                continue
1259            cookies_by_name = cookies_by_path[path]
1260            for cookie in cookies_by_name.values():
1261                if not self._policy.return_ok(cookie, request):
1262                    _debug("   not returning cookie")
1263                    continue
1264                _debug("   it's a match")
1265                cookies.append(cookie)
1266        return cookies
1267
1268    def _cookies_for_request(self, request):
1269        """Return a list of cookies to be returned to server."""
1270        cookies = []
1271        for domain in self._cookies.keys():
1272            cookies.extend(self._cookies_for_domain(domain, request))
1273        return cookies
1274
1275    def _cookie_attrs(self, cookies):
1276        """Return a list of cookie-attributes to be returned to server.
1277
1278        like ['foo="bar"; $Path="/"', ...]
1279
1280        The $Version attribute is also added when appropriate (currently only
1281        once per request).
1282
1283        """
1284        # add cookies in order of most specific (ie. longest) path first
1285        cookies.sort(key=lambda a: len(a.path), reverse=True)
1286
1287        version_set = False
1288
1289        attrs = []
1290        for cookie in cookies:
1291            # set version of Cookie header
1292            # XXX
1293            # What should it be if multiple matching Set-Cookie headers have
1294            #  different versions themselves?
1295            # Answer: there is no answer; was supposed to be settled by
1296            #  RFC 2965 errata, but that may never appear...
1297            version = cookie.version
1298            if not version_set:
1299                version_set = True
1300                if version > 0:
1301                    attrs.append("$Version=%s" % version)
1302
1303            # quote cookie value if necessary
1304            # (not for Netscape protocol, which already has any quotes
1305            #  intact, due to the poorly-specified Netscape Cookie: syntax)
1306            if ((cookie.value is not None) and
1307                self.non_word_re.search(cookie.value) and version > 0):
1308                value = self.quote_re.sub(r"\\\1", cookie.value)
1309            else:
1310                value = cookie.value
1311
1312            # add cookie-attributes to be returned in Cookie header
1313            if cookie.value is None:
1314                attrs.append(cookie.name)
1315            else:
1316                attrs.append("%s=%s" % (cookie.name, value))
1317            if version > 0:
1318                if cookie.path_specified:
1319                    attrs.append('$Path="%s"' % cookie.path)
1320                if cookie.domain.startswith("."):
1321                    domain = cookie.domain
1322                    if (not cookie.domain_initial_dot and
1323                        domain.startswith(".")):
1324                        domain = domain[1:]
1325                    attrs.append('$Domain="%s"' % domain)
1326                if cookie.port is not None:
1327                    p = "$Port"
1328                    if cookie.port_specified:
1329                        p = p + ('="%s"' % cookie.port)
1330                    attrs.append(p)
1331
1332        return attrs
1333
1334    def add_cookie_header(self, request):
1335        """Add correct Cookie: header to request (urllib.request.Request object).
1336
1337        The Cookie2 header is also added unless policy.hide_cookie2 is true.
1338
1339        """
1340        _debug("add_cookie_header")
1341        self._cookies_lock.acquire()
1342        try:
1343
1344            self._policy._now = self._now = int(time.time())
1345
1346            cookies = self._cookies_for_request(request)
1347
1348            attrs = self._cookie_attrs(cookies)
1349            if attrs:
1350                if not request.has_header("Cookie"):
1351                    request.add_unredirected_header(
1352                        "Cookie", "; ".join(attrs))
1353
1354            # if necessary, advertise that we know RFC 2965
1355            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1356                not request.has_header("Cookie2")):
1357                for cookie in cookies:
1358                    if cookie.version != 1:
1359                        request.add_unredirected_header("Cookie2", '$Version="1"')
1360                        break
1361
1362        finally:
1363            self._cookies_lock.release()
1364
1365        self.clear_expired_cookies()
1366
1367    def _normalized_cookie_tuples(self, attrs_set):
1368        """Return list of tuples containing normalised cookie information.
1369
1370        attrs_set is the list of lists of key,value pairs extracted from
1371        the Set-Cookie or Set-Cookie2 headers.
1372
1373        Tuples are name, value, standard, rest, where name and value are the
1374        cookie name and value, standard is a dictionary containing the standard
1375        cookie-attributes (discard, secure, version, expires or max-age,
1376        domain, path and port) and rest is a dictionary containing the rest of
1377        the cookie-attributes.
1378
1379        """
1380        cookie_tuples = []
1381
1382        boolean_attrs = "discard", "secure"
1383        value_attrs = ("version",
1384                       "expires", "max-age",
1385                       "domain", "path", "port",
1386                       "comment", "commenturl")
1387
1388        for cookie_attrs in attrs_set:
1389            name, value = cookie_attrs[0]
1390
1391            # Build dictionary of standard cookie-attributes (standard) and
1392            # dictionary of other cookie-attributes (rest).
1393
1394            # Note: expiry time is normalised to seconds since epoch.  V0
1395            # cookies should have the Expires cookie-attribute, and V1 cookies
1396            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1397            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1398            # accept either (but prefer Max-Age).
1399            max_age_set = False
1400
1401            bad_cookie = False
1402
1403            standard = {}
1404            rest = {}
1405            for k, v in cookie_attrs[1:]:
1406                lc = k.lower()
1407                # don't lose case distinction for unknown fields
1408                if lc in value_attrs or lc in boolean_attrs:
1409                    k = lc
1410                if k in boolean_attrs and v is None:
1411                    # boolean cookie-attribute is present, but has no value
1412                    # (like "discard", rather than "port=80")
1413                    v = True
1414                if k in standard:
1415                    # only first value is significant
1416                    continue
1417                if k == "domain":
1418                    if v is None:
1419                        _debug("   missing value for domain attribute")
1420                        bad_cookie = True
1421                        break
1422                    # RFC 2965 section 3.3.3
1423                    v = v.lower()
1424                if k == "expires":
1425                    if max_age_set:
1426                        # Prefer max-age to expires (like Mozilla)
1427                        continue
1428                    if v is None:
1429                        _debug("   missing or invalid value for expires "
1430                              "attribute: treating as session cookie")
1431                        continue
1432                if k == "max-age":
1433                    max_age_set = True
1434                    try:
1435                        v = int(v)
1436                    except ValueError:
1437                        _debug("   missing or invalid (non-numeric) value for "
1438                              "max-age attribute")
1439                        bad_cookie = True
1440                        break
1441                    # convert RFC 2965 Max-Age to seconds since epoch
1442                    # XXX Strictly you're supposed to follow RFC 2616
1443                    #   age-calculation rules.  Remember that zero Max-Age
1444                    #   is a request to discard (old and new) cookie, though.
1445                    k = "expires"
1446                    v = self._now + v
1447                if (k in value_attrs) or (k in boolean_attrs):
1448                    if (v is None and
1449                        k not in ("port", "comment", "commenturl")):
1450                        _debug("   missing value for %s attribute" % k)
1451                        bad_cookie = True
1452                        break
1453                    standard[k] = v
1454                else:
1455                    rest[k] = v
1456
1457            if bad_cookie:
1458                continue
1459
1460            cookie_tuples.append((name, value, standard, rest))
1461
1462        return cookie_tuples
1463
    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised cookie tuple, or return None.

        tup is a (name, value, standard, rest) tuple; request supplies the
        defaults for attributes the server left out (path, domain, port).

        None is returned when the version attribute is not an integer, and
        when the expiry time is not later than self._now -- in the latter
        case any stored cookie with the same domain, path and name is
        cleared from the jar as well.
        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        # Absent (rather than None) marks attributes that were not sent at
        # all, since a sent attribute may legitimately have a None value.
        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                return None  # invalid version, ignore cookie
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            # No usable path sent: derive one from the request path, cut at
            # its last "/".
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0: path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            # An explicitly sent domain is always stored with a leading dot.
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                # strip all whitespace from the port list
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                # nothing stored under that domain/path/name: nothing to do
                pass
            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                   domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)
1555
1556    def _cookies_from_attrs_set(self, attrs_set, request):
1557        cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1558
1559        cookies = []
1560        for tup in cookie_tuples:
1561            cookie = self._cookie_from_cookie_tuple(tup, request)
1562            if cookie: cookies.append(cookie)
1563        return cookies
1564
1565    def _process_rfc2109_cookies(self, cookies):
1566        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1567        if rfc2109_as_ns is None:
1568            rfc2109_as_ns = not self._policy.rfc2965
1569        for cookie in cookies:
1570            if cookie.version == 1:
1571                cookie.rfc2109 = True
1572                if rfc2109_as_ns:
1573                    # treat 2109 cookies as Netscape cookies rather than
1574                    # as RFC2965 cookies
1575                    cookie.version = 0
1576
1577    def make_cookies(self, response, request):
1578        """Return sequence of Cookie objects extracted from response object."""
1579        # get cookie-attributes for RFC 2965 and Netscape protocols
1580        headers = response.info()
1581        rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1582        ns_hdrs = headers.get_all("Set-Cookie", [])
1583
1584        rfc2965 = self._policy.rfc2965
1585        netscape = self._policy.netscape
1586
1587        if ((not rfc2965_hdrs and not ns_hdrs) or
1588            (not ns_hdrs and not rfc2965) or
1589            (not rfc2965_hdrs and not netscape) or
1590            (not netscape and not rfc2965)):
1591            return []  # no relevant cookie headers: quick exit
1592
1593        try:
1594            cookies = self._cookies_from_attrs_set(
1595                split_header_words(rfc2965_hdrs), request)
1596        except Exception:
1597            _warn_unhandled_exception()
1598            cookies = []
1599
1600        if ns_hdrs and netscape:
1601            try:
1602                # RFC 2109 and Netscape cookies
1603                ns_cookies = self._cookies_from_attrs_set(
1604                    parse_ns_headers(ns_hdrs), request)
1605            except Exception:
1606                _warn_unhandled_exception()
1607                ns_cookies = []
1608            self._process_rfc2109_cookies(ns_cookies)
1609
1610            # Look for Netscape cookies (from Set-Cookie headers) that match
1611            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1612            # For each match, keep the RFC 2965 cookie and ignore the Netscape
1613            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
1614            # bundled in with the Netscape cookies for this purpose, which is
1615            # reasonable behaviour.
1616            if rfc2965:
1617                lookup = {}
1618                for cookie in cookies:
1619                    lookup[(cookie.domain, cookie.path, cookie.name)] = None
1620
1621                def no_matching_rfc2965(ns_cookie, lookup=lookup):
1622                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1623                    return key not in lookup
1624                ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1625
1626            if ns_cookies:
1627                cookies.extend(ns_cookies)
1628
1629        return cookies
1630
1631    def set_cookie_if_ok(self, cookie, request):
1632        """Set a cookie if policy says it's OK to do so."""
1633        self._cookies_lock.acquire()
1634        try:
1635            self._policy._now = self._now = int(time.time())
1636
1637            if self._policy.set_ok(cookie, request):
1638                self.set_cookie(cookie)
1639
1640
1641        finally:
1642            self._cookies_lock.release()
1643
1644    def set_cookie(self, cookie):
1645        """Set a cookie, without checking whether or not it should be set."""
1646        c = self._cookies
1647        self._cookies_lock.acquire()
1648        try:
1649            if cookie.domain not in c: c[cookie.domain] = {}
1650            c2 = c[cookie.domain]
1651            if cookie.path not in c2: c2[cookie.path] = {}
1652            c3 = c2[cookie.path]
1653            c3[cookie.name] = cookie
1654        finally:
1655            self._cookies_lock.release()
1656
1657    def extract_cookies(self, response, request):
1658        """Extract cookies from response, where allowable given the request."""
1659        _debug("extract_cookies: %s", response.info())
1660        self._cookies_lock.acquire()
1661        try:
1662            self._policy._now = self._now = int(time.time())
1663
1664            for cookie in self.make_cookies(response, request):
1665                if self._policy.set_ok(cookie, request):
1666                    _debug(" setting cookie: %s", cookie)
1667                    self.set_cookie(cookie)
1668        finally:
1669            self._cookies_lock.release()
1670
1671    def clear(self, domain=None, path=None, name=None):
1672        """Clear some cookies.
1673
1674        Invoking this method without arguments will clear all cookies.  If
1675        given a single argument, only cookies belonging to that domain will be
1676        removed.  If given two arguments, cookies belonging to the specified
1677        path within that domain are removed.  If given three arguments, then
1678        the cookie with the specified name, path and domain is removed.
1679
1680        Raises KeyError if no matching cookie exists.
1681
1682        """
1683        if name is not None:
1684            if (domain is None) or (path is None):
1685                raise ValueError(
1686                    "domain and path must be given to remove a cookie by name")
1687            del self._cookies[domain][path][name]
1688        elif path is not None:
1689            if domain is None:
1690                raise ValueError(
1691                    "domain must be given to remove cookies by path")
1692            del self._cookies[domain][path]
1693        elif domain is not None:
1694            del self._cookies[domain]
1695        else:
1696            self._cookies = {}
1697
1698    def clear_session_cookies(self):
1699        """Discard all session cookies.
1700
1701        Note that the .save() method won't save session cookies anyway, unless
1702        you ask otherwise by passing a true ignore_discard argument.
1703
1704        """
1705        self._cookies_lock.acquire()
1706        try:
1707            for cookie in self:
1708                if cookie.discard:
1709                    self.clear(cookie.domain, cookie.path, cookie.name)
1710        finally:
1711            self._cookies_lock.release()
1712
1713    def clear_expired_cookies(self):
1714        """Discard all expired cookies.
1715
1716        You probably don't need to call this method: expired cookies are never
1717        sent back to the server (provided you're using DefaultCookiePolicy),
1718        this method is called by CookieJar itself every so often, and the
1719        .save() method won't save expired cookies anyway (unless you ask
1720        otherwise by passing a true ignore_expires argument).
1721
1722        """
1723        self._cookies_lock.acquire()
1724        try:
1725            now = time.time()
1726            for cookie in self:
1727                if cookie.is_expired(now):
1728                    self.clear(cookie.domain, cookie.path, cookie.name)
1729        finally:
1730            self._cookies_lock.release()
1731
1732    def __iter__(self):
1733        return deepvalues(self._cookies)
1734
1735    def __len__(self):
1736        """Return number of contained cookies."""
1737        i = 0
1738        for cookie in self: i = i + 1
1739        return i
1740
1741    def __repr__(self):
1742        r = []
1743        for cookie in self: r.append(repr(cookie))
1744        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r))
1745
1746    def __str__(self):
1747        r = []
1748        for cookie in self: r.append(str(cookie))
1749        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r))
1750
1751
# Derives from OSError for backwards-compatibility with Python 2.4.0.
class LoadError(OSError):
    """Raised when a cookie file cannot be loaded."""
1754
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This base class does not define a file format: save() raises
    NotImplementedError and load() delegates the parsing to
    self._really_load(), which is not defined in this class.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename: default file name used by load() and revert() when they are
          called without one; must be string-like if given, otherwise
          ValueError is raised.
        delayload: stored on the instance, coerced to bool.
        policy: passed through to CookieJar.__init__.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename+""
            except Exception:
                # Not a bare except: KeyboardInterrupt and SystemExit must not
                # be reported as a bad filename.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here; always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  The opened file is handed to self._really_load()
        together with filename, ignore_discard and ignore_expires.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        filename defaults to self.filename; ValueError is raised if neither
        is available.

        Raises LoadError (or OSError) if reversion is not successful.  If
        loading fails with any exception, the cookies held before the call
        are put back before the exception is re-raised, so the object's
        state is not altered.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except Exception:
                # Roll back on every failure, not only OSError: an error of
                # another type raised while loading would otherwise leave the
                # jar emptied or partially filled.
                self._cookies = old_state
                raise
1811
1812
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    The cookie is turned into a list of (key, value) pairs and joined with
    join_header_words().  The name/value pair comes first, followed by path
    and domain, the optional attributes, the entries of cookie._rest in
    sorted key order, and finally the version.  Flag attributes are written
    with a value of None.

    """
    fields = [
        (cookie.name, cookie.value),
        ("path", cookie.path),
        ("domain", cookie.domain),
    ]
    if cookie.port is not None:
        fields.append(("port", cookie.port))
    if cookie.path_specified:
        fields.append(("path_spec", None))
    if cookie.port_specified:
        fields.append(("port_spec", None))
    if cookie.domain_initial_dot:
        fields.append(("domain_dot", None))
    if cookie.secure:
        fields.append(("secure", None))
    if cookie.expires:
        expires_text = time2isoz(float(cookie.expires))
        fields.append(("expires", expires_text))
    if cookie.discard:
        fields.append(("discard", None))
    if cookie.comment:
        fields.append(("comment", cookie.comment))
    if cookie.comment_url:
        fields.append(("commenturl", cookie.comment_url))

    # Non-standard attributes, in a stable (sorted) order.
    for key in sorted(cookie._rest.keys()):
        fields.append((key, str(cookie._rest[key])))

    fields.append(("version", str(cookie.version)))

    return join_header_words([fields])
1840
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard: if false, cookies whose discard attribute is true are
          left out.
        ignore_expires: if false, cookies that report is_expired() for the
          current time are left out.

        Every header line, including the last, ends with a newline; an empty
        jar yields an empty string.

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename in Set-Cookie3 (LWP) format.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  ignore_discard and ignore_expires are passed on to
        as_lwp_str().

        A file created by this call gets mode 0o600, since stored cookies
        can be credentials; the mode of an already existing file is left
        alone.

        """
        # Imported locally so that this method is self-contained.
        import os

        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        fd = os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Add the cookies found in the open Set-Cookie3 file f to the jar.

        f: file object positioned at the start of the file.
        filename: used only in error messages.
        ignore_discard: if false, cookies marked discard are not loaded.
        ignore_expires: if false, already expired cookies are not loaded.

        Raises LoadError if the first line does not match self.magic_re or
        if a line cannot be processed; OSError is passed through unchanged.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound before the loop so that the error message below can be built
        # even when the very first read fails.
        line = ""
        try:
            for line in iter(f.readline, ""):
                # Lines that are not Set-Cookie3 headers are ignored.
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    # The first pair is the cookie itself; the remaining
                    # pairs are its attributes.
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # A flag written without a value means "true".
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    # No usable expiry time: treat as a session cookie.
                    if expires is None:
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1959
1960
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    # Pattern the first line of a file must match to be accepted by load().
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    # Text written at the top of every file produced by save().
    header = """\
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file!  Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Add the cookies found in the open cookies.txt file f to the jar.

        f: file object positioned at the start of the file.
        filename: used only in error messages.
        ignore_discard: if false, cookies marked discard are not loaded.
        ignore_expires: if false, already expired cookies are not loaded.

        Each cookie line holds seven tab-separated fields: domain, domain
        flag, path, secure flag, expiry time, name and value.

        Raises LoadError if the first line does not match magic_re or if a
        line cannot be processed; OSError is passed through unchanged.

        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Bound before the loop so that the error message below can be built
        # even when the very first read fails.
        line = ""
        try:
            for line in iter(f.readline, ""):
                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"): line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                if (line.strip().startswith(("#", "$")) or
                    line.strip() == ""):
                    continue

                domain, domain_specified, path, secure, expires, name, value = \
                        line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                if domain_specified != initial_dot:
                    # An explicit check rather than an assert, so that it
                    # still runs under -O; the handler below reports it as a
                    # LoadError.
                    raise ValueError(
                        "domain flag does not agree with the leading dot "
                        "of the domain")

                discard = False
                if expires == "":
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename in Mozilla/Netscape cookies.txt format.

        filename defaults to self.filename; ValueError is raised if neither
        is available.

        ignore_discard: if false, cookies whose discard attribute is true are
          not written.
        ignore_expires: if false, cookies that have already expired are not
          written.

        A file created by this call gets mode 0o600, since stored cookies
        can be credentials; the mode of an already existing file is left
        alone.

        """
        # Imported locally so that this method is self-contained.
        import os

        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        fd = os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = "TRUE" if cookie.secure else "FALSE"
                if cookie.domain.startswith("."):
                    initial_dot = "TRUE"
                else:
                    initial_dot = "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")
2099