1"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
13
14                        CookieJar____
15                        /     \      \
16            FileCookieJar      \      \
17             /    |   \         \      \
18 MozillaCookieJar | LWPCookieJar \      \
19                  |               |      \
20                  |   ---MSIEBase |       \
21                  |  /      |     |        \
22                  | /   MSIEDBCookieJar BSDDBCookieJar
23                  |/
24               MSIECookieJar
25
26"""
27
28__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
29           'FileCookieJar', 'LWPCookieJar', 'lwp_cookie_str', 'LoadError',
30           'MozillaCookieJar']
31
32import re, urlparse, copy, time, urllib
33try:
34    import threading as _threading
35except ImportError:
36    import dummy_threading as _threading
37import httplib  # only for the default HTTP port
38from calendar import timegm
39
40debug = False   # set to True to enable debugging via the logging module
41logger = None
42
43def _debug(*args):
44    if not debug:
45        return
46    global logger
47    if not logger:
48        import logging
49        logger = logging.getLogger("cookielib")
50    return logger.debug(*args)
51
52
# The default HTTP port (httplib.HTTP_PORT), kept as a string.
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
# Message text for a missing filename.
# NOTE(review): presumably used in errors raised by the file-based cookie
# jars, which are not visible in this part of the file -- confirm.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
56
57def _warn_unhandled_exception():
58    # There are a few catch-all except: statements in this module, for
59    # catching input that's bad in unexpected ways.  Warn if any
60    # exceptions are caught there.
61    import warnings, traceback, StringIO
62    f = StringIO.StringIO()
63    traceback.print_exc(None, f)
64    msg = f.getvalue()
65    warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)
66
67
68# Date/time conversion
69# -----------------------------------------------------------------------------
70
71EPOCH_YEAR = 1970
72def _timegm(tt):
73    year, month, mday, hour, min, sec = tt[:6]
74    if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
75        (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
76        return timegm(tt)
77    else:
78        return None
79
# English abbreviations used when formatting and parsing dates.
# DAYS is indexed by the weekday number from time.gmtime() (Monday first).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased MONTHS, for case-insensitive month-name lookup.
MONTHS_LOWER = [month.lower() for month in MONTHS]
85
def time2isoz(t=None):
    """Format a time, given in seconds since the epoch, as an ISO-style string.

    t defaults to the current time when it is omitted (or None).

    The result has the form "YYYY-MM-DD hh:mm:ssZ" and is always expressed
    in Universal Time (UTC, aka GMT).  For example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    # year, month, day, hour, minute, second -- in UTC
    fields = tuple(time.gmtime(t)[:6])
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % fields
102
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None: t = time.time()
    # gmtime() yields UTC fields; wday counts from Monday == 0, matching DAYS
    year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
    # The weekday is followed by a comma, as documented above.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec)
118
119
# Zone names that mean "zero offset from UTC" (only the keys are used).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric zone: optional sign, one or two hour digits, optional ":" and
# two minute digits.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the UTC offset, in seconds, named by the timezone string tz.

    tz may be one of the names in UTC_ZONES (offset 0) or a numeric offset
    such as "+0100", "-08:00" or "5".  None is returned for anything else.
    The name lookup is case-sensitive.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if not match:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
136
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Turn already-split date fields into seconds since the epoch.

    day and yr are numeric strings; mon is an English month name (any
    case) or a month number; hr, min and sec may be None, meaning zero;
    tz is a timezone string or None (taken as UTC).

    Returns None when the month is not recognised, when _timegm() rejects
    the fields, or when the timezone string is not understood.
    """
    # Month: try the name first, then fall back to a number in 1..12.
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        try:
            month_number = int(mon)
        except ValueError:
            return None
        if not (1 <= month_number <= 12):
            return None
        mon = month_number

    # Missing clock fields count as zero; everything becomes an int.
    yr = int(yr)
    day = int(day)
    hr = int(0 if hr is None else hr)
    min = int(0 if min is None else min)
    sec = int(0 if sec is None else sec)

    if yr < 1000:
        # Short year: start in the current century, then move one century
        # up or down if that lands more than 50 years from today.
        this_year = time.localtime(time.time())[0]
        this_year_2digit = this_year % 100
        distance = this_year_2digit - yr
        yr = yr + this_year - this_year_2digit
        if distance > 50:
            yr = yr + 100
        elif distance < -50:
            yr = yr - 100

    # seconds since epoch, treating the fields as UTC (not yet adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))
    if t is None:
        return None

    # apply the timezone to get absolute time since epoch
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
189
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
# The optional timezone and parenthesised-zone parts each own their trailing
# whitespace.  Leaving several adjacent "\s*" around optional groups makes
# matching take quadratic time on long runs of whitespace in hostile input.
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    (?:
       ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+) # timezone
       \s*
    )?
    (?:
       \(\w+\)         # ASCII representation of timezone in parens.
       \s*
    )?$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # shaped like a month abbreviation ("Jxx") but not a month:
            # that is an unrecognized format, not an error
            return None
        # all fields are ints, so the result is an integer as documented
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
267
# The optional timezone owns its trailing whitespace.  Leaving "\s*" on both
# sides of an optional group makes matching take quadratic time on long runs
# of whitespace in hostile input.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   (?:
      ([-+]?\d\d?:?(:?\d\d)?
       |Z|z)             # timezone  (Z is "zero meridian", i.e. GMT)
      \s*
   )?$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    A fractional part on the seconds is accepted and discarded.

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()
    if sec is not None:
        # the pattern allows "29.5", but the fields are converted with
        # int() downstream: keep the whole seconds only
        sec = sec.split(".", 1)[0]

    return _str2time(day, mon, yr, hr, min, sec, tz)
312
313
314# Header parsing
315# -----------------------------------------------------------------------------
316
def unmatched(match):
    """Return the searched string with the matched span cut out.

    match is a regular-expression match object; the text before and after
    the whole match (group 0) is joined together and returned.
    """
    text = match.string
    begin, finish = match.span(0)
    return text[:begin] + text[finish:]
321
# A token: leading whitespace, then a run of anything except "=", ";", ","
# and whitespace.
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
# "=" followed by a double-quoted value that may contain backslash escapes.
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
# "=" followed by an unquoted value (possibly empty).
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
# A backslash escape inside a quoted value.
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    header_values is a sequence of header value strings (passing a single
    string is rejected by an assertion).  The parser understands ",", ";"
    and "=" as well as quoted values after "=".  Tokens separated only by
    whitespace are treated as if they were separated by ";".

    Several header values are handled as if they had been one value joined
    with commas.

    The accepted syntax is the following (BNF as from the HTTP/1.1
    specification, except that the requirement for tokens is relaxed):

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Every header becomes one list of (key, value) pairs; a bare token (one
    that is not part of a parameter) gets the value None.  Syntactically
    incorrect headers will not necessarily be parsed as you would want.

    Examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, basestring)
    parsed = []
    for header in header_values:
        remaining = header
        current = []
        while remaining:
            token = HEADER_TOKEN_RE.search(remaining)
            if token:
                remaining = unmatched(token)
                key = token.group(1)
                quoted = HEADER_QUOTED_VALUE_RE.search(remaining)
                if quoted:
                    # quoted value: drop the quotes, undo backslash escapes
                    remaining = unmatched(quoted)
                    val = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                else:
                    plain = HEADER_VALUE_RE.search(remaining)
                    if plain:
                        # unquoted value
                        remaining = unmatched(plain)
                        val = plain.group(1).rstrip()
                    else:
                        # a lone token, no value at all
                        val = None
                current.append((key, val))
                continue

            stripped = remaining.lstrip()
            if stripped.startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2:
                # the comma closes the current header
                remaining = stripped[1:]
                if current: parsed.append(current)
                current = []
                continue

            # neither a token nor a comma: skip junk
            cleaned, removed = re.subn("^[=\s;]*", "", remaining)
            assert removed > 0, (
                "split_header_words bug: '%s', '%s', %s" %
                (header, remaining, current))
            remaining = cleaned
        if current: parsed.append(current)
    return parsed
410
# Characters that need a backslash in front of them inside a quoted value.
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    lists is a list of lists of (key, value) pairs; the result is a single
    header value string.  Pairs within one inner list are joined with "; ",
    the inner lists with ", ".  A value of None yields the bare key, and a
    value is put in double quotes (with escaping) unless it consists only
    of word characters.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    joined = []
    for pairs in lists:
        words = []
        for key, value in pairs:
            if value is None:
                words.append(key)
                continue
            if not re.search(r"^\w+$", value):
                # escape " and \, then wrap in quotes
                value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            words.append("%s=%s" % (key, value))
        if words:
            joined.append("; ".join(words))
    return ", ".join(joined)
436
437def _strip_quotes(text):
438    if text.startswith('"'):
439        text = text[1:]
440    if text.endswith('"'):
441        text = text[:-1]
442    return text
443
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    ns_headers is a sequence of header value strings.  The result is a
    list with one list of (key, value) pairs per non-empty header.  The
    first pair is the cookie's own name and value and is left as found.
    In later pairs the names of known cookie-attributes are lowercased,
    "version" has its quotes stripped, and "expires" is converted to
    seconds since the epoch (None if the date is invalid).  An attribute
    given without "=" gets the value None.  A ("version", "0") pair is
    appended when the header had no version attribute.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            # strip both ends, so a value-less first parameter does not
            # keep leading whitespace in its name
            param = param.strip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    # (a bare "version" with no value has nothing to strip)
                    if v is not None:
                        v = _strip_quotes(v)
                    version_set = True
                if k == "expires":
                    # convert expires date to seconds since epoch
                    # (a bare "expires" with no value stays None)
                    if v is not None:
                        v = http2time(_strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
494
495
# Text ending in a dot followed by digits is taken to be an IPv4 address.
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name.

    Text that looks like an IPv4 address, the empty string, and text with a
    leading or trailing dot are all rejected.
    """
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    # the emptiness test comes first so the indexing below is safe
    if text == "" or text[0] == "." or text[-1] == ".":
        return False
    return True
511
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    # B must have the form .B'
    if not B.startswith("."):
        return False
    # A must have the form NB, i.e. B has to be a *suffix* of A.  Merely
    # containing B somewhere is not enough: "www.acme.com.evil.org" must
    # not match ".acme.com".  N is non-empty because A != B at this point.
    if not A.endswith(B):
        return False
    if not is_HDN(A):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
550
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  The only thing rejected is text that
    looks like an IPv4 address (see IPV4_RE).

    """
    return not IPV4_RE.search(text)
560
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses; the comparison is
    case-insensitive.  When B starts with a dot, A matches if it ends with
    B.  Otherwise -- and whenever either one looks like an IP address --
    only an exact match counts.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        # at least one is an IP address: only equal IP addresses match
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
580
# A trailing ":<digits>" port specification.
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the request's full URL; if the URL has no host
    part, the request's "Host" header is used instead (empty string when
    that is absent too).  Any port is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
597
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective
    name is the request-host with ".local" appended when the request-host
    contains no dot and does not look like an IPv4 address; otherwise the
    two are the same.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
608
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path is taken from the request's full URL and passed through
    escape_path(); the result always begins with "/".
    """
    split_url = urlparse.urlsplit(request.get_full_url())
    path = escape_path(split_url.path)
    if path.startswith("/"):
        return path
    # fix bad RFC 2396 absoluteURI
    return "/" + path
618
def request_port(request):
    """Return the port of the request's host, as a string.

    The port is whatever follows the first ":" in request.get_host().
    DEFAULT_HTTP_PORT is returned when the host contains no ":", and None
    (after a debug message) when the port is not numeric.
    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)  # validity check only; the string itself is returned
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
632
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: "%" followed by two hexadecimal digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE with upper-case hex."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
654
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        # not of the form A.B
        return h
    # A would be h[:first_dot]; B is everything after the first dot
    b = h[first_dot+1:]
    if is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
689
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
705
706
class Cookie:
    """HTTP Cookie.

    Instances represent both Netscape and RFC 2965 cookies.

    The class is deliberately very simple: it just holds attributes, and it
    is possible to construct Cookie instances that don't comply with the
    cookie standards.  CookieJar.make_cookies is the factory function for
    Cookie objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted
    from and returned to the server.

    Note that the port may be present in the headers, but unspecified
    ("Port" rather than "Port=80", for example); if this is the case, port
    is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the given cookie-attributes on the instance.

        version and expires are converted to int unless they are None; the
        domain is lowercased; rest (the non-standard cookie-attributes) is
        shallow-copied.  ValueError is raised if port is None while
        port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # private copy, so later changes to the caller's mapping don't leak in
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return whether the non-standard cookie-attribute name is present."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        if self.expires is None:
            return False
        return self.expires <= now

    def __str__(self):
        if self.port is None:
            port_part = ""
        else:
            port_part = ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attr_names = ("version", "name", "value",
                      "port", "port_specified",
                      "domain", "domain_specified", "domain_initial_dot",
                      "path", "path_specified",
                      "secure", "expires", "discard", "comment",
                      "comment_url",
                      )
        args = ["%s=%r" % (name, getattr(self, name)) for name in attr_names]
        args.append("rest=%r" % (self._rest,))
        args.append("rfc2109=%r" % (self.rfc2109,))
        return "Cookie(%s)" % ", ".join(args)
803
804
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    A policy may also modify cookies, though this is probably a bad idea.

    This base class leaves set_ok and return_ok unimplemented.  The
    subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Must be provided by subclasses; this version always raises
        NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Must be provided by subclasses; this version always raises
        NotImplementedError.

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
836
837
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    set_ok() decides whether a cookie received from a server may be stored;
    return_ok(), together with the cheaper domain_return_ok() and
    path_return_ok() pre-filters, decides whether a stored cookie may be
    sent back with a request.  Each decision is split into per-aspect
    helper methods (set_ok_<aspect> / return_ok_<aspect>) which are looked
    up by name, so subclasses can override them individually.

    return_ok_expires() reads self._now, which this class does not set
    itself; the caller must assign it before asking whether a cookie may be
    returned.

    """

    # Bit flags for the strict_ns_domain constructor argument:
    #  DomainStrictNoDots: when setting a Netscape cookie, the part of the
    #   request host in front of the cookie domain must not contain a dot.
    #  DomainStrictNonDomain: a Netscape cookie that did not specify a
    #   domain is only returned to a host that string-compares equal to it.
    #  DomainRFC2965Match: when setting a Netscape cookie, also require the
    #   RFC 2965 domain-match.
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only.

        blocked_domains: sequence of domains that may neither set nor
         receive cookies (None means no domain is blocked).
        allowed_domains: None, or a sequence of the only domains that may
         set and receive cookies.
        netscape / rfc2965: switch handling of version 0 cookies and of
         cookies with version > 0 on or off.
        rfc2109_as_netscape, hide_cookie2: only stored here; they are read
         by the code that uses the policy.
        strict_domain: refuse to set cookies for domains that look like a
         country-code second level domain (such as .co.uk).
        strict_rfc2965_unverifiable / strict_ns_unverifiable: refuse
         third-party cookies of the respective kind during unverifiable
         transactions.
        strict_ns_domain: combination of the Domain* flags of this class.
        strict_ns_set_initial_dollar: refuse Netscape cookies whose name
         starts with "$".
        strict_ns_set_path: require the path of a Netscape cookie to be a
         prefix of the request path when the cookie is set.

        """
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        # Both lists are kept as tuples so that later changes to the
        # caller's sequence do not affect the policy.
        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return true if domain matches an entry of the block-list."""
        for blocked_domain in self._blocked_domains:
            if user_domain_match(domain, blocked_domain):
                return True
        return False

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains
    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return true if an allow-list is set and domain is not on it."""
        if self._allowed_domains is None:
            return False
        for allowed_domain in self._allowed_domains:
            if user_domain_match(domain, allowed_domain):
                return False
        return True

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # The individual checks are looked up by name so that a subclass
        # can replace any one of them; the first failure rejects the cookie.
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Reject cookies without a version or of a switched-off protocol."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug("   Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Reject third-party cookies in unverifiable transactions.

        Only applies when the matching strict_*_unverifiable setting is on.

        """
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug("   third-party RFC 2965 cookie during "
                             "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug("   third-party Netscape cookie during "
                             "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Reject "$"-prefixed Netscape cookie names in strict mode."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug("   illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Reject an explicit path that is not a prefix of the request path.

        Always enforced for cookies with version > 0; for Netscape cookies
        only when strict_ns_set_path is set.

        """
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not req_path.startswith(cookie.path)):
                _debug("   path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Return false if the cookie's domain is unacceptable.

        The user block-list and allow-list are always consulted.  The
        remaining checks only apply to cookies that specified a domain.

        """
        if self.is_blocked(cookie.domain):
            _debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug("   country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug("   non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug("   effective request-host %s (even with added "
                           "initial dot) does not end end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug("   effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                # What is left of the request host once the cookie domain
                # has been cut off its end must be a single label.
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug("   host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """Return false unless the request port is in an explicit port list.

        A cookie without an explicit port list is always accepted.  A
        non-numeric entry met before the request port is found rejects the
        cookie.

        """
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug("   bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                _debug("   request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Return false if the cookie's protocol is switched off."""
        if cookie.version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Withhold third-party cookies in unverifiable transactions.

        Only applies when the matching strict_*_unverifiable setting is on.

        """
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug("   third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug("   third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Return false for a secure cookie on a request that is not https."""
        if cookie.secure and request.get_type() != "https":
            _debug("   secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Return false if the cookie has expired as of self._now."""
        if cookie.is_expired(self._now):
            _debug("   cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Return false if the cookie has ports and none is the request's."""
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug("   request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Return false if the request host does not match the cookie domain."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # A domain stored without a leading dot must be compared on a label
        # boundary, otherwise a cookie for "example.com" would also be
        # returned to "fooexample.com".
        if domain and not domain.startswith("."):
            dotted_domain = "."+domain
        else:
            dotted_domain = domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug("   cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug("   effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotted_domain):
            _debug("   request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Return false if no cookie of this domain may go with the request.

        Cheap pre-filter: the (dotted) request host or effective request
        host must end with the (dotted) domain, and the domain must pass the
        user block-list and allow-list.

        """
        # Liberal check of domain.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        # Compare on a label boundary: without the leading dot, the domain
        # "example.com" would be accepted for the host "fooexample.com".
        if domain and not domain.startswith("."):
            dotted_domain = "."+domain
        else:
            dotted_domain = domain
        if not (req_host.endswith(dotted_domain) or
                erhn.endswith(dotted_domain)):
            #_debug("   request domain %s does not match cookie domain %s",
            #       req_host, domain)
            return False

        if self.is_blocked(domain):
            _debug("   domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug("   domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return false if cookies stored under path must not be returned.

        The request path matches when it equals the cookie path, or when the
        cookie path is a prefix of it that ends on a "/" boundary.  A plain
        string prefix is not enough: "/foo" must not match "/foobar".

        """
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        pathlen = len(path)
        if req_path == path:
            return True
        elif (req_path.startswith(path) and
              (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
            return True

        _debug("  %s does not path-match %s", req_path, path)
        return False
1173
1174
def vals_sorted_by_key(adict):
    """Return the values of adict as a list, ordered by ascending key.

    Works for any mapping that provides keys() and get(); keys() may return
    a list, a view or any other iterable.

    """
    lookup = adict.get
    return [lookup(key) for key in sorted(adict.keys())]
1179
def deepvalues(mapping):
    """Yield the leaf values of a nested mapping, depth-first, by sorted key.

    A value counts as a nested mapping when it has an "items" attribute; it
    is then descended into instead of being yielded itself.

    """
    for item in vals_sorted_by_key(mapping):
        try:
            item.items
        except AttributeError:
            is_leaf = True
        else:
            is_leaf = False

        if is_leaf:
            yield item
        else:
            for leaf in deepvalues(item):
                yield leaf
1195
1196
# Sentinel handed to dict.get() as the default, so that a missing key can be
# told apart from a key that is present with the value None.
class Absent:
    """Marker for "key not present"; compare against the class with "is"."""
1200
1201class CookieJar:
1202    """Collection of HTTP cookies.
1203
1204    You may not need to know about this class: try
1205    urllib2.build_opener(HTTPCookieProcessor).open(url).
1206
1207    """
1208
    # Matches any single character that is not a word character; used by
    # _cookie_attrs to decide whether a cookie value needs escaping.
    non_word_re = re.compile(r"\W")
    # Captures a double quote or a backslash, the two characters that
    # _cookie_attrs prefixes with a backslash.
    quote_re = re.compile(r"([\"\\])")
    # An optional leading dot followed by a run of non-dot characters.
    strict_domain_re = re.compile(r"\.?[^.]*")
    # A run of non-dot characters.
    domain_re = re.compile(r"[^.]*")
    # One or more dots at the start of the string.
    dots_re = re.compile(r"^\.+")

    # Pattern source (not compiled here) for a line starting with
    # "#LWP-Cookies-" followed by a dotted number, which is captured.
    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
1216
1217    def __init__(self, policy=None):
1218        if policy is None:
1219            policy = DefaultCookiePolicy()
1220        self._policy = policy
1221
1222        self._cookies_lock = _threading.RLock()
1223        self._cookies = {}
1224
    def set_policy(self, policy):
        """Replace the policy object this jar consults."""
        self._policy = policy
1227
1228    def _cookies_for_domain(self, domain, request):
1229        cookies = []
1230        if not self._policy.domain_return_ok(domain, request):
1231            return []
1232        _debug("Checking %s for cookies to return", domain)
1233        cookies_by_path = self._cookies[domain]
1234        for path in cookies_by_path.keys():
1235            if not self._policy.path_return_ok(path, request):
1236                continue
1237            cookies_by_name = cookies_by_path[path]
1238            for cookie in cookies_by_name.values():
1239                if not self._policy.return_ok(cookie, request):
1240                    _debug("   not returning cookie")
1241                    continue
1242                _debug("   it's a match")
1243                cookies.append(cookie)
1244        return cookies
1245
1246    def _cookies_for_request(self, request):
1247        """Return a list of cookies to be returned to server."""
1248        cookies = []
1249        for domain in self._cookies.keys():
1250            cookies.extend(self._cookies_for_domain(domain, request))
1251        return cookies
1252
1253    def _cookie_attrs(self, cookies):
1254        """Return a list of cookie-attributes to be returned to server.
1255
1256        like ['foo="bar"; $Path="/"', ...]
1257
1258        The $Version attribute is also added when appropriate (currently only
1259        once per request).
1260
1261        """
1262        # add cookies in order of most specific (ie. longest) path first
1263        cookies.sort(key=lambda arg: len(arg.path), reverse=True)
1264
1265        version_set = False
1266
1267        attrs = []
1268        for cookie in cookies:
1269            # set version of Cookie header
1270            # XXX
1271            # What should it be if multiple matching Set-Cookie headers have
1272            #  different versions themselves?
1273            # Answer: there is no answer; was supposed to be settled by
1274            #  RFC 2965 errata, but that may never appear...
1275            version = cookie.version
1276            if not version_set:
1277                version_set = True
1278                if version > 0:
1279                    attrs.append("$Version=%s" % version)
1280
1281            # quote cookie value if necessary
1282            # (not for Netscape protocol, which already has any quotes
1283            #  intact, due to the poorly-specified Netscape Cookie: syntax)
1284            if ((cookie.value is not None) and
1285                self.non_word_re.search(cookie.value) and version > 0):
1286                value = self.quote_re.sub(r"\\\1", cookie.value)
1287            else:
1288                value = cookie.value
1289
1290            # add cookie-attributes to be returned in Cookie header
1291            if cookie.value is None:
1292                attrs.append(cookie.name)
1293            else:
1294                attrs.append("%s=%s" % (cookie.name, value))
1295            if version > 0:
1296                if cookie.path_specified:
1297                    attrs.append('$Path="%s"' % cookie.path)
1298                if cookie.domain.startswith("."):
1299                    domain = cookie.domain
1300                    if (not cookie.domain_initial_dot and
1301                        domain.startswith(".")):
1302                        domain = domain[1:]
1303                    attrs.append('$Domain="%s"' % domain)
1304                if cookie.port is not None:
1305                    p = "$Port"
1306                    if cookie.port_specified:
1307                        p = p + ('="%s"' % cookie.port)
1308                    attrs.append(p)
1309
1310        return attrs
1311
1312    def add_cookie_header(self, request):
1313        """Add correct Cookie: header to request (urllib2.Request object).
1314
1315        The Cookie2 header is also added unless policy.hide_cookie2 is true.
1316
1317        """
1318        _debug("add_cookie_header")
1319        self._cookies_lock.acquire()
1320        try:
1321
1322            self._policy._now = self._now = int(time.time())
1323
1324            cookies = self._cookies_for_request(request)
1325
1326            attrs = self._cookie_attrs(cookies)
1327            if attrs:
1328                if not request.has_header("Cookie"):
1329                    request.add_unredirected_header(
1330                        "Cookie", "; ".join(attrs))
1331
1332            # if necessary, advertise that we know RFC 2965
1333            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1334                not request.has_header("Cookie2")):
1335                for cookie in cookies:
1336                    if cookie.version != 1:
1337                        request.add_unredirected_header("Cookie2", '$Version="1"')
1338                        break
1339
1340        finally:
1341            self._cookies_lock.release()
1342
1343        self.clear_expired_cookies()
1344
1345    def _normalized_cookie_tuples(self, attrs_set):
1346        """Return list of tuples containing normalised cookie information.
1347
1348        attrs_set is the list of lists of key,value pairs extracted from
1349        the Set-Cookie or Set-Cookie2 headers.
1350
1351        Tuples are name, value, standard, rest, where name and value are the
1352        cookie name and value, standard is a dictionary containing the standard
1353        cookie-attributes (discard, secure, version, expires or max-age,
1354        domain, path and port) and rest is a dictionary containing the rest of
1355        the cookie-attributes.
1356
1357        """
1358        cookie_tuples = []
1359
1360        boolean_attrs = "discard", "secure"
1361        value_attrs = ("version",
1362                       "expires", "max-age",
1363                       "domain", "path", "port",
1364                       "comment", "commenturl")
1365
1366        for cookie_attrs in attrs_set:
1367            name, value = cookie_attrs[0]
1368
1369            # Build dictionary of standard cookie-attributes (standard) and
1370            # dictionary of other cookie-attributes (rest).
1371
1372            # Note: expiry time is normalised to seconds since epoch.  V0
1373            # cookies should have the Expires cookie-attribute, and V1 cookies
1374            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1375            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1376            # accept either (but prefer Max-Age).
1377            max_age_set = False
1378
1379            bad_cookie = False
1380
1381            standard = {}
1382            rest = {}
1383            for k, v in cookie_attrs[1:]:
1384                lc = k.lower()
1385                # don't lose case distinction for unknown fields
1386                if lc in value_attrs or lc in boolean_attrs:
1387                    k = lc
1388                if k in boolean_attrs and v is None:
1389                    # boolean cookie-attribute is present, but has no value
1390                    # (like "discard", rather than "port=80")
1391                    v = True
1392                if k in standard:
1393                    # only first value is significant
1394                    continue
1395                if k == "domain":
1396                    if v is None:
1397                        _debug("   missing value for domain attribute")
1398                        bad_cookie = True
1399                        break
1400                    # RFC 2965 section 3.3.3
1401                    v = v.lower()
1402                if k == "expires":
1403                    if max_age_set:
1404                        # Prefer max-age to expires (like Mozilla)
1405                        continue
1406                    if v is None:
1407                        _debug("   missing or invalid value for expires "
1408                              "attribute: treating as session cookie")
1409                        continue
1410                if k == "max-age":
1411                    max_age_set = True
1412                    try:
1413                        v = int(v)
1414                    except ValueError:
1415                        _debug("   missing or invalid (non-numeric) value for "
1416                              "max-age attribute")
1417                        bad_cookie = True
1418                        break
1419                    # convert RFC 2965 Max-Age to seconds since epoch
1420                    # XXX Strictly you're supposed to follow RFC 2616
1421                    #   age-calculation rules.  Remember that zero Max-Age is a
1422                    #   is a request to discard (old and new) cookie, though.
1423                    k = "expires"
1424                    v = self._now + v
1425                if (k in value_attrs) or (k in boolean_attrs):
1426                    if (v is None and
1427                        k not in ("port", "comment", "commenturl")):
1428                        _debug("   missing value for %s attribute" % k)
1429                        bad_cookie = True
1430                        break
1431                    standard[k] = v
1432                else:
1433                    rest[k] = v
1434
1435            if bad_cookie:
1436                continue
1437
1438            cookie_tuples.append((name, value, standard, rest))
1439
1440        return cookie_tuples
1441
1442    def _cookie_from_cookie_tuple(self, tup, request):
1443        # standard is dict of standard cookie-attributes, rest is dict of the
1444        # rest of them
1445        name, value, standard, rest = tup
1446
1447        domain = standard.get("domain", Absent)
1448        path = standard.get("path", Absent)
1449        port = standard.get("port", Absent)
1450        expires = standard.get("expires", Absent)
1451
1452        # set the easy defaults
1453        version = standard.get("version", None)
1454        if version is not None:
1455            try:
1456                version = int(version)
1457            except ValueError:
1458                return None  # invalid version, ignore cookie
1459        secure = standard.get("secure", False)
1460        # (discard is also set if expires is Absent)
1461        discard = standard.get("discard", False)
1462        comment = standard.get("comment", None)
1463        comment_url = standard.get("commenturl", None)
1464
1465        # set default path
1466        if path is not Absent and path != "":
1467            path_specified = True
1468            path = escape_path(path)
1469        else:
1470            path_specified = False
1471            path = request_path(request)
1472            i = path.rfind("/")
1473            if i != -1:
1474                if version == 0:
1475                    # Netscape spec parts company from reality here
1476                    path = path[:i]
1477                else:
1478                    path = path[:i+1]
1479            if len(path) == 0: path = "/"
1480
1481        # set default domain
1482        domain_specified = domain is not Absent
1483        # but first we have to remember whether it starts with a dot
1484        domain_initial_dot = False
1485        if domain_specified:
1486            domain_initial_dot = bool(domain.startswith("."))
1487        if domain is Absent:
1488            req_host, erhn = eff_request_host(request)
1489            domain = erhn
1490        elif not domain.startswith("."):
1491            domain = "."+domain
1492
1493        # set default port
1494        port_specified = False
1495        if port is not Absent:
1496            if port is None:
1497                # Port attr present, but has no value: default to request port.
1498                # Cookie should then only be sent back on that port.
1499                port = request_port(request)
1500            else:
1501                port_specified = True
1502                port = re.sub(r"\s+", "", port)
1503        else:
1504            # No port attr present.  Cookie can be sent back on any port.
1505            port = None
1506
1507        # set default expires and discard
1508        if expires is Absent:
1509            expires = None
1510            discard = True
1511        elif expires <= self._now:
1512            # Expiry date in past is request to delete cookie.  This can't be
1513            # in DefaultCookiePolicy, because can't delete cookies there.
1514            try:
1515                self.clear(domain, path, name)
1516            except KeyError:
1517                pass
1518            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1519                   domain, path, name)
1520            return None
1521
1522        return Cookie(version,
1523                      name, value,
1524                      port, port_specified,
1525                      domain, domain_specified, domain_initial_dot,
1526                      path, path_specified,
1527                      secure,
1528                      expires,
1529                      discard,
1530                      comment,
1531                      comment_url,
1532                      rest)
1533
1534    def _cookies_from_attrs_set(self, attrs_set, request):
1535        cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1536
1537        cookies = []
1538        for tup in cookie_tuples:
1539            cookie = self._cookie_from_cookie_tuple(tup, request)
1540            if cookie: cookies.append(cookie)
1541        return cookies
1542
1543    def _process_rfc2109_cookies(self, cookies):
1544        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1545        if rfc2109_as_ns is None:
1546            rfc2109_as_ns = not self._policy.rfc2965
1547        for cookie in cookies:
1548            if cookie.version == 1:
1549                cookie.rfc2109 = True
1550                if rfc2109_as_ns:
1551                    # treat 2109 cookies as Netscape cookies rather than
1552                    # as RFC2965 cookies
1553                    cookie.version = 0
1554
1555    def make_cookies(self, response, request):
1556        """Return sequence of Cookie objects extracted from response object."""
1557        # get cookie-attributes for RFC 2965 and Netscape protocols
1558        headers = response.info()
1559        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
1560        ns_hdrs = headers.getheaders("Set-Cookie")
1561
1562        rfc2965 = self._policy.rfc2965
1563        netscape = self._policy.netscape
1564
1565        if ((not rfc2965_hdrs and not ns_hdrs) or
1566            (not ns_hdrs and not rfc2965) or
1567            (not rfc2965_hdrs and not netscape) or
1568            (not netscape and not rfc2965)):
1569            return []  # no relevant cookie headers: quick exit
1570
1571        try:
1572            cookies = self._cookies_from_attrs_set(
1573                split_header_words(rfc2965_hdrs), request)
1574        except Exception:
1575            _warn_unhandled_exception()
1576            cookies = []
1577
1578        if ns_hdrs and netscape:
1579            try:
1580                # RFC 2109 and Netscape cookies
1581                ns_cookies = self._cookies_from_attrs_set(
1582                    parse_ns_headers(ns_hdrs), request)
1583            except Exception:
1584                _warn_unhandled_exception()
1585                ns_cookies = []
1586            self._process_rfc2109_cookies(ns_cookies)
1587
1588            # Look for Netscape cookies (from Set-Cookie headers) that match
1589            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1590            # For each match, keep the RFC 2965 cookie and ignore the Netscape
1591            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
1592            # bundled in with the Netscape cookies for this purpose, which is
1593            # reasonable behaviour.
1594            if rfc2965:
1595                lookup = {}
1596                for cookie in cookies:
1597                    lookup[(cookie.domain, cookie.path, cookie.name)] = None
1598
1599                def no_matching_rfc2965(ns_cookie, lookup=lookup):
1600                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1601                    return key not in lookup
1602                ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1603
1604            if ns_cookies:
1605                cookies.extend(ns_cookies)
1606
1607        return cookies
1608
1609    def set_cookie_if_ok(self, cookie, request):
1610        """Set a cookie if policy says it's OK to do so."""
1611        self._cookies_lock.acquire()
1612        try:
1613            self._policy._now = self._now = int(time.time())
1614
1615            if self._policy.set_ok(cookie, request):
1616                self.set_cookie(cookie)
1617
1618
1619        finally:
1620            self._cookies_lock.release()
1621
1622    def set_cookie(self, cookie):
1623        """Set a cookie, without checking whether or not it should be set."""
1624        c = self._cookies
1625        self._cookies_lock.acquire()
1626        try:
1627            if cookie.domain not in c: c[cookie.domain] = {}
1628            c2 = c[cookie.domain]
1629            if cookie.path not in c2: c2[cookie.path] = {}
1630            c3 = c2[cookie.path]
1631            c3[cookie.name] = cookie
1632        finally:
1633            self._cookies_lock.release()
1634
1635    def extract_cookies(self, response, request):
1636        """Extract cookies from response, where allowable given the request."""
1637        _debug("extract_cookies: %s", response.info())
1638        self._cookies_lock.acquire()
1639        try:
1640            self._policy._now = self._now = int(time.time())
1641
1642            for cookie in self.make_cookies(response, request):
1643                if self._policy.set_ok(cookie, request):
1644                    _debug(" setting cookie: %s", cookie)
1645                    self.set_cookie(cookie)
1646        finally:
1647            self._cookies_lock.release()
1648
1649    def clear(self, domain=None, path=None, name=None):
1650        """Clear some cookies.
1651
1652        Invoking this method without arguments will clear all cookies.  If
1653        given a single argument, only cookies belonging to that domain will be
1654        removed.  If given two arguments, cookies belonging to the specified
1655        path within that domain are removed.  If given three arguments, then
1656        the cookie with the specified name, path and domain is removed.
1657
1658        Raises KeyError if no matching cookie exists.
1659
1660        """
1661        if name is not None:
1662            if (domain is None) or (path is None):
1663                raise ValueError(
1664                    "domain and path must be given to remove a cookie by name")
1665            del self._cookies[domain][path][name]
1666        elif path is not None:
1667            if domain is None:
1668                raise ValueError(
1669                    "domain must be given to remove cookies by path")
1670            del self._cookies[domain][path]
1671        elif domain is not None:
1672            del self._cookies[domain]
1673        else:
1674            self._cookies = {}
1675
1676    def clear_session_cookies(self):
1677        """Discard all session cookies.
1678
1679        Note that the .save() method won't save session cookies anyway, unless
1680        you ask otherwise by passing a true ignore_discard argument.
1681
1682        """
1683        self._cookies_lock.acquire()
1684        try:
1685            for cookie in self:
1686                if cookie.discard:
1687                    self.clear(cookie.domain, cookie.path, cookie.name)
1688        finally:
1689            self._cookies_lock.release()
1690
1691    def clear_expired_cookies(self):
1692        """Discard all expired cookies.
1693
1694        You probably don't need to call this method: expired cookies are never
1695        sent back to the server (provided you're using DefaultCookiePolicy),
1696        this method is called by CookieJar itself every so often, and the
1697        .save() method won't save expired cookies anyway (unless you ask
1698        otherwise by passing a true ignore_expires argument).
1699
1700        """
1701        self._cookies_lock.acquire()
1702        try:
1703            now = time.time()
1704            for cookie in self:
1705                if cookie.is_expired(now):
1706                    self.clear(cookie.domain, cookie.path, cookie.name)
1707        finally:
1708            self._cookies_lock.release()
1709
1710    def __iter__(self):
1711        return deepvalues(self._cookies)
1712
1713    def __len__(self):
1714        """Return number of contained cookies."""
1715        i = 0
1716        for cookie in self: i = i + 1
1717        return i
1718
1719    def __repr__(self):
1720        r = []
1721        for cookie in self: r.append(repr(cookie))
1722        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1723
1724    def __str__(self):
1725        r = []
1726        for cookie in self: r.append(str(cookie))
1727        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1728
1729
# Subclasses IOError for backwards-compatibility with Python 2.4.0.
class LoadError(IOError):
    """Error signalling that loading cookies from a file failed."""
1732
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class handles the default filename, load() and revert().  save()
    is not implemented here, and load() delegates the actual parsing to a
    _really_load() method that this class does not define.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename -- default file used by load() and revert() when they are
            called without one; must be string-like if given
        delayload -- stored, converted to bool, as self.delayload
        policy -- passed on to CookieJar.__init__

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                # Duck-typed check: anything that can be concatenated with a
                # string counts as string-like.
                filename + ""
            except Exception:
                # Not a bare except, so KeyboardInterrupt and SystemExit are
                # not converted into a misleading ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented in this class; always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename.  Raises ValueError if neither is
        available.  The file is opened, handed to self._really_load() along
        with the remaining arguments, and always closed afterwards.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        filename defaults to self.filename.  Raises ValueError if neither is
        available.

        Raises LoadError (or IOError) if reversion is not successful.  If
        load() raises these or any other Exception, the cookies held before
        the call are put back and the same exception is re-raised.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:

            # Snapshot the current cookies so they can be put back if the
            # reload fails part-way through.
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except Exception:
                # Restore on any failure, not only LoadError/IOError, so the
                # jar is never left emptied or half-loaded.
                self._cookies = old_state
                raise

        finally:
            self._cookies_lock.release()
1792
1793from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1794from _MozillaCookieJar import MozillaCookieJar
1795