cookielib.py revision e58334ae9e4a635794ff0605f125eec459b9b98f
1"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
10Class diagram (note that the classes which do not derive from
11FileCookieJar are not distributed with the Python standard library, but
12are available from http://wwwsearch.sf.net/):
13
14                        CookieJar____
15                        /     \      \
16            FileCookieJar      \      \
17             /    |   \         \      \
18 MozillaCookieJar | LWPCookieJar \      \
19                  |               |      \
20                  |   ---MSIEBase |       \
21                  |  /      |     |        \
22                  | /   MSIEDBCookieJar BSDDBCookieJar
23                  |/
24               MSIECookieJar
25
26"""
27
28import sys, re, urlparse, copy, time, urllib, logging
29try:
30    import threading as _threading
31except ImportError:
32    import dummy_threading as _threading
33import httplib  # only for the default HTTP port
34from calendar import timegm
35
36debug = logging.getLogger("cookielib").debug
37
38DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
39MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
40                         "instance initialised with one)")
41
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled unless it may be swallowed.

    Intended to be called from inside the catch-all ``except:`` clauses in
    this module, which trap input that is bad in unexpected ways.

    unmasked: tuple of extra exception classes that must propagate, in
    addition to KeyboardInterrupt, SystemExit and MemoryError.

    If the current exception is not one of those, it is swallowed and a
    "cookielib bug!" warning carrying the formatted traceback is issued.

    """
    never_masked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    current_type = sys.exc_info()[0]
    if issubclass(current_type, never_masked):
        raise
    # The exception is being swallowed: report it as a warning instead.
    import warnings, traceback
    warnings.warn("cookielib bug!\n%s" % traceback.format_exc(), stacklevel=2)
56
57
58# Date/time conversion
59# -----------------------------------------------------------------------------
60
EPOCH_YEAR = 1970
def _timegm(tt):
    """Return seconds since the epoch for UTC time tuple tt, or None.

    Only the first six items of tt (year, month, day, hour, minute, second)
    are range-checked; None is returned when any of them is out of range or
    the year precedes EPOCH_YEAR.

    """
    year, month, mday, hour, minute, second = tt[:6]
    if year < EPOCH_YEAR:
        return None
    if not (1 <= month <= 12 and 1 <= mday <= 31):
        return None
    if not (0 <= hour <= 24 and 0 <= minute <= 59 and 0 <= second <= 61):
        return None
    return timegm(tt)
69
# Abbreviated English day and month names used when formatting and parsing
# dates.  DAYS is indexed by the weekday field of time.gmtime(); MONTHS is
# indexed by (month number - 1).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS, for case-insensitive month-name lookups.
MONTHS_LOWER = []
for month in MONTHS: MONTHS_LOWER.append(month.lower())
75
def time2isoz(t=None):
    """Format t (seconds since the epoch) as an ISO-style UTC string.

    When t is omitted or None, the current time is used.

    The result looks like "YYYY-MM-DD hh:mm:ssZ" and is expressed in
    Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    # The first six fields are year, month, day, hour, minute, second.
    fields = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % fields
92
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None: t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The weekday is followed by a comma, as documented above; the comma
    # was previously missing from the format string.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
108
109
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the offset from UTC, in seconds, described by string tz.

    tz is either one of the names in UTC_ZONES (offset 0) or a numeric zone
    such as "-0800", "+01:30" or "5".  None is returned for anything else.

    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if match is None:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        return -seconds
    return seconds
126
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time fields to seconds since the epoch.

    All arguments are strings (as captured by the date regexps) or None.
    mon may be an English month abbreviation or a month number.  hr, min
    and sec default to 0 when None; tz defaults to UTC when None.

    Returns None if the month is unrecognised, the date is out of range, or
    the timezone string is not understood.  Fractional seconds (which the
    ISO regexp accepts) are truncated.

    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # sec may carry a fractional part (e.g. "29.5" from an ISO 8601 string);
    # int() alone would raise ValueError on that, so go through float().
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year: pick the century that puts the year closest
        # to the current one
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
179
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The strict regexp only checks the *shape* of the month name
            # (e.g. it accepts "Jab"); an unknown name is a bad date, not
            # an error.
            return None
        # Seconds are converted with int() so the result is an integer, as
        # documented, on this path as well as on the loose-parsing path.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
257
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Returns None if text does not match ISO_DATE_RE.

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()

    return _str2time(day, mon, yr, hr, min, sec, tz)
302
303
304# Header parsing
305# -----------------------------------------------------------------------------
306
def unmatched(match):
    """Return the searched string with the matched span cut out."""
    begin, stop = match.span(0)
    subject = match.string
    return subject[:begin] + subject[stop:]
311
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A bare string would be iterated character by character; callers must
    # pass a sequence of header strings.
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        # Repeatedly strip a leading token (and its value, if any) off the
        # front of text until nothing is left.
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # undo backslash escapes inside the quoted string
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                # Guards against an infinite loop: this branch must always
                # consume at least one character.
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
400
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Build a single header value from lists of (key, value) pairs.

    This is (almost) the inverse of split_header_words.  Pairs within one
    inner list are joined with "; ", and the inner lists are joined with
    ", ".  A value of None yields the bare key; any value that is not made
    up solely of word characters is quoted, with '"' and '\\' escaped.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    header_strings = []
    for pairs in lists:
        words = []
        for key, value in pairs:
            if value is None:
                words.append(key)
                continue
            if re.search(r"^\w+$", value) is None:
                # needs quoting: escape " and \ first
                escaped = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
                value = '"%s"' % escaped
            words.append("%s=%s" % (key, value))
        if words:
            header_strings.append("; ".join(words))
    return ", ".join(header_strings)
426
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    ns_headers is a sequence of header value strings.  The result is a list
    with one list of (key, value) pairs per non-empty header.  The first
    pair is the cookie name/value; for the others, known attribute names
    are lower-cased, a value-less attribute has value None, and "expires"
    is converted to seconds since the epoch (None if invalid or missing).
    A ("version", "0") pair is appended when the header has no version
    attribute.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            # The first parameter is the cookie's own name=value pair and is
            # never treated as a cookie-attribute.
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.  ("version" is in
                    # known_attrs so that e.g. "Version=1" is recognised
                    # too, rather than gaining a conflicting version=0.)
                    version_set = True
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch; a bare
                    # "expires" with no value is left as None
                    if v.startswith('"'): v = v[1:]
                    if v.endswith('"'): v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
478
479
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text looks like a host domain name.

    A string is rejected when it ends in a dot followed by digits (taken to
    be an IP address), is empty, or starts or ends with a dot.

    """
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    looks_like_ip = IPV4_RE.search(text) is not None
    if looks_like_ip or text == "":
        return False
    return not (text.startswith(".") or text.endswith("."))
495
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not B.startswith("."):
        return False
    # A must have the form NB, i.e. B must be a *suffix* of A.  (Merely
    # containing B somewhere is not enough: "www.acme.com.attacker.net"
    # must not match ".acme.com".)  Since A != B here, a suffix match also
    # guarantees that N is non-empty.
    if not A.endswith(B):
        return False
    if not is_HDN(A):
        return False
    return is_HDN(B[1:])
534
def liberal_is_HDN(text):
    """Return True if text is sort-of-like a host domain name.

    Used when accepting/blocking domains: everything is accepted except
    strings that IPV4_RE takes to be an IP address.

    """
    return IPV4_RE.search(text) is None
544
def user_domain_match(A, B):
    """Return True if host A matches user-supplied domain B.

    Used for blocking/accepting domains.  A and B may be host domain names
    or IP addresses; the comparison is case-insensitive.  If B starts with
    a dot, any A ending in B matches; otherwise A and B must be equal.  IP
    addresses only ever match by equality.

    """
    host = A.lower()
    pattern = B.lower()
    if not liberal_is_HDN(host) or not liberal_is_HDN(pattern):
        # at least one IP address: only exact equality counts
        return host == pattern
    if pattern.startswith("."):
        return host.endswith(pattern)
    return host == pattern
564
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return the request-host of request, as defined by RFC 2965.

    The host is taken from the request's full URL, falling back to the
    "Host" header when the URL has no network location.  Any trailing
    ":port" is removed.

    Variation from RFC: the result is lowercased, for convenient
    comparison.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # strip the port, if there is one, then normalise case
    return cut_port_re.sub("", netloc, 1).lower()
581
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    Both are as defined by RFC 2965, except that they are lowercased.  The
    effective name is the request-host with ".local" appended when the
    request-host contains no dot and is not matched by IPV4_RE.

    """
    req_host = request_host(request)
    is_bare_name = req_host.find(".") == -1 and not IPV4_RE.search(req_host)
    if is_bare_name:
        return req_host, req_host + ".local"
    return req_host, req_host
592
def request_path(request):
    """Return the request-URI of request, as defined by RFC 2965.

    The path (with any parameters re-attached after ";") is escaped with
    escape_path and recombined with the query and fragment.  A leading "/"
    is added if the result lacks one.

    """
    parts = urlparse.urlparse(request.get_full_url())
    path, parameters, query, frag = parts[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    escaped = escape_path(path)
    req_path = urlparse.urlunparse(("", "", escaped, "", query, frag))
    if req_path.startswith("/"):
        return req_path
    # fix bad RFC 2396 absoluteURI
    return "/" + req_path
607
def request_port(request):
    """Return the port of request as a string, or None if it is non-numeric.

    The port is whatever follows the first ":" in request.get_host(); when
    there is no ":", DEFAULT_HTTP_PORT is returned.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
621
622# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
623# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE with upper-case hex."""
    hex_digits = match.group(1)
    return "%" + hex_digits.upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes.

    Unicode input is encoded as UTF-8 before quoting; characters listed in
    HTTP_PATH_SAFE are left alone.

    """
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
643
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        # no dot at all, so h cannot have the form A.B
        return h
    # b is everything after the first dot (a, the part before it, is unused)
    b = h[first_dot+1:]
    b_has_dot = b.find(".") >= 0
    if is_HDN(h) and (b_has_dot or b == "local"):
        return "." + b
    return h
678
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
694
695
class Cookie:
    """HTTP Cookie.

    Represents both Netscape and RFC 2965 cookies.

    This class is deliberately minimal: it only stores attributes, and it
    is perfectly possible to build instances that violate the cookie
    standards.  Cookie objects are normally produced by
    CookieJar.make_cookies, which parses cookies, fills in defaults and
    normalises values to the representation used here.  Deciding whether a
    cookie should be accepted from, or returned to, a server is the job of
    CookiePolicy.

    Note that the port may be present in the headers but unspecified
    ("Port" rather than "Port=80", for example); in that case port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        # version and expires are stored as ints (or None).
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        # name/value and bookkeeping
        self.version = version
        self.name = name
        self.value = value
        self.rfc2109 = rfc2109

        # port
        self.port = port
        self.port_specified = port_specified

        # domain -- case is normalised, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Whether the domain cookie-attribute had an initial dot has to be
        # remembered in order to follow RFC 2965 (as clarified in draft
        # errata); it is needed for the returned $Domain value.
        self.domain_initial_dot = domain_initial_dot

        # path
        self.path = path
        self.path_specified = path_specified

        # remaining standard cookie-attributes
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url

        # non-standard cookie-attributes; copied so the caller's mapping is
        # not shared with this instance
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return whether non-standard cookie-attribute name is present."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never expires.

        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        limit = self.domain
        if self.port is not None:
            limit = limit + ":" + self.port
        limit = limit + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        names = ("version", "name", "value",
                 "port", "port_specified",
                 "domain", "domain_specified", "domain_initial_dot",
                 "path", "path_specified",
                 "secure", "expires", "discard", "comment", "comment_url",
                 )
        args = ["%s=%r" % (name, getattr(self, name)) for name in names]
        args.append("rest=%r" % (self._rest,))
        args.append("rfc2109=%r" % (self.rfc2109,))
        return "Cookie(%s)" % ", ".join(args)
792
793
class CookiePolicy:
    """Interface deciding which cookies are accepted and which are returned.

    A policy may also modify cookies, though this is probably a bad idea.

    The standard rules for Netscape and RFC 2965 cookies live in the
    subclass DefaultCookiePolicy -- override that one if you want a
    customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Pre-expired cookies currently never reach this method -- the
        CookieJar class deletes them itself.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server."""
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if no cookie with this cookie domain should be
        returned for request.  This base implementation always allows it.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if no cookie with this cookie path should be
        returned for request.  This base implementation always allows it.
        """
        return True
825
826
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    Each set_ok_*() / return_ok_*() method applies a single rule.  set_ok()
    and return_ok() look those methods up by name and run them in a fixed
    order, stopping at the first rule that rejects the cookie.

    The strict_ns_domain constructor argument is a bit mask built from the
    Domain* class constants below.

    """

    # Flags for strict_ns_domain (combine with |).
    # Netscape cookies, when setting: refuse the cookie if the part of the
    # request-host in front of the cookie domain contains a dot.
    DomainStrictNoDots = 1
    # Netscape cookies, when returning: a cookie that was set without an
    # explicit domain must string-compare equal to the effective request-host.
    DomainStrictNonDomain = 2
    # Netscape cookies, when setting: also require an RFC 2965 domain-match.
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only.

        blocked_domains is a sequence of domains that are never accepted or
        returned; allowed_domains, when not None, is the only set of domains
        that is accepted or returned.  Both are stored as tuples.  The other
        arguments are stored unchanged as attributes of the same name and
        are consulted by the individual set_ok_*() / return_ok_*() rules.

        """
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        # None (rather than an empty tuple) means "no allow-list in force".
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return true if domain matches any entry of the block-list."""
        for blocked_domain in self._blocked_domains:
            if user_domain_match(domain, blocked_domain):
                return True
        return False

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains
    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return true if an allow-list is set and domain matches none of it.

        When no allow-list is set (None) every domain is allowed, so this
        returns false.

        """
        if self._allowed_domains is None:
            return False
        for allowed_domain in self._allowed_domains:
            if user_domain_match(domain, allowed_domain):
                return False
        return True

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # Each rule lives in a method called set_ok_<rule>; the first rule
        # that rejects the cookie ends the check.
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Reject cookies with no version or with a switched-off protocol."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            debug("   Set-Cookie2 without version attribute (%s=%s)",
                  cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            debug("   Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Reject third-party cookies on unverifiable requests, if so set.

        Only applies when the matching strict_*_unverifiable option for the
        cookie's protocol version is switched on.

        """
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                debug("   third-party RFC 2965 cookie during "
                             "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                debug("   third-party Netscape cookie during "
                             "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Reject Netscape cookie names starting with '$', if so configured."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            debug("   illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Reject an explicit Path that does not path-match the request.

        Applied to RFC 2965 cookies always, and to Netscape cookies only when
        strict_ns_set_path is true.  The comparison is delegated to
        path_return_ok() so that setting and returning use the same rule.

        """
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not self.path_return_ok(cookie.path, request)):
                debug("   path attribute %s is not a prefix of request "
                      "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Apply the block/allow lists and the domain rules for setting.

        The remaining checks only run when the cookie carried an explicit
        domain cookie-attribute.

        """
        if self.is_blocked(cookie.domain):
            debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        debug("   country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                debug("   non-local domain %s contains no embedded dot",
                      domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    debug("   effective request-host %s (even with added "
                          "initial dot) does not end with %s",
                          erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    debug("   effective request-host %s does not domain-match "
                          "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                # What is left of the request-host once the cookie domain is
                # cut off its end must not itself contain a dot.
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    debug("   host prefix %s for domain %s contains a dot",
                          host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """Require the request port to be in an explicit Port list.

        Also rejects the cookie if any entry of the list examined before a
        match is found is not numeric.

        """
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    debug("   bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                debug("   request port (%s) not found in %s",
                      req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Refuse to return cookies whose protocol is switched off."""
        if cookie.version > 0 and not self.rfc2965:
            debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            debug("   Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Refuse third-party cookies on unverifiable requests, if so set."""
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                debug("   third-party RFC 2965 cookie during unverifiable "
                      "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                debug("   third-party Netscape cookie during unverifiable "
                      "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Return secure cookies only on requests whose type is "https"."""
        if cookie.secure and request.get_type() != "https":
            debug("   secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Refuse cookies that have expired as of self._now.

        self._now is not set by this class; CookieJar assigns it on the
        policy object before it consults the policy.

        """
        if cookie.is_expired(self._now):
            debug("   cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Require the request port to be listed in the cookie's port."""
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                # Compare as strings, exactly as set_ok_port() does.
                req_port = str(req_port)
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                debug("   request port %s does not match cookie port %s",
                      req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Check that the cookie's domain matches the request host."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # A host-only cookie is stored without a leading dot.  Compare with
        # the dot added, so that a cookie for "example.com" is not returned
        # to "pythonicexample.com" merely because the strings share a suffix.
        if domain and not domain.startswith("."):
            dotted_domain = "." + domain
        else:
            dotted_domain = domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            debug("   cookie with unspecified domain does not string-compare "
                  "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            debug("   effective request-host name %s does not domain-match "
                  "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotted_domain):
            debug("   request-host %s does not match Netscape cookie domain "
                  "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Cheap pre-check: could cookies stored under domain be returned?

        Returns false if the request host does not end with the domain on a
        label boundary, or if the domain is excluded by the block-list or
        allow-list.

        """
        # Liberal check of.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        # Dot the stored domain as well, so the suffix test can only succeed
        # on a label boundary ("example.com" must not match
        # "pythonicexample.com").
        if domain and not domain.startswith("."):
            dotted_domain = "." + domain
        else:
            dotted_domain = domain
        if not (req_host.endswith(dotted_domain) or
                erhn.endswith(dotted_domain)):
            #debug("   request domain %s does not match cookie domain %s",
            #      req_host, domain)
            return False

        if self.is_blocked(domain):
            debug("   domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            debug("   domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return true if the request path falls under the cookie path.

        The cookie path must either equal the request path or be a prefix of
        it that ends on a "/" boundary, so a cookie for "/foo" is returned
        for "/foo" and "/foo/bar" but not for "/foobar".

        """
        debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        if req_path == path:
            return True
        pathlen = len(path)
        if (req_path.startswith(path) and
            (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
            return True
        debug("  %s does not path-match %s", req_path, path)
        return False
1162
1163
def vals_sorted_by_key(adict):
    """Return a list of adict's values, ordered by ascending key."""
    # sorted() copes with any iterable of keys, and the comprehension always
    # builds a real list (callers iterate over and index the result).
    return [adict.get(key) for key in sorted(adict.keys())]
1168
def deepvalues(mapping):
    """Yield the leaf values of a nested mapping, depth-first.

    Values are visited in sorted key order at every level.  Any value that
    has an ``items`` attribute is treated as a nested mapping and descended
    into; every other value is yielded as it is.

    """
    for item in vals_sorted_by_key(mapping):
        try:
            item.items
        except AttributeError:
            is_leaf = True
        else:
            is_leaf = False

        if is_leaf:
            yield item
        else:
            for nested in deepvalues(item):
                yield nested
1184
1185
# Sentinel handed to dict.get() as the default value, so that a key which is
# missing can be told apart from a key that is present with the value None.
class Absent:
    pass
1189
1190class CookieJar:
1191    """Collection of HTTP cookies.
1192
1193    You may not need to know about this class: try
1194    urllib2.build_opener(HTTPCookieProcessor).open(url).
1195
1196    """
1197
    # Matches any non-word character.  _cookie_attrs() uses it to decide
    # whether an RFC 2965 cookie value needs escaping.
    non_word_re = re.compile(r"\W")
    # Matches a double quote or a backslash; _cookie_attrs() prefixes each
    # match with a backslash.
    quote_re = re.compile(r"([\"\\])")
    # NOTE(review): the next three patterns are not referenced in the part of
    # the class reviewed here -- confirm whether anything still uses them.
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    # Pattern text only (not compiled here); presumably the header line of
    # an LWP cookies file -- verify against the code that reads such files.
    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
1205
1206    def __init__(self, policy=None):
1207        if policy is None:
1208            policy = DefaultCookiePolicy()
1209        self._policy = policy
1210
1211        self._cookies_lock = _threading.RLock()
1212        self._cookies = {}
1213
1214    def set_policy(self, policy):
1215        self._policy = policy
1216
1217    def _cookies_for_domain(self, domain, request):
1218        cookies = []
1219        if not self._policy.domain_return_ok(domain, request):
1220            return []
1221        debug("Checking %s for cookies to return", domain)
1222        cookies_by_path = self._cookies[domain]
1223        for path in cookies_by_path.keys():
1224            if not self._policy.path_return_ok(path, request):
1225                continue
1226            cookies_by_name = cookies_by_path[path]
1227            for cookie in cookies_by_name.values():
1228                if not self._policy.return_ok(cookie, request):
1229                    debug("   not returning cookie")
1230                    continue
1231                debug("   it's a match")
1232                cookies.append(cookie)
1233        return cookies
1234
1235    def _cookies_for_request(self, request):
1236        """Return a list of cookies to be returned to server."""
1237        cookies = []
1238        for domain in self._cookies.keys():
1239            cookies.extend(self._cookies_for_domain(domain, request))
1240        return cookies
1241
1242    def _cookie_attrs(self, cookies):
1243        """Return a list of cookie-attributes to be returned to server.
1244
1245        like ['foo="bar"; $Path="/"', ...]
1246
1247        The $Version attribute is also added when appropriate (currently only
1248        once per request).
1249
1250        """
1251        # add cookies in order of most specific (ie. longest) path first
1252        def decreasing_size(a, b): return cmp(len(b.path), len(a.path))
1253        cookies.sort(decreasing_size)
1254
1255        version_set = False
1256
1257        attrs = []
1258        for cookie in cookies:
1259            # set version of Cookie header
1260            # XXX
1261            # What should it be if multiple matching Set-Cookie headers have
1262            #  different versions themselves?
1263            # Answer: there is no answer; was supposed to be settled by
1264            #  RFC 2965 errata, but that may never appear...
1265            version = cookie.version
1266            if not version_set:
1267                version_set = True
1268                if version > 0:
1269                    attrs.append("$Version=%s" % version)
1270
1271            # quote cookie value if necessary
1272            # (not for Netscape protocol, which already has any quotes
1273            #  intact, due to the poorly-specified Netscape Cookie: syntax)
1274            if ((cookie.value is not None) and
1275                self.non_word_re.search(cookie.value) and version > 0):
1276                value = self.quote_re.sub(r"\\\1", cookie.value)
1277            else:
1278                value = cookie.value
1279
1280            # add cookie-attributes to be returned in Cookie header
1281            if cookie.value is None:
1282                attrs.append(cookie.name)
1283            else:
1284                attrs.append("%s=%s" % (cookie.name, value))
1285            if version > 0:
1286                if cookie.path_specified:
1287                    attrs.append('$Path="%s"' % cookie.path)
1288                if cookie.domain.startswith("."):
1289                    domain = cookie.domain
1290                    if (not cookie.domain_initial_dot and
1291                        domain.startswith(".")):
1292                        domain = domain[1:]
1293                    attrs.append('$Domain="%s"' % domain)
1294                if cookie.port is not None:
1295                    p = "$Port"
1296                    if cookie.port_specified:
1297                        p = p + ('="%s"' % cookie.port)
1298                    attrs.append(p)
1299
1300        return attrs
1301
1302    def add_cookie_header(self, request):
1303        """Add correct Cookie: header to request (urllib2.Request object).
1304
1305        The Cookie2 header is also added unless policy.hide_cookie2 is true.
1306
1307        """
1308        debug("add_cookie_header")
1309        self._cookies_lock.acquire()
1310
1311        self._policy._now = self._now = int(time.time())
1312
1313        cookies = self._cookies_for_request(request)
1314
1315        attrs = self._cookie_attrs(cookies)
1316        if attrs:
1317            if not request.has_header("Cookie"):
1318                request.add_unredirected_header(
1319                    "Cookie", "; ".join(attrs))
1320
1321        # if necessary, advertise that we know RFC 2965
1322        if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1323            not request.has_header("Cookie2")):
1324            for cookie in cookies:
1325                if cookie.version != 1:
1326                    request.add_unredirected_header("Cookie2", '$Version="1"')
1327                    break
1328
1329        self._cookies_lock.release()
1330
1331        self.clear_expired_cookies()
1332
1333    def _normalized_cookie_tuples(self, attrs_set):
1334        """Return list of tuples containing normalised cookie information.
1335
1336        attrs_set is the list of lists of key,value pairs extracted from
1337        the Set-Cookie or Set-Cookie2 headers.
1338
1339        Tuples are name, value, standard, rest, where name and value are the
1340        cookie name and value, standard is a dictionary containing the standard
1341        cookie-attributes (discard, secure, version, expires or max-age,
1342        domain, path and port) and rest is a dictionary containing the rest of
1343        the cookie-attributes.
1344
1345        """
1346        cookie_tuples = []
1347
1348        boolean_attrs = "discard", "secure"
1349        value_attrs = ("version",
1350                       "expires", "max-age",
1351                       "domain", "path", "port",
1352                       "comment", "commenturl")
1353
1354        for cookie_attrs in attrs_set:
1355            name, value = cookie_attrs[0]
1356
1357            # Build dictionary of standard cookie-attributes (standard) and
1358            # dictionary of other cookie-attributes (rest).
1359
1360            # Note: expiry time is normalised to seconds since epoch.  V0
1361            # cookies should have the Expires cookie-attribute, and V1 cookies
1362            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1363            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1364            # accept either (but prefer Max-Age).
1365            max_age_set = False
1366
1367            bad_cookie = False
1368
1369            standard = {}
1370            rest = {}
1371            for k, v in cookie_attrs[1:]:
1372                lc = k.lower()
1373                # don't lose case distinction for unknown fields
1374                if lc in value_attrs or lc in boolean_attrs:
1375                    k = lc
1376                if k in boolean_attrs and v is None:
1377                    # boolean cookie-attribute is present, but has no value
1378                    # (like "discard", rather than "port=80")
1379                    v = True
1380                if k in standard:
1381                    # only first value is significant
1382                    continue
1383                if k == "domain":
1384                    if v is None:
1385                        debug("   missing value for domain attribute")
1386                        bad_cookie = True
1387                        break
1388                    # RFC 2965 section 3.3.3
1389                    v = v.lower()
1390                if k == "expires":
1391                    if max_age_set:
1392                        # Prefer max-age to expires (like Mozilla)
1393                        continue
1394                    if v is None:
1395                        debug("   missing or invalid value for expires "
1396                              "attribute: treating as session cookie")
1397                        continue
1398                if k == "max-age":
1399                    max_age_set = True
1400                    try:
1401                        v = int(v)
1402                    except ValueError:
1403                        debug("   missing or invalid (non-numeric) value for "
1404                              "max-age attribute")
1405                        bad_cookie = True
1406                        break
1407                    # convert RFC 2965 Max-Age to seconds since epoch
1408                    # XXX Strictly you're supposed to follow RFC 2616
1409                    #   age-calculation rules.  Remember that zero Max-Age is a
1410                    #   is a request to discard (old and new) cookie, though.
1411                    k = "expires"
1412                    v = self._now + v
1413                if (k in value_attrs) or (k in boolean_attrs):
1414                    if (v is None and
1415                        k not in ("port", "comment", "commenturl")):
1416                        debug("   missing value for %s attribute" % k)
1417                        bad_cookie = True
1418                        break
1419                    standard[k] = v
1420                else:
1421                    rest[k] = v
1422
1423            if bad_cookie:
1424                continue
1425
1426            cookie_tuples.append((name, value, standard, rest))
1427
1428        return cookie_tuples
1429
1430    def _cookie_from_cookie_tuple(self, tup, request):
1431        # standard is dict of standard cookie-attributes, rest is dict of the
1432        # rest of them
1433        name, value, standard, rest = tup
1434
1435        domain = standard.get("domain", Absent)
1436        path = standard.get("path", Absent)
1437        port = standard.get("port", Absent)
1438        expires = standard.get("expires", Absent)
1439
1440        # set the easy defaults
1441        version = standard.get("version", None)
1442        if version is not None: version = int(version)
1443        secure = standard.get("secure", False)
1444        # (discard is also set if expires is Absent)
1445        discard = standard.get("discard", False)
1446        comment = standard.get("comment", None)
1447        comment_url = standard.get("commenturl", None)
1448
1449        # set default path
1450        if path is not Absent and path != "":
1451            path_specified = True
1452            path = escape_path(path)
1453        else:
1454            path_specified = False
1455            path = request_path(request)
1456            i = path.rfind("/")
1457            if i != -1:
1458                if version == 0:
1459                    # Netscape spec parts company from reality here
1460                    path = path[:i]
1461                else:
1462                    path = path[:i+1]
1463            if len(path) == 0: path = "/"
1464
1465        # set default domain
1466        domain_specified = domain is not Absent
1467        # but first we have to remember whether it starts with a dot
1468        domain_initial_dot = False
1469        if domain_specified:
1470            domain_initial_dot = bool(domain.startswith("."))
1471        if domain is Absent:
1472            req_host, erhn = eff_request_host(request)
1473            domain = erhn
1474        elif not domain.startswith("."):
1475            domain = "."+domain
1476
1477        # set default port
1478        port_specified = False
1479        if port is not Absent:
1480            if port is None:
1481                # Port attr present, but has no value: default to request port.
1482                # Cookie should then only be sent back on that port.
1483                port = request_port(request)
1484            else:
1485                port_specified = True
1486                port = re.sub(r"\s+", "", port)
1487        else:
1488            # No port attr present.  Cookie can be sent back on any port.
1489            port = None
1490
1491        # set default expires and discard
1492        if expires is Absent:
1493            expires = None
1494            discard = True
1495        elif expires <= self._now:
1496            # Expiry date in past is request to delete cookie.  This can't be
1497            # in DefaultCookiePolicy, because can't delete cookies there.
1498            try:
1499                self.clear(domain, path, name)
1500            except KeyError:
1501                pass
1502            debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1503                  domain, path, name)
1504            return None
1505
1506        return Cookie(version,
1507                      name, value,
1508                      port, port_specified,
1509                      domain, domain_specified, domain_initial_dot,
1510                      path, path_specified,
1511                      secure,
1512                      expires,
1513                      discard,
1514                      comment,
1515                      comment_url,
1516                      rest)
1517
1518    def _cookies_from_attrs_set(self, attrs_set, request):
1519        cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1520
1521        cookies = []
1522        for tup in cookie_tuples:
1523            cookie = self._cookie_from_cookie_tuple(tup, request)
1524            if cookie: cookies.append(cookie)
1525        return cookies
1526
1527    def _process_rfc2109_cookies(self, cookies):
1528        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1529        if rfc2109_as_ns is None:
1530            rfc2109_as_ns = not self._policy.rfc2965
1531        for cookie in cookies:
1532            if cookie.version == 1:
1533                cookie.rfc2109 = True
1534                if rfc2109_as_ns:
1535                    # treat 2109 cookies as Netscape cookies rather than
1536                    # as RFC2965 cookies
1537                    cookie.version = 0
1538
1539    def make_cookies(self, response, request):
1540        """Return sequence of Cookie objects extracted from response object."""
1541        # get cookie-attributes for RFC 2965 and Netscape protocols
1542        headers = response.info()
1543        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
1544        ns_hdrs = headers.getheaders("Set-Cookie")
1545
1546        rfc2965 = self._policy.rfc2965
1547        netscape = self._policy.netscape
1548
1549        if ((not rfc2965_hdrs and not ns_hdrs) or
1550            (not ns_hdrs and not rfc2965) or
1551            (not rfc2965_hdrs and not netscape) or
1552            (not netscape and not rfc2965)):
1553            return []  # no relevant cookie headers: quick exit
1554
1555        try:
1556            cookies = self._cookies_from_attrs_set(
1557                split_header_words(rfc2965_hdrs), request)
1558        except:
1559            reraise_unmasked_exceptions()
1560            cookies = []
1561
1562        if ns_hdrs and netscape:
1563            try:
1564                # RFC 2109 and Netscape cookies
1565                ns_cookies = self._cookies_from_attrs_set(
1566                    parse_ns_headers(ns_hdrs), request)
1567            except:
1568                reraise_unmasked_exceptions()
1569                ns_cookies = []
1570            self._process_rfc2109_cookies(ns_cookies)
1571
1572            # Look for Netscape cookies (from Set-Cookie headers) that match
1573            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1574            # For each match, keep the RFC 2965 cookie and ignore the Netscape
1575            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
1576            # bundled in with the Netscape cookies for this purpose, which is
1577            # reasonable behaviour.
1578            if rfc2965:
1579                lookup = {}
1580                for cookie in cookies:
1581                    lookup[(cookie.domain, cookie.path, cookie.name)] = None
1582
1583                def no_matching_rfc2965(ns_cookie, lookup=lookup):
1584                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1585                    return key not in lookup
1586                ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1587
1588            if ns_cookies:
1589                cookies.extend(ns_cookies)
1590
1591        return cookies
1592
1593    def set_cookie_if_ok(self, cookie, request):
1594        """Set a cookie if policy says it's OK to do so."""
1595        self._cookies_lock.acquire()
1596        self._policy._now = self._now = int(time.time())
1597
1598        if self._policy.set_ok(cookie, request):
1599            self.set_cookie(cookie)
1600
1601        self._cookies_lock.release()
1602
1603    def set_cookie(self, cookie):
1604        """Set a cookie, without checking whether or not it should be set."""
1605        c = self._cookies
1606        self._cookies_lock.acquire()
1607        try:
1608            if cookie.domain not in c: c[cookie.domain] = {}
1609            c2 = c[cookie.domain]
1610            if cookie.path not in c2: c2[cookie.path] = {}
1611            c3 = c2[cookie.path]
1612            c3[cookie.name] = cookie
1613        finally:
1614            self._cookies_lock.release()
1615
1616    def extract_cookies(self, response, request):
1617        """Extract cookies from response, where allowable given the request."""
1618        debug("extract_cookies: %s", response.info())
1619        self._cookies_lock.acquire()
1620        self._policy._now = self._now = int(time.time())
1621
1622        for cookie in self.make_cookies(response, request):
1623            if self._policy.set_ok(cookie, request):
1624                debug(" setting cookie: %s", cookie)
1625                self.set_cookie(cookie)
1626        self._cookies_lock.release()
1627
1628    def clear(self, domain=None, path=None, name=None):
1629        """Clear some cookies.
1630
1631        Invoking this method without arguments will clear all cookies.  If
1632        given a single argument, only cookies belonging to that domain will be
1633        removed.  If given two arguments, cookies belonging to the specified
1634        path within that domain are removed.  If given three arguments, then
1635        the cookie with the specified name, path and domain is removed.
1636
1637        Raises KeyError if no matching cookie exists.
1638
1639        """
1640        if name is not None:
1641            if (domain is None) or (path is None):
1642                raise ValueError(
1643                    "domain and path must be given to remove a cookie by name")
1644            del self._cookies[domain][path][name]
1645        elif path is not None:
1646            if domain is None:
1647                raise ValueError(
1648                    "domain must be given to remove cookies by path")
1649            del self._cookies[domain][path]
1650        elif domain is not None:
1651            del self._cookies[domain]
1652        else:
1653            self._cookies = {}
1654
1655    def clear_session_cookies(self):
1656        """Discard all session cookies.
1657
1658        Note that the .save() method won't save session cookies anyway, unless
1659        you ask otherwise by passing a true ignore_discard argument.
1660
1661        """
1662        self._cookies_lock.acquire()
1663        for cookie in self:
1664            if cookie.discard:
1665                self.clear(cookie.domain, cookie.path, cookie.name)
1666        self._cookies_lock.release()
1667
1668    def clear_expired_cookies(self):
1669        """Discard all expired cookies.
1670
1671        You probably don't need to call this method: expired cookies are never
1672        sent back to the server (provided you're using DefaultCookiePolicy),
1673        this method is called by CookieJar itself every so often, and the
1674        .save() method won't save expired cookies anyway (unless you ask
1675        otherwise by passing a true ignore_expires argument).
1676
1677        """
1678        self._cookies_lock.acquire()
1679        now = time.time()
1680        for cookie in self:
1681            if cookie.is_expired(now):
1682                self.clear(cookie.domain, cookie.path, cookie.name)
1683        self._cookies_lock.release()
1684
1685    def __iter__(self):
1686        return deepvalues(self._cookies)
1687
1688    def __len__(self):
1689        """Return number of contained cookies."""
1690        i = 0
1691        for cookie in self: i = i + 1
1692        return i
1693
1694    def __repr__(self):
1695        r = []
1696        for cookie in self: r.append(repr(cookie))
1697        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1698
1699    def __str__(self):
1700        r = []
1701        for cookie in self: r.append(str(cookie))
1702        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1703
1704
# derives from IOError for backwards-compatibility with Python 2.4.0
class LoadError(IOError):
    """Failure to load cookies from a file (see FileCookieJar.revert())."""
1707
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class fixes the interface only: save() raises NotImplementedError
    and load() hands the opened file to self._really_load(), which is not
    defined here (presumably subclasses supply both -- confirm against the
    concrete jar classes).

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename, if given, must be string-like (ValueError otherwise); it is
        kept as self.filename and used as the default by load() and revert().
        delayload is stored as a bool in self.delayload; this class does not
        otherwise use it.  policy is passed on to CookieJar.__init__().

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-typed check: anything that can be concatenated with a
            # string is accepted; any failure at all means "not string-like".
            try:
                filename+""
            except:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented in this class: always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  The file is opened, passed to self._really_load()
        together with filename, ignore_discard and ignore_expires, and closed
        again whether or not loading succeeds.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  ignore_discard and ignore_expires are passed to load().

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        The cookies lock is held while the jar is emptied and reloaded, and
        is released even when loading fails.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        self._cookies_lock.acquire()
        try:
            # Keep a copy so a failed load can be rolled back.
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
        finally:
            # Previously skipped on failure, leaving the jar locked after
            # the exception was re-raised.
            self._cookies_lock.release()
1765
1766from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1767from _MozillaCookieJar import MozillaCookieJar
1768