1r"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
13
14                        CookieJar____
15                        /     \      \
16            FileCookieJar      \      \
17             /    |   \         \      \
18 MozillaCookieJar | LWPCookieJar \      \
19                  |               |      \
20                  |   ---MSIEBase |       \
21                  |  /      |     |        \
22                  | /   MSIEDBCookieJar BSDDBCookieJar
23                  |/
24               MSIECookieJar
25
26"""
27
28__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
29           'FileCookieJar', 'LWPCookieJar', 'lwp_cookie_str', 'LoadError',
30           'MozillaCookieJar']
31
32import re, urlparse, copy, time, urllib
33try:
34    import threading as _threading
35except ImportError:
36    import dummy_threading as _threading
37import httplib  # only for the default HTTP port
38from calendar import timegm
39
40debug = False   # set to True to enable debugging via the logging module
41logger = None
42
43def _debug(*args):
44    if not debug:
45        return
46    global logger
47    if not logger:
48        import logging
49        logger = logging.getLogger("cookielib")
50    return logger.debug(*args)
51
52
# The default HTTP port, kept as a string (converted from httplib.HTTP_PORT).
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
# Error text for the case where neither the caller nor the CookieJar
# instance supplied a filename.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
56
57def _warn_unhandled_exception():
58    # There are a few catch-all except: statements in this module, for
59    # catching input that's bad in unexpected ways.  Warn if any
60    # exceptions are caught there.
61    import warnings, traceback, StringIO
62    f = StringIO.StringIO()
63    traceback.print_exc(None, f)
64    msg = f.getvalue()
65    warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)
66
67
68# Date/time conversion
69# -----------------------------------------------------------------------------
70
71EPOCH_YEAR = 1970
72def _timegm(tt):
73    year, month, mday, hour, min, sec = tt[:6]
74    if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
75        (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
76        return timegm(tt)
77    else:
78        return None
79
# Abbreviated English weekday names, Monday first (the order of the weekday
# index in a time tuple), and month names, January first.
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lowercased month names, used for case-insensitive month lookup.
MONTHS_LOWER = [month.lower() for month in MONTHS]
85
def time2isoz(t=None):
    """Format t (seconds since the epoch) as an ISO-style UTC timestamp.

    When t is omitted or None, the current time is used.

    The result has the form "YYYY-MM-DD hh:mm:ssZ" and is expressed in
    Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    # year, month, day, hour, minute, second -- in the order the format uses
    fields = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % fields
102
def time2netscape(t=None):
    """Format t (seconds since the epoch) as a Netscape cookie date.

    When t is omitted or None, the current time is used.

    The result is expressed in GMT and looks like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    gmt = time.gmtime(t)
    day_name = DAYS[gmt[6]]          # item 6 is the weekday, Monday == 0
    month_name = MONTHS[gmt[1] - 1]  # item 1 is the month, January == 1
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        day_name, gmt[2], month_name, gmt[0], gmt[3], gmt[4], gmt[5])
118
119
# Zone names that denote a zero offset; only the keys are used.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric zone: optional sign, one or two hour digits, optional ":" and
# optional two minute digits.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the offset in seconds that the timezone string tz denotes.

    Names listed in UTC_ZONES give 0.  Numeric zones such as "-0800" or
    "+01:30" give the signed number of seconds.  Anything else gives None.

    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if match is None:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        return -seconds
    return seconds
136
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Combine parsed date/time fields into seconds since the epoch.

    All arguments are strings as captured by the date regular expressions
    (or None for absent clock/timezone fields):

    day, yr -- day of month and year; a year below 1000 is resolved to the
               century that puts it closest to the current year
    mon     -- month name (three-letter abbreviation, any case) or number
    hr, min, sec -- clock fields; None is treated as 0, and a fractional
               part in sec is truncated
    tz      -- timezone string understood by offset_from_tz_string, or None
               for UTC

    Returns None if the month or timezone is not recognised or the fields
    are outside the range accepted by _timegm.

    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if not 1 <= imon <= 12:
            return None
        mon = imon

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE captures fractional seconds (e.g. "29.5"), which int()
    # alone rejects with ValueError; go through float() and truncate.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year: start in the current century, then move one
        # century if that lands more than 50 years from the current year
        cur_yr = time.localtime(time.time())[0]
        cur_in_century = cur_yr % 100
        distance = cur_in_century - yr
        yr = yr + cur_yr - cur_in_century
        if abs(distance) > 50:
            if distance > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec))
    if t is None:
        return None

    # adjust time using timezone string, to get absolute time since epoch
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
189
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of text is unrecognized, the time is
    outside the representable range, or the timezone string is not
    recognized.  If the string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        day, mon_name, yr, hr, mins, sec = m.groups()
        try:
            mon = MONTHS_LOWER.index(mon_name.lower()) + 1
        except ValueError:
            # The strict pattern only checks the month's shape, so a
            # non-month such as "Foo" gets here: report it as unparseable
            # rather than letting the ValueError escape.
            return None
        # Seconds are converted with int() so that the result is an integer,
        # as documented and as returned by the loose parser below.
        return _timegm((int(yr), mon, int(day),
                        int(hr), int(mins), int(sec)))

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
267
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    Like http2time, but for dates written in the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Returns None when text does not match any of these forms.

    """
    # loose regexp parse, ignoring leading whitespace
    match = ISO_DATE_RE.search(text.lstrip())
    if match is None:
        return None  # bad format

    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    # tz is time zone specifier string
    yr, mon, day, hr, min, sec, tz, _ = match.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
312
313
314# Header parsing
315# -----------------------------------------------------------------------------
316
def unmatched(match):
    """Return the searched string with the text matched by match cut out."""
    subject = match.string
    begin, finish = match.span(0)
    return subject[:begin] + subject[finish:]
321
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Split header values into lists of (key, value) pairs.

    header_values is a sequence of header value strings (not a single
    string).  ",", ";" and "=" are understood, as are quoted values after
    "=".  Tokens separated only by spaces are handled as though ";" separated
    them, and several header values are handled as though they had been
    joined by "," into one.

    That makes the function suitable for header fields of the following
    shape (BNF as from the HTTP/1.1 specification, but with the requirement
    for tokens relaxed):

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    The result holds one list of pairs per header.  A bare token (one that is
    not part of a parameter) gets the value None.  Malformed headers are not
    guaranteed to be split the way you might hope.

    Examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            token = HEADER_TOKEN_RE.search(text)
            if token is None:
                stripped = text.lstrip()
                if stripped.startswith(","):
                    # concatenated headers, as per RFC 2616 section 4.2
                    text = stripped[1:]
                    if pairs:
                        result.append(pairs)
                    pairs = []
                else:
                    # skip junk
                    non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                    assert nr_junk_chars > 0, (
                        "split_header_words bug: '%s', '%s', %s" %
                        (orig_text, text, pairs))
                    text = non_junk
                continue

            # A token was found: consume it, then look for "=value".
            text = unmatched(token)
            name = token.group(1)
            quoted = HEADER_QUOTED_VALUE_RE.search(text)
            if quoted:
                # quoted value: drop the quotes and undo backslash escapes
                text = unmatched(quoted)
                value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
            else:
                plain = HEADER_VALUE_RE.search(text)
                if plain:
                    # unquoted value
                    text = unmatched(plain)
                    value = plain.group(1).rstrip()
                else:
                    # no value, a lone token
                    value = None
            pairs.append((name, value))
        if pairs:
            result.append(pairs)
    return result
410
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Build one header value from lists of (key, value) pairs.

    This is (nearly) the reverse of split_header_words.  Pairs within a list
    are joined with "; " and the lists themselves with ", ".  A pair whose
    value is None contributes just its key.  Values are quoted, with " and \\
    backslash-escaped, unless they consist solely of word characters.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        words = []
        for key, value in pairs:
            if value is None:
                words.append(key)
                continue
            if re.search(r"^\w+$", value) is None:
                # escape " and \, then wrap in quotes
                escaped = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
                value = '"%s"' % escaped
            words.append("%s=%s" % (key, value))
        if words:
            headers.append("; ".join(words))
    return ", ".join(headers)
436
437def _strip_quotes(text):
438    if text.startswith('"'):
439        text = text[1:]
440    if text.endswith('"'):
441        text = text[:-1]
442    return text
443
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    Old-style Netscape Set-Cookie headers may, for example, carry an unquoted
    "," inside the expires field, which split_header_words cannot cope with;
    hence this separate parser.

    Each header is split on ";".  The first parameter is the cookie's
    name/value and is kept as given; for later parameters, known attribute
    names are lowercased, a version value has surrounding quotes stripped,
    and an expires value is converted to seconds since the epoch (None if it
    cannot be parsed).  A ("version", "0") pair is appended when the header
    did not set a version.  Returns one list of (key, value) pairs per
    header that yielded any pairs.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for index, param in enumerate(ns_header.split(';')):
            key, sep, val = param.strip().partition('=')
            key = key.strip()

            if not key:
                if index == 0:
                    # no cookie name: give up on this header
                    break
                # nameless attribute: ignore it
                continue

            # allow for a distinction between present and empty and missing
            # altogether
            if sep:
                val = val.strip()
            else:
                val = None

            if index != 0:
                lowered = key.lower()
                if lowered in known_attrs:
                    key = lowered

                if key == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                    if val is not None:
                        val = _strip_quotes(val)
                elif key == "expires" and val is not None:
                    # convert expires date to seconds since epoch
                    val = http2time(_strip_quotes(val))  # None if invalid
            pairs.append((key, val))

        if not pairs:
            continue
        if not version_set:
            pairs.append(("version", "0"))
        result.append(pairs)

    return result
510
511
# Matches text ending in a dot followed by digits, which is taken to mean
# "looks like an IPv4 address".
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name.

    Text that ends like an IPv4 address, is empty, or begins or ends with a
    dot is not considered a host domain name.

    """
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text) or text == "":
        return False
    return not (text[0] == "." or text[-1] == ".")
527
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not B.startswith("."):
        # B does not have the form .B'
        return False
    if not A.endswith(B):
        # A does not have the form NB.  B must be a suffix of A: merely
        # occurring somewhere inside A (x.acme.com.evil.net vs .acme.com)
        # is not a match.  N is non-empty here because A != B.
        return False
    # Both A and B' must be host domain names.
    return is_HDN(A) and is_HDN(B[1:])
566
def liberal_is_HDN(text):
    """Return True if text is sort of like a host domain name.

    Used when accepting/blocking domains: anything that does not look like
    an IPv4 address (per IPV4_RE) qualifies.

    """
    return not IPV4_RE.search(text)
576
def user_domain_match(A, B):
    """Return True if domain A matches the block/accept-list entry B.

    A and B may be host domain names or IP addresses; the comparison is
    case-insensitive.  If B starts with a dot, A matches when it ends with
    B; otherwise A must equal B.  If either looks like an IP address, only
    equality counts.

    """
    A = A.lower()
    B = B.lower()
    if liberal_is_HDN(A) and liberal_is_HDN(B):
        if B.startswith("."):
            return A.endswith(B)
        return A == B
    # At least one side looks like an IP address: only equal IP addresses
    # match.
    return A == B
596
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the request's full URL, falling back to its
    "Host" header when the URL has no network location; a trailing ":port"
    is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
613
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the request-host
    contains no dot and does not look like an IP address; otherwise the two
    are the same.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
624
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path of the request's full URL is passed through escape_path and
    always starts with "/".

    """
    split_url = urlparse.urlsplit(request.get_full_url())
    path = escape_path(split_url.path)
    if path.startswith("/"):
        return path
    # fix bad RFC 2396 absoluteURI
    return "/" + path
634
def request_port(request):
    """Return the port of the request's host, as a string.

    DEFAULT_HTTP_PORT is returned when the host carries no ":port" part, and
    None (after a debug message) when the port is not numeric.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
648
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: "%" followed by two hex digits (captured as group 1).
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the matched %XX escape with its hex digits in upper case."""
    hex_digits = match.group(1)
    return "%" + hex_digits.upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes.

    Unicode paths are encoded as UTF-8 before quoting; characters listed in
    HTTP_PATH_SAFE are left unescaped.

    """
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
670
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: everything before it is A, everything after
    # it is B.
    _a, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
705
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
721
722
class Cookie:
    """HTTP Cookie.

    Instances represent Netscape as well as RFC 2965 cookies.

    The class is intentionally simple: it only stores attributes, and it will
    happily hold values that do not comply with the cookie standards.
    Parsing, defaulting and normalising into this representation is the job
    of the factory, CookieJar.make_cookies; deciding whether a cookie is
    accepted from or returned to a server is the job of CookiePolicy.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the given cookie-attributes.

        version and expires are converted with int() unless None, domain is
        lowercased, and rest (the mapping of nonstandard cookie-attributes)
        is shallow-copied.  Raises ValueError if port is None while
        port_specified is True.

        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return true if the nonstandard cookie-attribute name is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the nonstandard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the nonstandard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never counts as expired.

        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        if self.port is None:
            port_part = ""
        else:
            port_part = ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attr_names = ("version", "name", "value",
                      "port", "port_specified",
                      "domain", "domain_specified", "domain_initial_dot",
                      "path", "path_specified",
                      "secure", "expires", "discard", "comment", "comment_url",
                      )
        args = ["%s=%s" % (attr_name, repr(getattr(self, attr_name)))
                for attr_name in attr_names]
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
819
820
class CookiePolicy:
    """Decides which cookies are accepted from and returned to a server.

    A policy is also able to modify cookies, although doing so is probably a
    bad idea.

    The standard rules for Netscape and RFC 2965 cookies live in the
    subclass DefaultCookiePolicy; override that one for a customised policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Subclasses must implement this; the base version raises
        NotImplementedError.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Subclasses must implement this; the base version raises
        NotImplementedError.

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        The base version always returns True.

        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        The base version always returns True.

        """
        return True
852
853
854class DefaultCookiePolicy(CookiePolicy):
855    """Implements the standard rules for accepting and returning cookies."""
856
    # Individual flags for the strict_ns_domain setting.  The values are
    # distinct bits, so they can be combined with "|".
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    # Ready-made settings: no flags at all (the constructor's default for
    # strict_ns_domain), and the two "strict" flags combined.
    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
863
864    def __init__(self,
865                 blocked_domains=None, allowed_domains=None,
866                 netscape=True, rfc2965=False,
867                 rfc2109_as_netscape=None,
868                 hide_cookie2=False,
869                 strict_domain=False,
870                 strict_rfc2965_unverifiable=True,
871                 strict_ns_unverifiable=False,
872                 strict_ns_domain=DomainLiberal,
873                 strict_ns_set_initial_dollar=False,
874                 strict_ns_set_path=False,
875                 ):
876        """Constructor arguments should be passed as keyword arguments only."""
877        self.netscape = netscape
878        self.rfc2965 = rfc2965
879        self.rfc2109_as_netscape = rfc2109_as_netscape
880        self.hide_cookie2 = hide_cookie2
881        self.strict_domain = strict_domain
882        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
883        self.strict_ns_unverifiable = strict_ns_unverifiable
884        self.strict_ns_domain = strict_ns_domain
885        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
886        self.strict_ns_set_path = strict_ns_set_path
887
888        if blocked_domains is not None:
889            self._blocked_domains = tuple(blocked_domains)
890        else:
891            self._blocked_domains = ()
892
893        if allowed_domains is not None:
894            allowed_domains = tuple(allowed_domains)
895        self._allowed_domains = allowed_domains
896
897    def blocked_domains(self):
898        """Return the sequence of blocked domains (as a tuple)."""
899        return self._blocked_domains
900    def set_blocked_domains(self, blocked_domains):
901        """Set the sequence of blocked domains."""
902        self._blocked_domains = tuple(blocked_domains)
903
904    def is_blocked(self, domain):
905        for blocked_domain in self._blocked_domains:
906            if user_domain_match(domain, blocked_domain):
907                return True
908        return False
909
910    def allowed_domains(self):
911        """Return None, or the sequence of allowed domains (as a tuple)."""
912        return self._allowed_domains
913    def set_allowed_domains(self, allowed_domains):
914        """Set the sequence of allowed domains, or None."""
915        if allowed_domains is not None:
916            allowed_domains = tuple(allowed_domains)
917        self._allowed_domains = allowed_domains
918
919    def is_not_allowed(self, domain):
920        if self._allowed_domains is None:
921            return False
922        for allowed_domain in self._allowed_domains:
923            if user_domain_match(domain, allowed_domain):
924                return False
925        return True
926
927    def set_ok(self, cookie, request):
928        """
929        If you override .set_ok(), be sure to call this method.  If it returns
930        false, so should your subclass (assuming your subclass wants to be more
931        strict about which cookies to accept).
932
933        """
934        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
935
936        assert cookie.name is not None
937
938        for n in "version", "verifiability", "name", "path", "domain", "port":
939            fn_name = "set_ok_"+n
940            fn = getattr(self, fn_name)
941            if not fn(cookie, request):
942                return False
943
944        return True
945
946    def set_ok_version(self, cookie, request):
947        if cookie.version is None:
948            # Version is always set to 0 by parse_ns_headers if it's a Netscape
949            # cookie, so this must be an invalid RFC 2965 cookie.
950            _debug("   Set-Cookie2 without version attribute (%s=%s)",
951                   cookie.name, cookie.value)
952            return False
953        if cookie.version > 0 and not self.rfc2965:
954            _debug("   RFC 2965 cookies are switched off")
955            return False
956        elif cookie.version == 0 and not self.netscape:
957            _debug("   Netscape cookies are switched off")
958            return False
959        return True
960
961    def set_ok_verifiability(self, cookie, request):
962        if request.is_unverifiable() and is_third_party(request):
963            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
964                _debug("   third-party RFC 2965 cookie during "
965                             "unverifiable transaction")
966                return False
967            elif cookie.version == 0 and self.strict_ns_unverifiable:
968                _debug("   third-party Netscape cookie during "
969                             "unverifiable transaction")
970                return False
971        return True
972
973    def set_ok_name(self, cookie, request):
974        # Try and stop servers setting V0 cookies designed to hack other
975        # servers that know both V0 and V1 protocols.
976        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
977            cookie.name.startswith("$")):
978            _debug("   illegal name (starts with '$'): '%s'", cookie.name)
979            return False
980        return True
981
982    def set_ok_path(self, cookie, request):
983        if cookie.path_specified:
984            req_path = request_path(request)
985            if ((cookie.version > 0 or
986                 (cookie.version == 0 and self.strict_ns_set_path)) and
987                not req_path.startswith(cookie.path)):
988                _debug("   path attribute %s is not a prefix of request "
989                       "path %s", cookie.path, req_path)
990                return False
991        return True
992
    def set_ok_domain(self, cookie, request):
        """Return true if the cookie's domain is acceptable for request.

        The user block-list and allow-list are consulted first.  The
        remaining checks only run for cookies whose domain was given
        explicitly (cookie.domain_specified); each failing check logs a
        debug message and returns False.

        """
        # User-configured lists apply to every cookie, specified domain or not.
        if self.is_blocked(cookie.domain):
            _debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                # i is the last dot, j the dot before it; j == 0 means the
                # domain consists of exactly two labels after a leading dot.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    # Refuse a well-known second-level label combined with a
                    # two-letter top-level label.
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug("   country-code second level domain %s", domain)
                        return False
            # Ignoring one leading dot, the domain must contain a dot; the
            # literal ".local" is the only exception.
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug("   non-local domain %s contains no embedded dot",
                       domain)
                return False
            # Netscape cookies: the effective request host, as is or with a
            # dot prepended, must end with the cookie domain.
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug("   effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            # RFC 2965 cookies always, Netscape cookies only when the
            # DomainRFC2965Match flag is set: require domain_match().
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug("   effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            # RFC 2965 cookies always, Netscape cookies only when the
            # DomainStrictNoDots flag is set: what is left of the request
            # host after removing len(domain) trailing characters must not
            # contain a dot, unless IPV4_RE matches the request host.
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug("   host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True
1051
1052    def set_ok_port(self, cookie, request):
1053        if cookie.port_specified:
1054            req_port = request_port(request)
1055            if req_port is None:
1056                req_port = "80"
1057            else:
1058                req_port = str(req_port)
1059            for p in cookie.port.split(","):
1060                try:
1061                    int(p)
1062                except ValueError:
1063                    _debug("   bad port %s (not numeric)", p)
1064                    return False
1065                if p == req_port:
1066                    break
1067            else:
1068                _debug("   request port (%s) not found in %s",
1069                       req_port, cookie.port)
1070                return False
1071        return True
1072
1073    def return_ok(self, cookie, request):
1074        """
1075        If you override .return_ok(), be sure to call this method.  If it
1076        returns false, so should your subclass (assuming your subclass wants to
1077        be more strict about which cookies to return).
1078
1079        """
1080        # Path has already been checked by .path_return_ok(), and domain
1081        # blocking done by .domain_return_ok().
1082        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
1083
1084        for n in "version", "verifiability", "secure", "expires", "port", "domain":
1085            fn_name = "return_ok_"+n
1086            fn = getattr(self, fn_name)
1087            if not fn(cookie, request):
1088                return False
1089        return True
1090
1091    def return_ok_version(self, cookie, request):
1092        if cookie.version > 0 and not self.rfc2965:
1093            _debug("   RFC 2965 cookies are switched off")
1094            return False
1095        elif cookie.version == 0 and not self.netscape:
1096            _debug("   Netscape cookies are switched off")
1097            return False
1098        return True
1099
1100    def return_ok_verifiability(self, cookie, request):
1101        if request.is_unverifiable() and is_third_party(request):
1102            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
1103                _debug("   third-party RFC 2965 cookie during unverifiable "
1104                       "transaction")
1105                return False
1106            elif cookie.version == 0 and self.strict_ns_unverifiable:
1107                _debug("   third-party Netscape cookie during unverifiable "
1108                       "transaction")
1109                return False
1110        return True
1111
1112    def return_ok_secure(self, cookie, request):
1113        if cookie.secure and request.get_type() != "https":
1114            _debug("   secure cookie with non-secure request")
1115            return False
1116        return True
1117
1118    def return_ok_expires(self, cookie, request):
1119        if cookie.is_expired(self._now):
1120            _debug("   cookie expired")
1121            return False
1122        return True
1123
1124    def return_ok_port(self, cookie, request):
1125        if cookie.port:
1126            req_port = request_port(request)
1127            if req_port is None:
1128                req_port = "80"
1129            for p in cookie.port.split(","):
1130                if p == req_port:
1131                    break
1132            else:
1133                _debug("   request port %s does not match cookie port %s",
1134                       req_port, cookie.port)
1135                return False
1136        return True
1137
1138    def return_ok_domain(self, cookie, request):
1139        req_host, erhn = eff_request_host(request)
1140        domain = cookie.domain
1141
1142        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
1143        if (cookie.version == 0 and
1144            (self.strict_ns_domain & self.DomainStrictNonDomain) and
1145            not cookie.domain_specified and domain != erhn):
1146            _debug("   cookie with unspecified domain does not string-compare "
1147                   "equal to request domain")
1148            return False
1149
1150        if cookie.version > 0 and not domain_match(erhn, domain):
1151            _debug("   effective request-host name %s does not domain-match "
1152                   "RFC 2965 cookie domain %s", erhn, domain)
1153            return False
1154        if cookie.version == 0 and not ("."+erhn).endswith(domain):
1155            _debug("   request-host %s does not match Netscape cookie domain "
1156                   "%s", req_host, domain)
1157            return False
1158        return True
1159
1160    def domain_return_ok(self, domain, request):
1161        # Liberal check of.  This is here as an optimization to avoid
1162        # having to load lots of MSIE cookie files unless necessary.
1163        req_host, erhn = eff_request_host(request)
1164        if not req_host.startswith("."):
1165            req_host = "."+req_host
1166        if not erhn.startswith("."):
1167            erhn = "."+erhn
1168        if not (req_host.endswith(domain) or erhn.endswith(domain)):
1169            #_debug("   request domain %s does not match cookie domain %s",
1170            #       req_host, domain)
1171            return False
1172
1173        if self.is_blocked(domain):
1174            _debug("   domain %s is in user block-list", domain)
1175            return False
1176        if self.is_not_allowed(domain):
1177            _debug("   domain %s is not in user allow-list", domain)
1178            return False
1179
1180        return True
1181
1182    def path_return_ok(self, path, request):
1183        _debug("- checking cookie path=%s", path)
1184        req_path = request_path(request)
1185        if not req_path.startswith(path):
1186            _debug("  %s does not path-match %s", req_path, path)
1187            return False
1188        return True
1189
1190
def vals_sorted_by_key(adict):
    """Return the values of adict as a list, ordered by ascending key.

    sorted() is used rather than sorting the result of keys() in place, so
    this also works when keys() returns something other than a list (a dict
    view or any other iterable).

    """
    return [adict.get(key) for key in sorted(adict.keys())]
1195
def deepvalues(mapping):
    """Yield the leaf values of a nested mapping, depth-first.

    At every level the values are visited in sorted key order.  A value
    counts as a nested mapping when it has an "items" attribute; anything
    else is yielded as a leaf.

    """
    for item in vals_sorted_by_key(mapping):
        try:
            item.items
        except AttributeError:
            is_leaf = True
        else:
            is_leaf = False
        if is_leaf:
            yield item
        else:
            for nested in deepvalues(item):
                yield nested
1211
1212
# Sentinel passed as the default to dict.get(), so that a key that is
# missing can be told apart from a key that is present with value None.
class Absent:
    """Marker class meaning "no such key"; never instantiated here."""
1216
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib2.build_opener(HTTPCookieProcessor).open(url).

    """

    # A single non-word character; _cookie_attrs uses it to decide whether an
    # RFC 2965 cookie value needs escaping.
    non_word_re = re.compile(r"\W")
    # A double quote or a backslash; _cookie_attrs prefixes each match with a
    # backslash.
    quote_re = re.compile(r"([\"\\])")
    # An optional leading dot followed by a run of non-dot characters.
    strict_domain_re = re.compile(r"\.?[^.]*")
    # A run of non-dot characters.
    domain_re = re.compile(r"[^.]*")
    # One or more dots at the start of the string.
    dots_re = re.compile(r"^\.+")

    # Pattern source (a plain string, not compiled here) for a line starting
    # with "#LWP-Cookies-" followed by a dotted version number, which is
    # captured.
    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
1232
1233    def __init__(self, policy=None):
1234        if policy is None:
1235            policy = DefaultCookiePolicy()
1236        self._policy = policy
1237
1238        self._cookies_lock = _threading.RLock()
1239        self._cookies = {}
1240
1241    def set_policy(self, policy):
1242        self._policy = policy
1243
1244    def _cookies_for_domain(self, domain, request):
1245        cookies = []
1246        if not self._policy.domain_return_ok(domain, request):
1247            return []
1248        _debug("Checking %s for cookies to return", domain)
1249        cookies_by_path = self._cookies[domain]
1250        for path in cookies_by_path.keys():
1251            if not self._policy.path_return_ok(path, request):
1252                continue
1253            cookies_by_name = cookies_by_path[path]
1254            for cookie in cookies_by_name.values():
1255                if not self._policy.return_ok(cookie, request):
1256                    _debug("   not returning cookie")
1257                    continue
1258                _debug("   it's a match")
1259                cookies.append(cookie)
1260        return cookies
1261
1262    def _cookies_for_request(self, request):
1263        """Return a list of cookies to be returned to server."""
1264        cookies = []
1265        for domain in self._cookies.keys():
1266            cookies.extend(self._cookies_for_domain(domain, request))
1267        return cookies
1268
1269    def _cookie_attrs(self, cookies):
1270        """Return a list of cookie-attributes to be returned to server.
1271
1272        like ['foo="bar"; $Path="/"', ...]
1273
1274        The $Version attribute is also added when appropriate (currently only
1275        once per request).
1276
1277        """
1278        # add cookies in order of most specific (ie. longest) path first
1279        cookies.sort(key=lambda arg: len(arg.path), reverse=True)
1280
1281        version_set = False
1282
1283        attrs = []
1284        for cookie in cookies:
1285            # set version of Cookie header
1286            # XXX
1287            # What should it be if multiple matching Set-Cookie headers have
1288            #  different versions themselves?
1289            # Answer: there is no answer; was supposed to be settled by
1290            #  RFC 2965 errata, but that may never appear...
1291            version = cookie.version
1292            if not version_set:
1293                version_set = True
1294                if version > 0:
1295                    attrs.append("$Version=%s" % version)
1296
1297            # quote cookie value if necessary
1298            # (not for Netscape protocol, which already has any quotes
1299            #  intact, due to the poorly-specified Netscape Cookie: syntax)
1300            if ((cookie.value is not None) and
1301                self.non_word_re.search(cookie.value) and version > 0):
1302                value = self.quote_re.sub(r"\\\1", cookie.value)
1303            else:
1304                value = cookie.value
1305
1306            # add cookie-attributes to be returned in Cookie header
1307            if cookie.value is None:
1308                attrs.append(cookie.name)
1309            else:
1310                attrs.append("%s=%s" % (cookie.name, value))
1311            if version > 0:
1312                if cookie.path_specified:
1313                    attrs.append('$Path="%s"' % cookie.path)
1314                if cookie.domain.startswith("."):
1315                    domain = cookie.domain
1316                    if (not cookie.domain_initial_dot and
1317                        domain.startswith(".")):
1318                        domain = domain[1:]
1319                    attrs.append('$Domain="%s"' % domain)
1320                if cookie.port is not None:
1321                    p = "$Port"
1322                    if cookie.port_specified:
1323                        p = p + ('="%s"' % cookie.port)
1324                    attrs.append(p)
1325
1326        return attrs
1327
1328    def add_cookie_header(self, request):
1329        """Add correct Cookie: header to request (urllib2.Request object).
1330
1331        The Cookie2 header is also added unless policy.hide_cookie2 is true.
1332
1333        """
1334        _debug("add_cookie_header")
1335        self._cookies_lock.acquire()
1336        try:
1337
1338            self._policy._now = self._now = int(time.time())
1339
1340            cookies = self._cookies_for_request(request)
1341
1342            attrs = self._cookie_attrs(cookies)
1343            if attrs:
1344                if not request.has_header("Cookie"):
1345                    request.add_unredirected_header(
1346                        "Cookie", "; ".join(attrs))
1347
1348            # if necessary, advertise that we know RFC 2965
1349            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1350                not request.has_header("Cookie2")):
1351                for cookie in cookies:
1352                    if cookie.version != 1:
1353                        request.add_unredirected_header("Cookie2", '$Version="1"')
1354                        break
1355
1356        finally:
1357            self._cookies_lock.release()
1358
1359        self.clear_expired_cookies()
1360
1361    def _normalized_cookie_tuples(self, attrs_set):
1362        """Return list of tuples containing normalised cookie information.
1363
1364        attrs_set is the list of lists of key,value pairs extracted from
1365        the Set-Cookie or Set-Cookie2 headers.
1366
1367        Tuples are name, value, standard, rest, where name and value are the
1368        cookie name and value, standard is a dictionary containing the standard
1369        cookie-attributes (discard, secure, version, expires or max-age,
1370        domain, path and port) and rest is a dictionary containing the rest of
1371        the cookie-attributes.
1372
1373        """
1374        cookie_tuples = []
1375
1376        boolean_attrs = "discard", "secure"
1377        value_attrs = ("version",
1378                       "expires", "max-age",
1379                       "domain", "path", "port",
1380                       "comment", "commenturl")
1381
1382        for cookie_attrs in attrs_set:
1383            name, value = cookie_attrs[0]
1384
1385            # Build dictionary of standard cookie-attributes (standard) and
1386            # dictionary of other cookie-attributes (rest).
1387
1388            # Note: expiry time is normalised to seconds since epoch.  V0
1389            # cookies should have the Expires cookie-attribute, and V1 cookies
1390            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1391            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1392            # accept either (but prefer Max-Age).
1393            max_age_set = False
1394
1395            bad_cookie = False
1396
1397            standard = {}
1398            rest = {}
1399            for k, v in cookie_attrs[1:]:
1400                lc = k.lower()
1401                # don't lose case distinction for unknown fields
1402                if lc in value_attrs or lc in boolean_attrs:
1403                    k = lc
1404                if k in boolean_attrs and v is None:
1405                    # boolean cookie-attribute is present, but has no value
1406                    # (like "discard", rather than "port=80")
1407                    v = True
1408                if k in standard:
1409                    # only first value is significant
1410                    continue
1411                if k == "domain":
1412                    if v is None:
1413                        _debug("   missing value for domain attribute")
1414                        bad_cookie = True
1415                        break
1416                    # RFC 2965 section 3.3.3
1417                    v = v.lower()
1418                if k == "expires":
1419                    if max_age_set:
1420                        # Prefer max-age to expires (like Mozilla)
1421                        continue
1422                    if v is None:
1423                        _debug("   missing or invalid value for expires "
1424                              "attribute: treating as session cookie")
1425                        continue
1426                if k == "max-age":
1427                    max_age_set = True
1428                    try:
1429                        v = int(v)
1430                    except ValueError:
1431                        _debug("   missing or invalid (non-numeric) value for "
1432                              "max-age attribute")
1433                        bad_cookie = True
1434                        break
1435                    # convert RFC 2965 Max-Age to seconds since epoch
1436                    # XXX Strictly you're supposed to follow RFC 2616
1437                    #   age-calculation rules.  Remember that zero Max-Age
1438                    #   is a request to discard (old and new) cookie, though.
1439                    k = "expires"
1440                    v = self._now + v
1441                if (k in value_attrs) or (k in boolean_attrs):
1442                    if (v is None and
1443                        k not in ("port", "comment", "commenturl")):
1444                        _debug("   missing value for %s attribute" % k)
1445                        bad_cookie = True
1446                        break
1447                    standard[k] = v
1448                else:
1449                    rest[k] = v
1450
1451            if bad_cookie:
1452                continue
1453
1454            cookie_tuples.append((name, value, standard, rest))
1455
1456        return cookie_tuples
1457
    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one (name, value, standard, rest) tuple.

        Attributes missing from the standard dict are defaulted from request
        (path, domain, port).  Returns None, meaning "ignore this cookie",
        when the version is not an integer or when the expiry time is not
        later than self._now; in the latter case any stored cookie with the
        same domain, path and name is also cleared from the jar.

        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        # Absent (rather than None) marks attributes that were not sent at
        # all, since None is a legitimate stored value (e.g. a bare Port).
        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                return None  # invalid version, ignore cookie
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            # Default to the request path cut at its last "/".
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0: path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            # No domain attribute: use the effective request host as is.
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            # Explicit domains are always stored with a leading dot.
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                # Strip all whitespace from the port list.
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                # Nothing stored under that key: nothing to delete.
                pass
            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                   domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)
1549
1550    def _cookies_from_attrs_set(self, attrs_set, request):
1551        cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1552
1553        cookies = []
1554        for tup in cookie_tuples:
1555            cookie = self._cookie_from_cookie_tuple(tup, request)
1556            if cookie: cookies.append(cookie)
1557        return cookies
1558
1559    def _process_rfc2109_cookies(self, cookies):
1560        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1561        if rfc2109_as_ns is None:
1562            rfc2109_as_ns = not self._policy.rfc2965
1563        for cookie in cookies:
1564            if cookie.version == 1:
1565                cookie.rfc2109 = True
1566                if rfc2109_as_ns:
1567                    # treat 2109 cookies as Netscape cookies rather than
1568                    # as RFC2965 cookies
1569                    cookie.version = 0
1570
    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object.

        Set-Cookie2 headers are parsed with split_header_words() and
        Set-Cookie headers with parse_ns_headers(); the latter are only
        processed when the policy has netscape on.  When the policy has
        rfc2965 on, a Set-Cookie cookie is dropped if a Set-Cookie2 cookie
        with the same domain, path and name was found.  An exception raised
        while parsing either kind of header is reported through
        _warn_unhandled_exception() and yields no cookies of that kind.

        """
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        # Bail out when there is no header of a kind the policy accepts.
        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except Exception:
            # Deliberate catch-all: report the failure and carry on with no
            # RFC 2965 cookies.
            _warn_unhandled_exception()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except Exception:
                # Same best-effort handling as for the Set-Cookie2 headers.
                _warn_unhandled_exception()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                # Dict used as a set of (domain, path, name) keys.
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                def no_matching_rfc2965(ns_cookie, lookup=lookup):
                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
                    return key not in lookup
                ns_cookies = filter(no_matching_rfc2965, ns_cookies)

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies
1624
1625    def set_cookie_if_ok(self, cookie, request):
1626        """Set a cookie if policy says it's OK to do so."""
1627        self._cookies_lock.acquire()
1628        try:
1629            self._policy._now = self._now = int(time.time())
1630
1631            if self._policy.set_ok(cookie, request):
1632                self.set_cookie(cookie)
1633
1634
1635        finally:
1636            self._cookies_lock.release()
1637
1638    def set_cookie(self, cookie):
1639        """Set a cookie, without checking whether or not it should be set."""
1640        c = self._cookies
1641        self._cookies_lock.acquire()
1642        try:
1643            if cookie.domain not in c: c[cookie.domain] = {}
1644            c2 = c[cookie.domain]
1645            if cookie.path not in c2: c2[cookie.path] = {}
1646            c3 = c2[cookie.path]
1647            c3[cookie.name] = cookie
1648        finally:
1649            self._cookies_lock.release()
1650
1651    def extract_cookies(self, response, request):
1652        """Extract cookies from response, where allowable given the request."""
1653        _debug("extract_cookies: %s", response.info())
1654        self._cookies_lock.acquire()
1655        try:
1656            self._policy._now = self._now = int(time.time())
1657
1658            for cookie in self.make_cookies(response, request):
1659                if self._policy.set_ok(cookie, request):
1660                    _debug(" setting cookie: %s", cookie)
1661                    self.set_cookie(cookie)
1662        finally:
1663            self._cookies_lock.release()
1664
1665    def clear(self, domain=None, path=None, name=None):
1666        """Clear some cookies.
1667
1668        Invoking this method without arguments will clear all cookies.  If
1669        given a single argument, only cookies belonging to that domain will be
1670        removed.  If given two arguments, cookies belonging to the specified
1671        path within that domain are removed.  If given three arguments, then
1672        the cookie with the specified name, path and domain is removed.
1673
1674        Raises KeyError if no matching cookie exists.
1675
1676        """
1677        if name is not None:
1678            if (domain is None) or (path is None):
1679                raise ValueError(
1680                    "domain and path must be given to remove a cookie by name")
1681            del self._cookies[domain][path][name]
1682        elif path is not None:
1683            if domain is None:
1684                raise ValueError(
1685                    "domain must be given to remove cookies by path")
1686            del self._cookies[domain][path]
1687        elif domain is not None:
1688            del self._cookies[domain]
1689        else:
1690            self._cookies = {}
1691
1692    def clear_session_cookies(self):
1693        """Discard all session cookies.
1694
1695        Note that the .save() method won't save session cookies anyway, unless
1696        you ask otherwise by passing a true ignore_discard argument.
1697
1698        """
1699        self._cookies_lock.acquire()
1700        try:
1701            for cookie in self:
1702                if cookie.discard:
1703                    self.clear(cookie.domain, cookie.path, cookie.name)
1704        finally:
1705            self._cookies_lock.release()
1706
1707    def clear_expired_cookies(self):
1708        """Discard all expired cookies.
1709
1710        You probably don't need to call this method: expired cookies are never
1711        sent back to the server (provided you're using DefaultCookiePolicy),
1712        this method is called by CookieJar itself every so often, and the
1713        .save() method won't save expired cookies anyway (unless you ask
1714        otherwise by passing a true ignore_expires argument).
1715
1716        """
1717        self._cookies_lock.acquire()
1718        try:
1719            now = time.time()
1720            for cookie in self:
1721                if cookie.is_expired(now):
1722                    self.clear(cookie.domain, cookie.path, cookie.name)
1723        finally:
1724            self._cookies_lock.release()
1725
1726    def __iter__(self):
1727        return deepvalues(self._cookies)
1728
1729    def __len__(self):
1730        """Return number of contained cookies."""
1731        i = 0
1732        for cookie in self: i = i + 1
1733        return i
1734
1735    def __repr__(self):
1736        r = []
1737        for cookie in self: r.append(repr(cookie))
1738        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r))
1739
1740    def __str__(self):
1741        r = []
1742        for cookie in self: r.append(str(cookie))
1743        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r))
1744
1745
# LoadError is kept as an IOError subclass for backwards-compatibility with
# Python 2.4.0.
class LoadError(IOError):
    """Error raised when loading cookies from a file fails."""
1748
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    load() and revert() are implemented here in terms of a _really_load()
    method that this class does not define, and save() always raises
    NotImplementedError; presumably concrete subclasses supply both.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """Create a jar, optionally bound to a default cookie file.

        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename -- default file for load() and revert() when they are called
            without one; must be string-like if given
        delayload -- stored on the instance as a bool; not otherwise used by
            this class
        policy -- passed through to CookieJar.__init__

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-typed check: anything that can be concatenated with a
            # string is accepted as a filename.
            try:
                filename+""
            except Exception:
                # Deliberately not a bare except, so that exceptions outside
                # the Exception hierarchy (e.g. KeyboardInterrupt,
                # SystemExit) propagate instead of being reported as a bad
                # filename.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented in this class: always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  The file is opened and handed, together with the
        filename and the two ignore_* flags, to self._really_load(); it is
        closed afterwards even if loading fails.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        filename defaults to self.filename; ValueError is raised, before any
        cookie is touched, if neither is available.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        # Resolve the filename up front: a ValueError raised by load() after
        # the jar had been emptied would not be caught by the handler below,
        # and the old cookies would be lost.
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:

            # Keep a copy of the current cookies so they can be put back if
            # loading fails.
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise

        finally:
            self._cookies_lock.release()
1808
1809from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1810from _MozillaCookieJar import MozillaCookieJar
1811