cookielib.py revision e58334ae9e4a635794ff0605f125eec459b9b98f
1"""HTTP cookie handling for web clients. 2 3This module has (now fairly distant) origins in Gisle Aas' Perl module 4HTTP::Cookies, from the libwww-perl library. 5 6Docstrings, comments and debug strings in this code refer to the 7attributes of the HTTP cookie system as cookie-attributes, to distinguish 8them clearly from Python attributes. 9 10Class diagram (note that the classes which do not derive from 11FileCookieJar are not distributed with the Python standard library, but 12are available from http://wwwsearch.sf.net/): 13 14 CookieJar____ 15 / \ \ 16 FileCookieJar \ \ 17 / | \ \ \ 18 MozillaCookieJar | LWPCookieJar \ \ 19 | | \ 20 | ---MSIEBase | \ 21 | / | | \ 22 | / MSIEDBCookieJar BSDDBCookieJar 23 |/ 24 MSIECookieJar 25 26""" 27 28import sys, re, urlparse, copy, time, urllib, logging 29try: 30 import threading as _threading 31except ImportError: 32 import dummy_threading as _threading 33import httplib # only for the default HTTP port 34from calendar import timegm 35 36debug = logging.getLogger("cookielib").debug 37 38DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT) 39MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " 40 "instance initialised with one)") 41 42def reraise_unmasked_exceptions(unmasked=()): 43 # There are a few catch-all except: statements in this module, for 44 # catching input that's bad in unexpected ways. 45 # This function re-raises some exceptions we don't want to trap. 
46 unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError) 47 etype = sys.exc_info()[0] 48 if issubclass(etype, unmasked): 49 raise 50 # swallowed an exception 51 import warnings, traceback, StringIO 52 f = StringIO.StringIO() 53 traceback.print_exc(None, f) 54 msg = f.getvalue() 55 warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2) 56 57 58# Date/time conversion 59# ----------------------------------------------------------------------------- 60 61EPOCH_YEAR = 1970 62def _timegm(tt): 63 year, month, mday, hour, min, sec = tt[:6] 64 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and 65 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): 66 return timegm(tt) 67 else: 68 return None 69 70DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] 71MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", 72 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] 73MONTHS_LOWER = [] 74for month in MONTHS: MONTHS_LOWER.append(month.lower()) 75 76def time2isoz(t=None): 77 """Return a string representing time in seconds since epoch, t. 78 79 If the function is called without an argument, it will use the current 80 time. 81 82 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", 83 representing Universal Time (UTC, aka GMT). An example of this format is: 84 85 1994-11-24 08:49:37Z 86 87 """ 88 if t is None: t = time.time() 89 year, mon, mday, hour, min, sec = time.gmtime(t)[:6] 90 return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( 91 year, mon, mday, hour, min, sec) 92 93def time2netscape(t=None): 94 """Return a string representing time in seconds since epoch, t. 95 96 If the function is called without an argument, it will use the current 97 time. 
98 99 The format of the returned string is like this: 100 101 Wed, DD-Mon-YYYY HH:MM:SS GMT 102 103 """ 104 if t is None: t = time.time() 105 year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7] 106 return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % ( 107 DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec) 108 109 110UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} 111 112TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$") 113def offset_from_tz_string(tz): 114 offset = None 115 if tz in UTC_ZONES: 116 offset = 0 117 else: 118 m = TIMEZONE_RE.search(tz) 119 if m: 120 offset = 3600 * int(m.group(2)) 121 if m.group(3): 122 offset = offset + 60 * int(m.group(3)) 123 if m.group(1) == '-': 124 offset = -offset 125 return offset 126 127def _str2time(day, mon, yr, hr, min, sec, tz): 128 # translate month name to number 129 # month numbers start with 1 (January) 130 try: 131 mon = MONTHS_LOWER.index(mon.lower())+1 132 except ValueError: 133 # maybe it's already a number 134 try: 135 imon = int(mon) 136 except ValueError: 137 return None 138 if 1 <= imon <= 12: 139 mon = imon 140 else: 141 return None 142 143 # make sure clock elements are defined 144 if hr is None: hr = 0 145 if min is None: min = 0 146 if sec is None: sec = 0 147 148 yr = int(yr) 149 day = int(day) 150 hr = int(hr) 151 min = int(min) 152 sec = int(sec) 153 154 if yr < 1000: 155 # find "obvious" year 156 cur_yr = time.localtime(time.time())[0] 157 m = cur_yr % 100 158 tmp = yr 159 yr = yr + cur_yr - m 160 m = m - tmp 161 if abs(m) > 50: 162 if m > 0: yr = yr + 100 163 else: yr = yr - 100 164 165 # convert UTC time tuple to seconds since epoch (not timezone-adjusted) 166 t = _timegm((yr, mon, day, hr, min, sec, tz)) 167 168 if t is not None: 169 # adjust time using timezone string, to get absolute time since epoch 170 if tz is None: 171 tz = "UTC" 172 tz = tz.upper() 173 offset = offset_from_tz_string(tz) 174 if offset is None: 175 return None 176 t = t - offset 177 178 return t 179 
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # STRICT_DATE_RE only checks the month's shape, so something
            # like "Foo" gets this far; treat it as an unrecognized date.
            return None
        # seconds are plain digits here, so keep the result an integer
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)

ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)


# Header parsing
# -----------------------------------------------------------------------------

def unmatched(match):
    """Return unmatched part of re.Match object."""
    start, end = match.span(0)
    return match.string[:start]+match.string[end:]

HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        # Each pass consumes a token (with optional value), a separating
        # comma, or some junk from text, so the loop always makes progress.
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result

HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                if not re.search(r"^\w+$", v):
                    v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                k = "%s=%s" % (k, v)
            attr.append(k)
        if attr: headers.append("; ".join(attr))
    return ", ".join(headers)

def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns a list with one list of (key, value) pairs per non-empty header.
    Known attribute names are lowercased, the expires value is converted to
    seconds since the epoch (None if it is missing or invalid), and a
    ("version", "0") pair is appended when the header carries no version.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            # the first parameter is the cookie's own name=value pair
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch
                    if v.startswith('"'): v = v[1:]
                    if v.endswith('"'): v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result


IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means B is a suffix of A; merely containing B
    # somewhere (x.acme.com.evil.org vs .acme.com) must not match.  N is
    # non-empty here because A != B.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True

def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.

    """
    if IPV4_RE.search(text):
        return False
    return True

def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        if A == B:
            # equal IP addresses
            return True
        return False
    initial_dot = B.startswith(".")
    if initial_dot and A.endswith(B):
        return True
    if not initial_dot and A == B:
        return True
    return False

cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.get_full_url()
    host = urlparse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = cut_port_re.sub("", host, 1)
    return host.lower()

def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.

    """
    erhn = req_host = request_host(request)
    if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
        erhn = req_host + ".local"
    return req_host, erhn

def request_path(request):
    """request-URI, as defined by RFC 2965."""
    url = request.get_full_url()
    path, parameters, query, frag = urlparse.urlparse(url)[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    path = escape_path(path)
    req_path = urlparse.urlunparse(("", "", path, "", query, frag))
    if not req_path.startswith("/"):
        # fix bad RFC 2396 absoluteURI
        req_path = "/"+req_path
    return req_path

def request_port(request):
    """Return the port of the request's host, as a string.

    The port is whatever follows the first ":" in request.get_host();
    DEFAULT_HTTP_PORT is returned when there is no ":", and None when the
    port is not numeric.

    """
    host = request.get_host()
    i = host.find(':')
    if i >= 0:
        port = host[i+1:]
        try:
            int(port)
        except ValueError:
            debug("nonnumeric port: '%s'", port)
            return None
    else:
        port = DEFAULT_HTTP_PORT
    return port

# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
624HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" 625ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") 626def uppercase_escaped_char(match): 627 return "%%%s" % match.group(1).upper() 628def escape_path(path): 629 """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" 630 # There's no knowing what character encoding was used to create URLs 631 # containing %-escapes, but since we have to pick one to escape invalid 632 # path characters, we pick UTF-8, as recommended in the HTML 4.0 633 # specification: 634 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 635 # And here, kind of: draft-fielding-uri-rfc2396bis-03 636 # (And in draft IRI specification: draft-duerst-iri-05) 637 # (And here, for new URI schemes: RFC 2718) 638 if isinstance(path, unicode): 639 path = path.encode("utf-8") 640 path = urllib.quote(path, HTTP_PATH_SAFE) 641 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) 642 return path 643 644def reach(h): 645 """Return reach of host h, as defined by RFC 2965, section 1. 646 647 The reach R of a host name H is defined as follows: 648 649 * If 650 651 - H is the host domain name of a host; and, 652 653 - H has the form A.B; and 654 655 - A has no embedded (that is, interior) dots; and 656 657 - B has at least one embedded dot, or B is the string "local". 658 then the reach of H is .B. 659 660 * Otherwise, the reach of H is H. 661 662 >>> reach("www.acme.com") 663 '.acme.com' 664 >>> reach("acme.com") 665 'acme.com' 666 >>> reach("acme.local") 667 '.local' 668 669 """ 670 i = h.find(".") 671 if i >= 0: 672 #a = h[:i] # this line is only here to show what a is 673 b = h[i+1:] 674 i = b.find(".") 675 if is_HDN(h) and (i >= 0 or b == "local"): 676 return "."+b 677 return h 678 679def is_third_party(request): 680 """ 681 682 RFC 2965, section 3.3.6: 683 684 An unverifiable transaction is to a third-party host if its request- 685 host U does not domain-match the reach R of the request-host O in the 686 origin transaction. 
687 688 """ 689 req_host = request_host(request) 690 if not domain_match(req_host, reach(request.get_origin_req_host())): 691 return True 692 else: 693 return False 694 695 696class Cookie: 697 """HTTP Cookie. 698 699 This class represents both Netscape and RFC 2965 cookies. 700 701 This is deliberately a very simple class. It just holds attributes. It's 702 possible to construct Cookie instances that don't comply with the cookie 703 standards. CookieJar.make_cookies is the factory function for Cookie 704 objects -- it deals with cookie parsing, supplying defaults, and 705 normalising to the representation used in this class. CookiePolicy is 706 responsible for checking them to see whether they should be accepted from 707 and returned to the server. 708 709 Note that the port may be present in the headers, but unspecified ("Port" 710 rather than"Port=80", for example); if this is the case, port is None. 711 712 """ 713 714 def __init__(self, version, name, value, 715 port, port_specified, 716 domain, domain_specified, domain_initial_dot, 717 path, path_specified, 718 secure, 719 expires, 720 discard, 721 comment, 722 comment_url, 723 rest, 724 rfc2109=False, 725 ): 726 727 if version is not None: version = int(version) 728 if expires is not None: expires = int(expires) 729 if port is None and port_specified is True: 730 raise ValueError("if port is None, port_specified must be false") 731 732 self.version = version 733 self.name = name 734 self.value = value 735 self.port = port 736 self.port_specified = port_specified 737 # normalise case, as per RFC 2965 section 3.3.3 738 self.domain = domain.lower() 739 self.domain_specified = domain_specified 740 # Sigh. We need to know whether the domain given in the 741 # cookie-attribute had an initial dot, in order to follow RFC 2965 742 # (as clarified in draft errata). Needed for the returned $Domain 743 # value. 
744 self.domain_initial_dot = domain_initial_dot 745 self.path = path 746 self.path_specified = path_specified 747 self.secure = secure 748 self.expires = expires 749 self.discard = discard 750 self.comment = comment 751 self.comment_url = comment_url 752 self.rfc2109 = rfc2109 753 754 self._rest = copy.copy(rest) 755 756 def has_nonstandard_attr(self, name): 757 return name in self._rest 758 def get_nonstandard_attr(self, name, default=None): 759 return self._rest.get(name, default) 760 def set_nonstandard_attr(self, name, value): 761 self._rest[name] = value 762 763 def is_expired(self, now=None): 764 if now is None: now = time.time() 765 if (self.expires is not None) and (self.expires <= now): 766 return True 767 return False 768 769 def __str__(self): 770 if self.port is None: p = "" 771 else: p = ":"+self.port 772 limit = self.domain + p + self.path 773 if self.value is not None: 774 namevalue = "%s=%s" % (self.name, self.value) 775 else: 776 namevalue = self.name 777 return "<Cookie %s for %s>" % (namevalue, limit) 778 779 def __repr__(self): 780 args = [] 781 for name in ("version", "name", "value", 782 "port", "port_specified", 783 "domain", "domain_specified", "domain_initial_dot", 784 "path", "path_specified", 785 "secure", "expires", "discard", "comment", "comment_url", 786 ): 787 attr = getattr(self, name) 788 args.append("%s=%s" % (name, repr(attr))) 789 args.append("rest=%s" % repr(self._rest)) 790 args.append("rfc2109=%s" % repr(self.rfc2109)) 791 return "Cookie(%s)" % ", ".join(args) 792 793 794class CookiePolicy: 795 """Defines which cookies get accepted from and returned to server. 796 797 May also modify cookies, though this is probably a bad idea. 798 799 The subclass DefaultCookiePolicy defines the standard rules for Netscape 800 and RFC 2965 cookies -- override that if you want a customised policy. 801 802 """ 803 def set_ok(self, cookie, request): 804 """Return true if (and only if) cookie should be accepted from server. 
805 806 Currently, pre-expired cookies never get this far -- the CookieJar 807 class deletes such cookies itself. 808 809 """ 810 raise NotImplementedError() 811 812 def return_ok(self, cookie, request): 813 """Return true if (and only if) cookie should be returned to server.""" 814 raise NotImplementedError() 815 816 def domain_return_ok(self, domain, request): 817 """Return false if cookies should not be returned, given cookie domain. 818 """ 819 return True 820 821 def path_return_ok(self, path, request): 822 """Return false if cookies should not be returned, given cookie path. 823 """ 824 return True 825 826 827class DefaultCookiePolicy(CookiePolicy): 828 """Implements the standard rules for accepting and returning cookies.""" 829 830 DomainStrictNoDots = 1 831 DomainStrictNonDomain = 2 832 DomainRFC2965Match = 4 833 834 DomainLiberal = 0 835 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain 836 837 def __init__(self, 838 blocked_domains=None, allowed_domains=None, 839 netscape=True, rfc2965=False, 840 rfc2109_as_netscape=None, 841 hide_cookie2=False, 842 strict_domain=False, 843 strict_rfc2965_unverifiable=True, 844 strict_ns_unverifiable=False, 845 strict_ns_domain=DomainLiberal, 846 strict_ns_set_initial_dollar=False, 847 strict_ns_set_path=False, 848 ): 849 """Constructor arguments should be passed as keyword arguments only.""" 850 self.netscape = netscape 851 self.rfc2965 = rfc2965 852 self.rfc2109_as_netscape = rfc2109_as_netscape 853 self.hide_cookie2 = hide_cookie2 854 self.strict_domain = strict_domain 855 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable 856 self.strict_ns_unverifiable = strict_ns_unverifiable 857 self.strict_ns_domain = strict_ns_domain 858 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar 859 self.strict_ns_set_path = strict_ns_set_path 860 861 if blocked_domains is not None: 862 self._blocked_domains = tuple(blocked_domains) 863 else: 864 self._blocked_domains = () 865 866 if allowed_domains is not 
None: 867 allowed_domains = tuple(allowed_domains) 868 self._allowed_domains = allowed_domains 869 870 def blocked_domains(self): 871 """Return the sequence of blocked domains (as a tuple).""" 872 return self._blocked_domains 873 def set_blocked_domains(self, blocked_domains): 874 """Set the sequence of blocked domains.""" 875 self._blocked_domains = tuple(blocked_domains) 876 877 def is_blocked(self, domain): 878 for blocked_domain in self._blocked_domains: 879 if user_domain_match(domain, blocked_domain): 880 return True 881 return False 882 883 def allowed_domains(self): 884 """Return None, or the sequence of allowed domains (as a tuple).""" 885 return self._allowed_domains 886 def set_allowed_domains(self, allowed_domains): 887 """Set the sequence of allowed domains, or None.""" 888 if allowed_domains is not None: 889 allowed_domains = tuple(allowed_domains) 890 self._allowed_domains = allowed_domains 891 892 def is_not_allowed(self, domain): 893 if self._allowed_domains is None: 894 return False 895 for allowed_domain in self._allowed_domains: 896 if user_domain_match(domain, allowed_domain): 897 return False 898 return True 899 900 def set_ok(self, cookie, request): 901 """ 902 If you override .set_ok(), be sure to call this method. If it returns 903 false, so should your subclass (assuming your subclass wants to be more 904 strict about which cookies to accept). 905 906 """ 907 debug(" - checking cookie %s=%s", cookie.name, cookie.value) 908 909 assert cookie.name is not None 910 911 for n in "version", "verifiability", "name", "path", "domain", "port": 912 fn_name = "set_ok_"+n 913 fn = getattr(self, fn_name) 914 if not fn(cookie, request): 915 return False 916 917 return True 918 919 def set_ok_version(self, cookie, request): 920 if cookie.version is None: 921 # Version is always set to 0 by parse_ns_headers if it's a Netscape 922 # cookie, so this must be an invalid RFC 2965 cookie. 
923 debug(" Set-Cookie2 without version attribute (%s=%s)", 924 cookie.name, cookie.value) 925 return False 926 if cookie.version > 0 and not self.rfc2965: 927 debug(" RFC 2965 cookies are switched off") 928 return False 929 elif cookie.version == 0 and not self.netscape: 930 debug(" Netscape cookies are switched off") 931 return False 932 return True 933 934 def set_ok_verifiability(self, cookie, request): 935 if request.is_unverifiable() and is_third_party(request): 936 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 937 debug(" third-party RFC 2965 cookie during " 938 "unverifiable transaction") 939 return False 940 elif cookie.version == 0 and self.strict_ns_unverifiable: 941 debug(" third-party Netscape cookie during " 942 "unverifiable transaction") 943 return False 944 return True 945 946 def set_ok_name(self, cookie, request): 947 # Try and stop servers setting V0 cookies designed to hack other 948 # servers that know both V0 and V1 protocols. 949 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and 950 cookie.name.startswith("$")): 951 debug(" illegal name (starts with '$'): '%s'", cookie.name) 952 return False 953 return True 954 955 def set_ok_path(self, cookie, request): 956 if cookie.path_specified: 957 req_path = request_path(request) 958 if ((cookie.version > 0 or 959 (cookie.version == 0 and self.strict_ns_set_path)) and 960 not req_path.startswith(cookie.path)): 961 debug(" path attribute %s is not a prefix of request " 962 "path %s", cookie.path, req_path) 963 return False 964 return True 965 966 def set_ok_domain(self, cookie, request): 967 if self.is_blocked(cookie.domain): 968 debug(" domain %s is in user block-list", cookie.domain) 969 return False 970 if self.is_not_allowed(cookie.domain): 971 debug(" domain %s is not in user allow-list", cookie.domain) 972 return False 973 if cookie.domain_specified: 974 req_host, erhn = eff_request_host(request) 975 domain = cookie.domain 976 if self.strict_domain and 
(domain.count(".") >= 2): 977 # XXX This should probably be compared with the Konqueror 978 # (kcookiejar.cpp) and Mozilla implementations, but it's a 979 # losing battle. 980 i = domain.rfind(".") 981 j = domain.rfind(".", 0, i) 982 if j == 0: # domain like .foo.bar 983 tld = domain[i+1:] 984 sld = domain[j+1:i] 985 if sld.lower() in ("co", "ac", "com", "edu", "org", "net", 986 "gov", "mil", "int", "aero", "biz", "cat", "coop", 987 "info", "jobs", "mobi", "museum", "name", "pro", 988 "travel", "eu") and len(tld) == 2: 989 # domain like .co.uk 990 debug(" country-code second level domain %s", domain) 991 return False 992 if domain.startswith("."): 993 undotted_domain = domain[1:] 994 else: 995 undotted_domain = domain 996 embedded_dots = (undotted_domain.find(".") >= 0) 997 if not embedded_dots and domain != ".local": 998 debug(" non-local domain %s contains no embedded dot", 999 domain) 1000 return False 1001 if cookie.version == 0: 1002 if (not erhn.endswith(domain) and 1003 (not erhn.startswith(".") and 1004 not ("."+erhn).endswith(domain))): 1005 debug(" effective request-host %s (even with added " 1006 "initial dot) does not end end with %s", 1007 erhn, domain) 1008 return False 1009 if (cookie.version > 0 or 1010 (self.strict_ns_domain & self.DomainRFC2965Match)): 1011 if not domain_match(erhn, domain): 1012 debug(" effective request-host %s does not domain-match " 1013 "%s", erhn, domain) 1014 return False 1015 if (cookie.version > 0 or 1016 (self.strict_ns_domain & self.DomainStrictNoDots)): 1017 host_prefix = req_host[:-len(domain)] 1018 if (host_prefix.find(".") >= 0 and 1019 not IPV4_RE.search(req_host)): 1020 debug(" host prefix %s for domain %s contains a dot", 1021 host_prefix, domain) 1022 return False 1023 return True 1024 1025 def set_ok_port(self, cookie, request): 1026 if cookie.port_specified: 1027 req_port = request_port(request) 1028 if req_port is None: 1029 req_port = "80" 1030 else: 1031 req_port = str(req_port) 1032 for p in 
cookie.port.split(","): 1033 try: 1034 int(p) 1035 except ValueError: 1036 debug(" bad port %s (not numeric)", p) 1037 return False 1038 if p == req_port: 1039 break 1040 else: 1041 debug(" request port (%s) not found in %s", 1042 req_port, cookie.port) 1043 return False 1044 return True 1045 1046 def return_ok(self, cookie, request): 1047 """ 1048 If you override .return_ok(), be sure to call this method. If it 1049 returns false, so should your subclass (assuming your subclass wants to 1050 be more strict about which cookies to return). 1051 1052 """ 1053 # Path has already been checked by .path_return_ok(), and domain 1054 # blocking done by .domain_return_ok(). 1055 debug(" - checking cookie %s=%s", cookie.name, cookie.value) 1056 1057 for n in "version", "verifiability", "secure", "expires", "port", "domain": 1058 fn_name = "return_ok_"+n 1059 fn = getattr(self, fn_name) 1060 if not fn(cookie, request): 1061 return False 1062 return True 1063 1064 def return_ok_version(self, cookie, request): 1065 if cookie.version > 0 and not self.rfc2965: 1066 debug(" RFC 2965 cookies are switched off") 1067 return False 1068 elif cookie.version == 0 and not self.netscape: 1069 debug(" Netscape cookies are switched off") 1070 return False 1071 return True 1072 1073 def return_ok_verifiability(self, cookie, request): 1074 if request.is_unverifiable() and is_third_party(request): 1075 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 1076 debug(" third-party RFC 2965 cookie during unverifiable " 1077 "transaction") 1078 return False 1079 elif cookie.version == 0 and self.strict_ns_unverifiable: 1080 debug(" third-party Netscape cookie during unverifiable " 1081 "transaction") 1082 return False 1083 return True 1084 1085 def return_ok_secure(self, cookie, request): 1086 if cookie.secure and request.get_type() != "https": 1087 debug(" secure cookie with non-secure request") 1088 return False 1089 return True 1090 1091 def return_ok_expires(self, cookie, request): 1092 
if cookie.is_expired(self._now): 1093 debug(" cookie expired") 1094 return False 1095 return True 1096 1097 def return_ok_port(self, cookie, request): 1098 if cookie.port: 1099 req_port = request_port(request) 1100 if req_port is None: 1101 req_port = "80" 1102 for p in cookie.port.split(","): 1103 if p == req_port: 1104 break 1105 else: 1106 debug(" request port %s does not match cookie port %s", 1107 req_port, cookie.port) 1108 return False 1109 return True 1110 1111 def return_ok_domain(self, cookie, request): 1112 req_host, erhn = eff_request_host(request) 1113 domain = cookie.domain 1114 1115 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't 1116 if (cookie.version == 0 and 1117 (self.strict_ns_domain & self.DomainStrictNonDomain) and 1118 not cookie.domain_specified and domain != erhn): 1119 debug(" cookie with unspecified domain does not string-compare " 1120 "equal to request domain") 1121 return False 1122 1123 if cookie.version > 0 and not domain_match(erhn, domain): 1124 debug(" effective request-host name %s does not domain-match " 1125 "RFC 2965 cookie domain %s", erhn, domain) 1126 return False 1127 if cookie.version == 0 and not ("."+erhn).endswith(domain): 1128 debug(" request-host %s does not match Netscape cookie domain " 1129 "%s", req_host, domain) 1130 return False 1131 return True 1132 1133 def domain_return_ok(self, domain, request): 1134 # Liberal check of. This is here as an optimization to avoid 1135 # having to load lots of MSIE cookie files unless necessary. 
1136 req_host, erhn = eff_request_host(request) 1137 if not req_host.startswith("."): 1138 req_host = "."+req_host 1139 if not erhn.startswith("."): 1140 erhn = "."+erhn 1141 if not (req_host.endswith(domain) or erhn.endswith(domain)): 1142 #debug(" request domain %s does not match cookie domain %s", 1143 # req_host, domain) 1144 return False 1145 1146 if self.is_blocked(domain): 1147 debug(" domain %s is in user block-list", domain) 1148 return False 1149 if self.is_not_allowed(domain): 1150 debug(" domain %s is not in user allow-list", domain) 1151 return False 1152 1153 return True 1154 1155 def path_return_ok(self, path, request): 1156 debug("- checking cookie path=%s", path) 1157 req_path = request_path(request) 1158 if not req_path.startswith(path): 1159 debug(" %s does not path-match %s", req_path, path) 1160 return False 1161 return True 1162 1163 1164def vals_sorted_by_key(adict): 1165 keys = adict.keys() 1166 keys.sort() 1167 return map(adict.get, keys) 1168 1169def deepvalues(mapping): 1170 """Iterates over nested mapping, depth-first, in sorted order by key.""" 1171 values = vals_sorted_by_key(mapping) 1172 for obj in values: 1173 mapping = False 1174 try: 1175 obj.items 1176 except AttributeError: 1177 pass 1178 else: 1179 mapping = True 1180 for subobj in deepvalues(obj): 1181 yield subobj 1182 if not mapping: 1183 yield obj 1184 1185 1186# Used as second parameter to dict.get() method, to distinguish absent 1187# dict key from one with a None value. 1188class Absent: pass 1189 1190class CookieJar: 1191 """Collection of HTTP cookies. 1192 1193 You may not need to know about this class: try 1194 urllib2.build_opener(HTTPCookieProcessor).open(url). 
1195 1196 """ 1197 1198 non_word_re = re.compile(r"\W") 1199 quote_re = re.compile(r"([\"\\])") 1200 strict_domain_re = re.compile(r"\.?[^.]*") 1201 domain_re = re.compile(r"[^.]*") 1202 dots_re = re.compile(r"^\.+") 1203 1204 magic_re = r"^\#LWP-Cookies-(\d+\.\d+)" 1205 1206 def __init__(self, policy=None): 1207 if policy is None: 1208 policy = DefaultCookiePolicy() 1209 self._policy = policy 1210 1211 self._cookies_lock = _threading.RLock() 1212 self._cookies = {} 1213 1214 def set_policy(self, policy): 1215 self._policy = policy 1216 1217 def _cookies_for_domain(self, domain, request): 1218 cookies = [] 1219 if not self._policy.domain_return_ok(domain, request): 1220 return [] 1221 debug("Checking %s for cookies to return", domain) 1222 cookies_by_path = self._cookies[domain] 1223 for path in cookies_by_path.keys(): 1224 if not self._policy.path_return_ok(path, request): 1225 continue 1226 cookies_by_name = cookies_by_path[path] 1227 for cookie in cookies_by_name.values(): 1228 if not self._policy.return_ok(cookie, request): 1229 debug(" not returning cookie") 1230 continue 1231 debug(" it's a match") 1232 cookies.append(cookie) 1233 return cookies 1234 1235 def _cookies_for_request(self, request): 1236 """Return a list of cookies to be returned to server.""" 1237 cookies = [] 1238 for domain in self._cookies.keys(): 1239 cookies.extend(self._cookies_for_domain(domain, request)) 1240 return cookies 1241 1242 def _cookie_attrs(self, cookies): 1243 """Return a list of cookie-attributes to be returned to server. 1244 1245 like ['foo="bar"; $Path="/"', ...] 1246 1247 The $Version attribute is also added when appropriate (currently only 1248 once per request). 1249 1250 """ 1251 # add cookies in order of most specific (ie. 
longest) path first 1252 def decreasing_size(a, b): return cmp(len(b.path), len(a.path)) 1253 cookies.sort(decreasing_size) 1254 1255 version_set = False 1256 1257 attrs = [] 1258 for cookie in cookies: 1259 # set version of Cookie header 1260 # XXX 1261 # What should it be if multiple matching Set-Cookie headers have 1262 # different versions themselves? 1263 # Answer: there is no answer; was supposed to be settled by 1264 # RFC 2965 errata, but that may never appear... 1265 version = cookie.version 1266 if not version_set: 1267 version_set = True 1268 if version > 0: 1269 attrs.append("$Version=%s" % version) 1270 1271 # quote cookie value if necessary 1272 # (not for Netscape protocol, which already has any quotes 1273 # intact, due to the poorly-specified Netscape Cookie: syntax) 1274 if ((cookie.value is not None) and 1275 self.non_word_re.search(cookie.value) and version > 0): 1276 value = self.quote_re.sub(r"\\\1", cookie.value) 1277 else: 1278 value = cookie.value 1279 1280 # add cookie-attributes to be returned in Cookie header 1281 if cookie.value is None: 1282 attrs.append(cookie.name) 1283 else: 1284 attrs.append("%s=%s" % (cookie.name, value)) 1285 if version > 0: 1286 if cookie.path_specified: 1287 attrs.append('$Path="%s"' % cookie.path) 1288 if cookie.domain.startswith("."): 1289 domain = cookie.domain 1290 if (not cookie.domain_initial_dot and 1291 domain.startswith(".")): 1292 domain = domain[1:] 1293 attrs.append('$Domain="%s"' % domain) 1294 if cookie.port is not None: 1295 p = "$Port" 1296 if cookie.port_specified: 1297 p = p + ('="%s"' % cookie.port) 1298 attrs.append(p) 1299 1300 return attrs 1301 1302 def add_cookie_header(self, request): 1303 """Add correct Cookie: header to request (urllib2.Request object). 1304 1305 The Cookie2 header is also added unless policy.hide_cookie2 is true. 
1306 1307 """ 1308 debug("add_cookie_header") 1309 self._cookies_lock.acquire() 1310 1311 self._policy._now = self._now = int(time.time()) 1312 1313 cookies = self._cookies_for_request(request) 1314 1315 attrs = self._cookie_attrs(cookies) 1316 if attrs: 1317 if not request.has_header("Cookie"): 1318 request.add_unredirected_header( 1319 "Cookie", "; ".join(attrs)) 1320 1321 # if necessary, advertise that we know RFC 2965 1322 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and 1323 not request.has_header("Cookie2")): 1324 for cookie in cookies: 1325 if cookie.version != 1: 1326 request.add_unredirected_header("Cookie2", '$Version="1"') 1327 break 1328 1329 self._cookies_lock.release() 1330 1331 self.clear_expired_cookies() 1332 1333 def _normalized_cookie_tuples(self, attrs_set): 1334 """Return list of tuples containing normalised cookie information. 1335 1336 attrs_set is the list of lists of key,value pairs extracted from 1337 the Set-Cookie or Set-Cookie2 headers. 1338 1339 Tuples are name, value, standard, rest, where name and value are the 1340 cookie name and value, standard is a dictionary containing the standard 1341 cookie-attributes (discard, secure, version, expires or max-age, 1342 domain, path and port) and rest is a dictionary containing the rest of 1343 the cookie-attributes. 1344 1345 """ 1346 cookie_tuples = [] 1347 1348 boolean_attrs = "discard", "secure" 1349 value_attrs = ("version", 1350 "expires", "max-age", 1351 "domain", "path", "port", 1352 "comment", "commenturl") 1353 1354 for cookie_attrs in attrs_set: 1355 name, value = cookie_attrs[0] 1356 1357 # Build dictionary of standard cookie-attributes (standard) and 1358 # dictionary of other cookie-attributes (rest). 1359 1360 # Note: expiry time is normalised to seconds since epoch. 
V0 1361 # cookies should have the Expires cookie-attribute, and V1 cookies 1362 # should have Max-Age, but since V1 includes RFC 2109 cookies (and 1363 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we 1364 # accept either (but prefer Max-Age). 1365 max_age_set = False 1366 1367 bad_cookie = False 1368 1369 standard = {} 1370 rest = {} 1371 for k, v in cookie_attrs[1:]: 1372 lc = k.lower() 1373 # don't lose case distinction for unknown fields 1374 if lc in value_attrs or lc in boolean_attrs: 1375 k = lc 1376 if k in boolean_attrs and v is None: 1377 # boolean cookie-attribute is present, but has no value 1378 # (like "discard", rather than "port=80") 1379 v = True 1380 if k in standard: 1381 # only first value is significant 1382 continue 1383 if k == "domain": 1384 if v is None: 1385 debug(" missing value for domain attribute") 1386 bad_cookie = True 1387 break 1388 # RFC 2965 section 3.3.3 1389 v = v.lower() 1390 if k == "expires": 1391 if max_age_set: 1392 # Prefer max-age to expires (like Mozilla) 1393 continue 1394 if v is None: 1395 debug(" missing or invalid value for expires " 1396 "attribute: treating as session cookie") 1397 continue 1398 if k == "max-age": 1399 max_age_set = True 1400 try: 1401 v = int(v) 1402 except ValueError: 1403 debug(" missing or invalid (non-numeric) value for " 1404 "max-age attribute") 1405 bad_cookie = True 1406 break 1407 # convert RFC 2965 Max-Age to seconds since epoch 1408 # XXX Strictly you're supposed to follow RFC 2616 1409 # age-calculation rules. Remember that zero Max-Age is a 1410 # is a request to discard (old and new) cookie, though. 
1411 k = "expires" 1412 v = self._now + v 1413 if (k in value_attrs) or (k in boolean_attrs): 1414 if (v is None and 1415 k not in ("port", "comment", "commenturl")): 1416 debug(" missing value for %s attribute" % k) 1417 bad_cookie = True 1418 break 1419 standard[k] = v 1420 else: 1421 rest[k] = v 1422 1423 if bad_cookie: 1424 continue 1425 1426 cookie_tuples.append((name, value, standard, rest)) 1427 1428 return cookie_tuples 1429 1430 def _cookie_from_cookie_tuple(self, tup, request): 1431 # standard is dict of standard cookie-attributes, rest is dict of the 1432 # rest of them 1433 name, value, standard, rest = tup 1434 1435 domain = standard.get("domain", Absent) 1436 path = standard.get("path", Absent) 1437 port = standard.get("port", Absent) 1438 expires = standard.get("expires", Absent) 1439 1440 # set the easy defaults 1441 version = standard.get("version", None) 1442 if version is not None: version = int(version) 1443 secure = standard.get("secure", False) 1444 # (discard is also set if expires is Absent) 1445 discard = standard.get("discard", False) 1446 comment = standard.get("comment", None) 1447 comment_url = standard.get("commenturl", None) 1448 1449 # set default path 1450 if path is not Absent and path != "": 1451 path_specified = True 1452 path = escape_path(path) 1453 else: 1454 path_specified = False 1455 path = request_path(request) 1456 i = path.rfind("/") 1457 if i != -1: 1458 if version == 0: 1459 # Netscape spec parts company from reality here 1460 path = path[:i] 1461 else: 1462 path = path[:i+1] 1463 if len(path) == 0: path = "/" 1464 1465 # set default domain 1466 domain_specified = domain is not Absent 1467 # but first we have to remember whether it starts with a dot 1468 domain_initial_dot = False 1469 if domain_specified: 1470 domain_initial_dot = bool(domain.startswith(".")) 1471 if domain is Absent: 1472 req_host, erhn = eff_request_host(request) 1473 domain = erhn 1474 elif not domain.startswith("."): 1475 domain = "."+domain 1476 
1477 # set default port 1478 port_specified = False 1479 if port is not Absent: 1480 if port is None: 1481 # Port attr present, but has no value: default to request port. 1482 # Cookie should then only be sent back on that port. 1483 port = request_port(request) 1484 else: 1485 port_specified = True 1486 port = re.sub(r"\s+", "", port) 1487 else: 1488 # No port attr present. Cookie can be sent back on any port. 1489 port = None 1490 1491 # set default expires and discard 1492 if expires is Absent: 1493 expires = None 1494 discard = True 1495 elif expires <= self._now: 1496 # Expiry date in past is request to delete cookie. This can't be 1497 # in DefaultCookiePolicy, because can't delete cookies there. 1498 try: 1499 self.clear(domain, path, name) 1500 except KeyError: 1501 pass 1502 debug("Expiring cookie, domain='%s', path='%s', name='%s'", 1503 domain, path, name) 1504 return None 1505 1506 return Cookie(version, 1507 name, value, 1508 port, port_specified, 1509 domain, domain_specified, domain_initial_dot, 1510 path, path_specified, 1511 secure, 1512 expires, 1513 discard, 1514 comment, 1515 comment_url, 1516 rest) 1517 1518 def _cookies_from_attrs_set(self, attrs_set, request): 1519 cookie_tuples = self._normalized_cookie_tuples(attrs_set) 1520 1521 cookies = [] 1522 for tup in cookie_tuples: 1523 cookie = self._cookie_from_cookie_tuple(tup, request) 1524 if cookie: cookies.append(cookie) 1525 return cookies 1526 1527 def _process_rfc2109_cookies(self, cookies): 1528 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None) 1529 if rfc2109_as_ns is None: 1530 rfc2109_as_ns = not self._policy.rfc2965 1531 for cookie in cookies: 1532 if cookie.version == 1: 1533 cookie.rfc2109 = True 1534 if rfc2109_as_ns: 1535 # treat 2109 cookies as Netscape cookies rather than 1536 # as RFC2965 cookies 1537 cookie.version = 0 1538 1539 def make_cookies(self, response, request): 1540 """Return sequence of Cookie objects extracted from response object.""" 1541 # get 
cookie-attributes for RFC 2965 and Netscape protocols 1542 headers = response.info() 1543 rfc2965_hdrs = headers.getheaders("Set-Cookie2") 1544 ns_hdrs = headers.getheaders("Set-Cookie") 1545 1546 rfc2965 = self._policy.rfc2965 1547 netscape = self._policy.netscape 1548 1549 if ((not rfc2965_hdrs and not ns_hdrs) or 1550 (not ns_hdrs and not rfc2965) or 1551 (not rfc2965_hdrs and not netscape) or 1552 (not netscape and not rfc2965)): 1553 return [] # no relevant cookie headers: quick exit 1554 1555 try: 1556 cookies = self._cookies_from_attrs_set( 1557 split_header_words(rfc2965_hdrs), request) 1558 except: 1559 reraise_unmasked_exceptions() 1560 cookies = [] 1561 1562 if ns_hdrs and netscape: 1563 try: 1564 # RFC 2109 and Netscape cookies 1565 ns_cookies = self._cookies_from_attrs_set( 1566 parse_ns_headers(ns_hdrs), request) 1567 except: 1568 reraise_unmasked_exceptions() 1569 ns_cookies = [] 1570 self._process_rfc2109_cookies(ns_cookies) 1571 1572 # Look for Netscape cookies (from Set-Cookie headers) that match 1573 # corresponding RFC 2965 cookies (from Set-Cookie2 headers). 1574 # For each match, keep the RFC 2965 cookie and ignore the Netscape 1575 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are 1576 # bundled in with the Netscape cookies for this purpose, which is 1577 # reasonable behaviour. 
1578 if rfc2965: 1579 lookup = {} 1580 for cookie in cookies: 1581 lookup[(cookie.domain, cookie.path, cookie.name)] = None 1582 1583 def no_matching_rfc2965(ns_cookie, lookup=lookup): 1584 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name 1585 return key not in lookup 1586 ns_cookies = filter(no_matching_rfc2965, ns_cookies) 1587 1588 if ns_cookies: 1589 cookies.extend(ns_cookies) 1590 1591 return cookies 1592 1593 def set_cookie_if_ok(self, cookie, request): 1594 """Set a cookie if policy says it's OK to do so.""" 1595 self._cookies_lock.acquire() 1596 self._policy._now = self._now = int(time.time()) 1597 1598 if self._policy.set_ok(cookie, request): 1599 self.set_cookie(cookie) 1600 1601 self._cookies_lock.release() 1602 1603 def set_cookie(self, cookie): 1604 """Set a cookie, without checking whether or not it should be set.""" 1605 c = self._cookies 1606 self._cookies_lock.acquire() 1607 try: 1608 if cookie.domain not in c: c[cookie.domain] = {} 1609 c2 = c[cookie.domain] 1610 if cookie.path not in c2: c2[cookie.path] = {} 1611 c3 = c2[cookie.path] 1612 c3[cookie.name] = cookie 1613 finally: 1614 self._cookies_lock.release() 1615 1616 def extract_cookies(self, response, request): 1617 """Extract cookies from response, where allowable given the request.""" 1618 debug("extract_cookies: %s", response.info()) 1619 self._cookies_lock.acquire() 1620 self._policy._now = self._now = int(time.time()) 1621 1622 for cookie in self.make_cookies(response, request): 1623 if self._policy.set_ok(cookie, request): 1624 debug(" setting cookie: %s", cookie) 1625 self.set_cookie(cookie) 1626 self._cookies_lock.release() 1627 1628 def clear(self, domain=None, path=None, name=None): 1629 """Clear some cookies. 1630 1631 Invoking this method without arguments will clear all cookies. If 1632 given a single argument, only cookies belonging to that domain will be 1633 removed. If given two arguments, cookies belonging to the specified 1634 path within that domain are removed. 
If given three arguments, then 1635 the cookie with the specified name, path and domain is removed. 1636 1637 Raises KeyError if no matching cookie exists. 1638 1639 """ 1640 if name is not None: 1641 if (domain is None) or (path is None): 1642 raise ValueError( 1643 "domain and path must be given to remove a cookie by name") 1644 del self._cookies[domain][path][name] 1645 elif path is not None: 1646 if domain is None: 1647 raise ValueError( 1648 "domain must be given to remove cookies by path") 1649 del self._cookies[domain][path] 1650 elif domain is not None: 1651 del self._cookies[domain] 1652 else: 1653 self._cookies = {} 1654 1655 def clear_session_cookies(self): 1656 """Discard all session cookies. 1657 1658 Note that the .save() method won't save session cookies anyway, unless 1659 you ask otherwise by passing a true ignore_discard argument. 1660 1661 """ 1662 self._cookies_lock.acquire() 1663 for cookie in self: 1664 if cookie.discard: 1665 self.clear(cookie.domain, cookie.path, cookie.name) 1666 self._cookies_lock.release() 1667 1668 def clear_expired_cookies(self): 1669 """Discard all expired cookies. 1670 1671 You probably don't need to call this method: expired cookies are never 1672 sent back to the server (provided you're using DefaultCookiePolicy), 1673 this method is called by CookieJar itself every so often, and the 1674 .save() method won't save expired cookies anyway (unless you ask 1675 otherwise by passing a true ignore_expires argument). 
1676 1677 """ 1678 self._cookies_lock.acquire() 1679 now = time.time() 1680 for cookie in self: 1681 if cookie.is_expired(now): 1682 self.clear(cookie.domain, cookie.path, cookie.name) 1683 self._cookies_lock.release() 1684 1685 def __iter__(self): 1686 return deepvalues(self._cookies) 1687 1688 def __len__(self): 1689 """Return number of contained cookies.""" 1690 i = 0 1691 for cookie in self: i = i + 1 1692 return i 1693 1694 def __repr__(self): 1695 r = [] 1696 for cookie in self: r.append(repr(cookie)) 1697 return "<%s[%s]>" % (self.__class__, ", ".join(r)) 1698 1699 def __str__(self): 1700 r = [] 1701 for cookie in self: r.append(str(cookie)) 1702 return "<%s[%s]>" % (self.__class__, ", ".join(r)) 1703 1704 1705# derives from IOError for backwards-compatibility with Python 2.4.0 1706class LoadError(IOError): pass 1707 1708class FileCookieJar(CookieJar): 1709 """CookieJar that can be loaded from and saved to a file.""" 1710 1711 def __init__(self, filename=None, delayload=False, policy=None): 1712 """ 1713 Cookies are NOT loaded from the named file until either the .load() or 1714 .revert() method is called. 
1715 1716 """ 1717 CookieJar.__init__(self, policy) 1718 if filename is not None: 1719 try: 1720 filename+"" 1721 except: 1722 raise ValueError("filename must be string-like") 1723 self.filename = filename 1724 self.delayload = bool(delayload) 1725 1726 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 1727 """Save cookies to a file.""" 1728 raise NotImplementedError() 1729 1730 def load(self, filename=None, ignore_discard=False, ignore_expires=False): 1731 """Load cookies from a file.""" 1732 if filename is None: 1733 if self.filename is not None: filename = self.filename 1734 else: raise ValueError(MISSING_FILENAME_TEXT) 1735 1736 f = open(filename) 1737 try: 1738 self._really_load(f, filename, ignore_discard, ignore_expires) 1739 finally: 1740 f.close() 1741 1742 def revert(self, filename=None, 1743 ignore_discard=False, ignore_expires=False): 1744 """Clear all cookies and reload cookies from a saved file. 1745 1746 Raises LoadError (or IOError) if reversion is not successful; the 1747 object's state will not be altered if this happens. 1748 1749 """ 1750 if filename is None: 1751 if self.filename is not None: filename = self.filename 1752 else: raise ValueError(MISSING_FILENAME_TEXT) 1753 1754 self._cookies_lock.acquire() 1755 1756 old_state = copy.deepcopy(self._cookies) 1757 self._cookies = {} 1758 try: 1759 self.load(filename, ignore_discard, ignore_expires) 1760 except (LoadError, IOError): 1761 self._cookies = old_state 1762 raise 1763 1764 self._cookies_lock.release() 1765 1766from _LWPCookieJar import LWPCookieJar, lwp_cookie_str 1767from _MozillaCookieJar import MozillaCookieJar 1768