1from __future__ import generators
2"""
3httplib2
4
5A caching http interface that supports ETags and gzip
6to conserve bandwidth.
7
8Requires Python 2.3 or later
9
10Changelog:
112007-08-18, Rick: Modified so it's able to use a socks proxy if needed.
12
13"""
14
15__author__ = "Joe Gregorio (joe@bitworking.org)"
16__copyright__ = "Copyright 2006, Joe Gregorio"
17__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
18    "James Antill",
19    "Xavier Verges Farrero",
20    "Jonathan Feinberg",
21    "Blair Zajac",
22    "Sam Ruby",
23    "Louis Nyffenegger"]
24__license__ = "MIT"
25__version__ = "$Rev$"
26
27import re
28import sys
29import email
30import email.Utils
31import email.Message
32import email.FeedParser
33import StringIO
34import gzip
35import zlib
36import httplib
37import urlparse
38import base64
39import os
40import copy
41import calendar
42import time
43import random
# Avoid the deprecation warning for the sha and md5 modules in Python 2.6
45try:
46    from hashlib import sha1 as _sha, md5 as _md5
47except ImportError:
48    import sha
49    import md5
50    _sha = sha.new
51    _md5 = md5.new
52import hmac
53from gettext import gettext as _
54import socket
55
56try:
57    import socks
58except ImportError:
59    socks = None
60
61# Build the appropriate socket wrapper for ssl
try:
    import ssl # python 2.6
    # The ssl module is available: use its wrap_socket directly.
    _ssl_wrap_socket = ssl.wrap_socket
except ImportError:
    # No ssl module: fall back to socket.ssl and wrap the result in
    # httplib.FakeSocket.
    def _ssl_wrap_socket(sock, key_file, cert_file):
        """Wrap sock for SSL using key_file and cert_file.

        Returns an httplib.FakeSocket built from sock and the socket.ssl object.
        """
        ssl_sock = socket.ssl(sock, key_file, cert_file)
        return httplib.FakeSocket(sock, ssl_sock)
69
70
if sys.version_info >= (2,3):
    # Use the converter provided by the iri2uri module.
    from iri2uri import iri2uri
else:
    def iri2uri(uri):
        """Fallback used on older interpreters: return uri unchanged."""
        return uri
76
def has_timeout(timeout): # python 2.6
    """Tell whether timeout is an explicitly requested socket timeout.

    None never counts. Where the socket module defines
    _GLOBAL_DEFAULT_TIMEOUT, that sentinel object does not count either.
    """
    if timeout is None:
        return False
    if hasattr(socket, '_GLOBAL_DEFAULT_TIMEOUT'):
        return timeout is not socket._GLOBAL_DEFAULT_TIMEOUT
    return True
81
82__all__ = ['Http', 'Response', 'ProxyInfo', 'HttpLib2Error',
83  'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
84  'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
85  'debuglevel']
86
87
88# The httplib debug level, set to a non-zero value to get debug output
89debuglevel = 0
90
91
92# Python 2.3 support
if sys.version_info < (2,4):
    def sorted(seq):
        """Backport of the sorted() builtin for Python 2.3.

        Returns a new sorted list built from seq. Like the builtin, it
        accepts any iterable and leaves the argument untouched.
        """
        # Copy first: sorting seq in place would surprise callers that
        # expect the builtin's non-mutating behaviour.
        result = list(seq)
        result.sort()
        return result
97
98# Python 2.3 support
def HTTPResponse__getheaders(self):
    """Return the response headers as (header, value) tuples.

    Raises httplib.ResponseNotReady when self.msg is None.
    """
    message = self.msg
    if message is not None:
        return message.items()
    raise httplib.ResponseNotReady()
104
105if not hasattr(httplib.HTTPResponse, 'getheaders'):
106    httplib.HTTPResponse.getheaders = HTTPResponse__getheaders
107
# All exceptions raised here derive from HttpLib2Error
class HttpLib2Error(Exception): pass

# Some exceptions can be caught and optionally
# be turned back into responses.
class HttpLib2ErrorWithResponse(HttpLib2Error):
    """An HttpLib2Error that also carries a response and its content.

    desc is the error message. response and content are stored on the
    instance as .response and .content; both default to None so the
    exception can also be raised with just a message.
    """
    def __init__(self, desc, response=None, content=None):
        self.response = response
        self.content = content
        HttpLib2Error.__init__(self, desc)
118
# Errors built on HttpLib2ErrorWithResponse: each instance carries the
# response and content it was raised with.
class RedirectMissingLocation(HttpLib2ErrorWithResponse): pass
class RedirectLimit(HttpLib2ErrorWithResponse): pass
class FailedToDecompressContent(HttpLib2ErrorWithResponse): pass
class UnimplementedDigestAuthOptionError(HttpLib2ErrorWithResponse): pass
class UnimplementedHmacDigestAuthOptionError(HttpLib2ErrorWithResponse): pass

# Errors that derive directly from HttpLib2Error and carry only a message.
class RelativeURIError(HttpLib2Error): pass
class ServerNotFoundError(HttpLib2Error): pass
127
128# Open Items:
129# -----------
130# Proxy support
131
132# Are we removing the cached content too soon on PUT (only delete on 200 Maybe?)
133
134# Pluggable cache storage (supports storing the cache in
135#   flat files by default. We need a plug-in architecture
136#   that can support Berkeley DB and Squid)
137
138# == Known Issues ==
139# Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator.
140# Does not handle Cache-Control: max-stale
141# Does not use Age: headers when calculating cache freshness.
142
143
144# The number of redirections to follow before giving up.
145# Note that only GET redirects are automatically followed.
146# Will also honor 301 requests by saving that info and never
147# requesting that URI again.
148DEFAULT_MAX_REDIRECTS = 5
149
150# Which headers are hop-by-hop headers by default
151HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade']
152
153def _get_end2end_headers(response):
154    hopbyhop = list(HOP_BY_HOP)
155    hopbyhop.extend([x.strip() for x in response.get('connection', '').split(',')])
156    return [header for header in response.keys() if header not in hopbyhop]
157
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

def parse_uri(uri):
    """Split a URI with the regex from Appendix B of RFC 3986.

        (scheme, authority, path, query, fragment) = parse_uri(uri)

    Components that are absent from uri come back as None.
    """
    match = URI.match(uri)
    # Groups 2, 4, 5, 7 and 9 are the bare components; the other groups
    # include the delimiters.
    return match.group(2, 4, 5, 7, 9)
167
def urlnorm(uri):
    """Normalize an absolute URI.

    Returns (scheme, authority, request_uri, defrag_uri): scheme and
    authority are lower-cased, request_uri is the path (or "/" when the
    path is empty) plus any query string, and defrag_uri is the whole URI
    without its fragment.

    Raises RelativeURIError when uri has no scheme or no authority.
    """
    (scheme, authority, path, query, fragment) = parse_uri(uri)
    if not (scheme and authority):
        raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri)
    scheme = scheme.lower()
    authority = authority.lower()
    # Could do syntax based normalization of the URI before
    # computing the digest. See Section 6.2.2 of Std 66.
    request_uri = path or "/"
    if query:
        request_uri = "?".join([request_uri, query])
    defrag_uri = scheme + "://" + authority + request_uri
    return scheme, authority, request_uri, defrag_uri
182
183
184# Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/)
185re_url_scheme    = re.compile(r'^\w+://')
186re_slash         = re.compile(r'[?/:|]+')
187
def safename(filename):
    """Build a cache file name from filename.

    The result is the name with its URL scheme removed and runs of the
    characters ?, /, : and | replaced by commas, cut to 200 characters,
    followed by a comma and the MD5 hex digest of the full name.
    """
    try:
        if re_url_scheme.match(filename):
            # Byte strings are decoded first so both kinds of input take
            # the same IDNA encoding step.
            if isinstance(filename,str):
                filename = filename.decode('utf-8')
            filename = filename.encode('idna')
    except UnicodeError:
        pass
    if isinstance(filename,unicode):
        filename=filename.encode('utf-8')
    digest = _md5(filename).hexdigest()
    stripped = re_slash.sub(",", re_url_scheme.sub("", filename))

    # limit length of filename
    return ",".join((stripped[:200], digest))
214
215NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
216def _normalize_headers(headers):
217    return dict([ (key.lower(), NORMALIZE_SPACE.sub(value, ' ').strip())  for (key, value) in headers.iteritems()])
218
219def _parse_cache_control(headers):
220    retval = {}
221    if headers.has_key('cache-control'):
222        parts =  headers['cache-control'].split(',')
223        parts_with_args = [tuple([x.strip().lower() for x in part.split("=", 1)]) for part in parts if -1 != part.find("=")]
224        parts_wo_args = [(name.strip().lower(), 1) for name in parts if -1 == name.find("=")]
225        retval = dict(parts_with_args + parts_wo_args)
226    return retval
227
228# Whether to use a strict mode to parse WWW-Authenticate headers
229# Might lead to bad results in case of ill-formed header value,
230# so disabled by default, falling back to relaxed parsing.
231# Set to true to turn on, usefull for testing servers.
232USE_WWW_AUTH_STRICT_PARSING = 0
233
234# In regex below:
235#    [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+             matches a "token" as defined by HTTP
236#    "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?"    matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space
237# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both:
238#    \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x08\x0A-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?
239WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$")
240WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$")
241UNQUOTE_PAIRS = re.compile(r'\\(.)')
242def _parse_www_authenticate(headers, headername='www-authenticate'):
243    """Returns a dictionary of dictionaries, one dict
244    per auth_scheme."""
245    retval = {}
246    if headers.has_key(headername):
247        authenticate = headers[headername].strip()
248        www_auth = USE_WWW_AUTH_STRICT_PARSING and WWW_AUTH_STRICT or WWW_AUTH_RELAXED
249        while authenticate:
250            # Break off the scheme at the beginning of the line
251            if headername == 'authentication-info':
252                (auth_scheme, the_rest) = ('digest', authenticate)
253            else:
254                (auth_scheme, the_rest) = authenticate.split(" ", 1)
255            # Now loop over all the key value pairs that come after the scheme,
256            # being careful not to roll into the next scheme
257            match = www_auth.search(the_rest)
258            auth_params = {}
259            while match:
260                if match and len(match.groups()) == 3:
261                    (key, value, the_rest) = match.groups()
262                    auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value) # '\\'.join([x.replace('\\', '') for x in value.split('\\\\')])
263                match = www_auth.search(the_rest)
264            retval[auth_scheme.lower()] = auth_params
265            authenticate = the_rest.strip()
266    return retval
267
268
def _entry_disposition(response_headers, request_headers):
    """Determine freshness from the Date, Expires and Cache-Control headers.

    Returns one of the strings "FRESH", "STALE" or "TRANSPARENT".
    When the request carries 'Pragma: no-cache' without a cache-control
    header, request_headers is updated in place with
    'cache-control: no-cache'. A Date header that cannot be parsed
    yields "STALE".

    We don't handle the following:

    1. Cache-Control: max-stale
    2. Age: headers are not used in the calculations.

    Note that this algorithm is simpler than you might think
    because we are operating as a private (non-shared) cache.
    This lets us ignore 's-maxage'. We can also ignore
    'proxy-invalidate' since we aren't a proxy.
    We will never return a stale document as
    fresh as a design decision, and thus the non-implementation
    of 'max-stale'. This also lets us safely ignore 'must-revalidate'
    since we operate as if every server has sent 'must-revalidate'.
    Since we are private we get to ignore both 'public' and
    'private' parameters. We also ignore 'no-transform' since
    we don't do any transformations.
    The 'no-store' parameter is handled at a higher level.
    So the only Cache-Control parameters we look at are:

    no-cache
    only-if-cached
    max-age
    min-fresh
    """

    retval = "STALE"
    cc = _parse_cache_control(request_headers)
    cc_response = _parse_cache_control(response_headers)

    if 'pragma' in request_headers and request_headers['pragma'].lower().find('no-cache') != -1:
        retval = "TRANSPARENT"
        if 'cache-control' not in request_headers:
            request_headers['cache-control'] = 'no-cache'
    elif 'no-cache' in cc:
        retval = "TRANSPARENT"
    elif 'no-cache' in cc_response:
        retval = "STALE"
    elif 'only-if-cached' in cc:
        retval = "FRESH"
    elif 'date' in response_headers:
        parsed_date = email.Utils.parsedate_tz(response_headers['date'])
        if parsed_date is None:
            # parsedate_tz gives None for an unparsable Date; without a
            # usable date the age is unknown, so the entry stays stale.
            return retval
        date = calendar.timegm(parsed_date)
        now = time.time()
        current_age = max(0, now - date)
        if 'max-age' in cc_response:
            try:
                freshness_lifetime = int(cc_response['max-age'])
            except ValueError:
                freshness_lifetime = 0
        elif 'expires' in response_headers:
            expires = email.Utils.parsedate_tz(response_headers['expires'])
            if expires is None:
                freshness_lifetime = 0
            else:
                freshness_lifetime = max(0, calendar.timegm(expires) - date)
        else:
            freshness_lifetime = 0
        # A max-age on the request overrides the lifetime from the response.
        if 'max-age' in cc:
            try:
                freshness_lifetime = int(cc['max-age'])
            except ValueError:
                freshness_lifetime = 0
        # min-fresh is applied by ageing the entry by that many seconds.
        if 'min-fresh' in cc:
            try:
                min_fresh = int(cc['min-fresh'])
            except ValueError:
                min_fresh = 0
            current_age += min_fresh
        if freshness_lifetime > current_age:
            retval = "FRESH"
    return retval
342
343def _decompressContent(response, new_content):
344    content = new_content
345    try:
346        encoding = response.get('content-encoding', None)
347        if encoding in ['gzip', 'deflate']:
348            if encoding == 'gzip':
349                content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
350            if encoding == 'deflate':
351                content = zlib.decompress(content)
352            response['content-length'] = str(len(content))
353            # Record the historical presence of the encoding in a way the won't interfere.
354            response['-content-encoding'] = response['content-encoding']
355            del response['content-encoding']
356    except IOError:
357        content = ""
358        raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
359    return content
360
def _updateCache(request_headers, response_headers, content, cache, cachekey):
    """Store or evict the cache entry for a response.

    Does nothing when cachekey is false. When either side's Cache-Control
    contains 'no-store' the entry is deleted from cache. Otherwise the
    entry written with cache.set is a 'status:' line, the response
    headers (minus status, content-encoding and transfer-encoding) and
    the content. A 304 status is stored as 200.
    """
    if cachekey:
        cc = _parse_cache_control(request_headers)
        cc_response = _parse_cache_control(response_headers)
        if cc.has_key('no-store') or cc_response.has_key('no-store'):
            cache.delete(cachekey)
        else:
            info = email.Message.Message()
            for key, value in response_headers.iteritems():
                if key not in ['status','content-encoding','transfer-encoding']:
                    info[key] = value

            # Add annotations to the cache to indicate what headers
            # are variant for this request.
            vary = response_headers.get('vary', None)
            if vary:
                vary_headers = vary.lower().replace(' ', '').split(',')
                for header in vary_headers:
                    key = '-varied-%s' % header
                    try:
                        info[key] = request_headers[header]
                    except KeyError:
                        pass

            status = response_headers.status
            if status == 304:
                status = 200

            # Write the adjusted status, not the raw one, so a 304 is
            # cached as 200.
            status_header = 'status: %d\r\n' % status

            header_str = info.as_string()

            # Turn every bare CR or LF into a CRLF pair.
            header_str = re.sub("\r(?!\n)|(?<!\r)\n", "\r\n", header_str)
            text = "".join([status_header, header_str, content])

            cache.set(cachekey, text)
397
def _cnonce():
    """Return a client nonce: 16 hex characters.

    The value is the start of the MD5 hex digest of the current
    time.ctime() string combined with 20 random decimal digits.
    """
    # random.choice draws from all ten digits; randrange(0, 9) stops at 8
    # and never produced a '9'.
    digits = [random.choice("0123456789") for i in range(20)]
    dig = _md5("%s:%s" % (time.ctime(), digits)).hexdigest()
    return dig[:16]
401
def _wsse_username_token(cnonce, iso_now, password):
    """Return the base64-encoded SHA-1 digest of cnonce + iso_now + password."""
    material = "%s%s%s" % (cnonce, iso_now, password)
    raw_digest = _sha(material).digest()
    return base64.b64encode(raw_digest).strip()
404
405
# For credentials we need two things, first
# a pool of credentials to try (not necessarily tied to Basic, Digest, etc.)
# Then we also need a list of URIs that have already demanded authentication
# That list is tricky since sub-URIs can take the same auth, or the
# auth scheme may change as you descend the tree.
# So we also need each Auth instance to be able to tell us
# how close to the 'top' it is.
413
class Authentication(object):
    """Base class for the authentication schemes.

    An instance remembers the host and the path of the URI it was created
    for, together with the credentials and the http object it was given.
    """
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        # Only the path component of the request URI is kept.
        self.path = parse_uri(request_uri)[2]
        self.host = host
        self.credentials = credentials
        self.http = http

    def depth(self, request_uri):
        """Count the "/" characters in request_uri after skipping
        len(self.path) leading characters."""
        parse_uri(request_uri)  # parsed as before; the result is not used
        remainder = request_uri[len(self.path):]
        return remainder.count("/")

    def inscope(self, host, request_uri):
        """Tell whether host matches and the path of request_uri starts
        with this instance's path."""
        # XXX Should we normalize the request_uri?
        candidate_path = parse_uri(request_uri)[2]
        same_host = (host == self.host)
        return same_host and candidate_path.startswith(self.path)

    def request(self, method, request_uri, headers, content):
        """Hook for adding the appropriate Authorization header to the
        request headers. Does nothing here; override it in sub-classes."""
        pass

    def response(self, response, content):
        """Hook for updating state (new nonces and such) from the last
        authorized response. Override it in sub-classes if necessary.

        Return True if the request is to be retried, for
        example Digest may return stale=true.
        """
        return False
445
446
447
class BasicAuthentication(Authentication):
    """Authentication that sends a 'Basic' authorization header."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Set headers['authorization'] to 'Basic ' followed by the
        base64 encoding of "name:password"."""
        userpass = "%s:%s" % self.credentials
        token = base64.b64encode(userpass).strip()
        headers['authorization'] = 'Basic ' + token
456
457
class DigestAuthentication(Authentication):
    """Only do qop='auth' and MD5, since that
    is all Apache currently implements"""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        """Read the 'digest' challenge from response's www-authenticate header.

        Raises UnimplementedDigestAuthOptionError (carrying response and
        content) when the challenge does not offer qop 'auth' or asks for
        an algorithm other than MD5.
        """
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['digest']
        qop = self.challenge.get('qop', 'auth')
        # qop-options is a comma separated list (e.g. "auth,auth-int");
        # whitespace separated values are still accepted as before.
        qop_options = [x.strip() for x in qop.replace(',', ' ').split()]
        self.challenge['qop'] = ('auth' in qop_options) and 'auth' or None
        if self.challenge['qop'] is None:
            raise UnimplementedDigestAuthOptionError(_("Unsupported value for qop: %s.") % qop, response, content)
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5').upper()
        if self.challenge['algorithm'] != 'MD5':
            raise UnimplementedDigestAuthOptionError(_("Unsupported value for algorithm: %s.") % self.challenge['algorithm'], response, content)
        self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]])
        self.challenge['nc'] = 1

    def request(self, method, request_uri, headers, content, cnonce = None):
        """Set the Digest authorization header for this request.

        A fresh client nonce is generated unless cnonce is given. The
        nonce count self.challenge['nc'] is incremented after each call.
        """
        H = lambda x: _md5(x).hexdigest()
        KD = lambda s, d: H("%s:%s" % (s, d))
        A2 = "".join([method, ":", request_uri])
        self.challenge['cnonce'] = cnonce or _cnonce()
        request_digest  = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (self.challenge['nonce'],
                    '%08x' % self.challenge['nc'],
                    self.challenge['cnonce'],
                    self.challenge['qop'], H(A2)
                    ))
        # Lower-case key, like BasicAuthentication, so a normalized
        # 'authorization' entry is replaced rather than duplicated.
        headers['authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % (
                self.credentials[0],
                self.challenge['realm'],
                self.challenge['nonce'],
                request_uri,
                self.challenge['algorithm'],
                request_digest,
                self.challenge['qop'],
                self.challenge['nc'],
                self.challenge['cnonce'],
                )
        self.challenge['nc'] += 1

    def response(self, response, content):
        """Update the nonce from the server's response.

        Returns True (retry the request) when a new digest challenge is
        marked stale=true; otherwise False. A 'nextnonce' from an
        authentication-info header replaces the current nonce.
        """
        if 'authentication-info' not in response:
            challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {})
            if 'true' == challenge.get('stale'):
                self.challenge['nonce'] = challenge['nonce']
                self.challenge['nc'] = 1
                return True
        else:
            updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {})

            if 'nextnonce' in updated_challenge:
                self.challenge['nonce'] = updated_challenge['nextnonce']
                self.challenge['nc'] = 1
        return False
513
514
class HmacDigestAuthentication(Authentication):
    """Adapted from Robert Sayre's code and DigestAuthentication above."""
    __author__ = "Thomas Broyer (t.broyer@ltgt.net)"

    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        """Read the 'hmacdigest' challenge from response and derive the key.

        Raises UnimplementedHmacDigestAuthOptionError (carrying response
        and content) when the challenge has no server nonce or names an
        unsupported algorithm or pw-algorithm.
        """
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['hmacdigest']
        # TODO: self.challenge['domain']
        self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
        if self.challenge['reason'] not in ['unauthorized', 'integrity']:
            self.challenge['reason'] = 'unauthorized'
        self.challenge['salt'] = self.challenge.get('salt', '')
        if not self.challenge.get('snonce'):
            raise UnimplementedHmacDigestAuthOptionError(_("The challenge doesn't contain a server nonce, or this one is empty."), response, content)
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1')
        if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']:
            raise UnimplementedHmacDigestAuthOptionError(_("Unsupported value for algorithm: %s.") % self.challenge['algorithm'], response, content)
        self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1')
        if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']:
            raise UnimplementedHmacDigestAuthOptionError(_("Unsupported value for pw-algorithm: %s.") % self.challenge['pw-algorithm'], response, content)
        if self.challenge['algorithm'] == 'HMAC-MD5':
            self.hashmod = _md5
        else:
            self.hashmod = _sha
        if self.challenge['pw-algorithm'] == 'MD5':
            self.pwhashmod = _md5
        else:
            self.pwhashmod = _sha
        # _md5/_sha are hash constructors, not modules: call them directly
        # (they have no .new attribute).
        self.key = "".join([self.credentials[0], ":",
                    self.pwhashmod("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
                    ":", self.challenge['realm']
                    ])
        self.key = self.pwhashmod(self.key).hexdigest().lower()

    def request(self, method, request_uri, headers, content):
        """Set the HMACDigest authorization header for this request.

        The digest covers the method, the request URI, a fresh client
        nonce, the server nonce and the values of the end-to-end headers.
        """
        keys = _get_end2end_headers(headers)
        keylist = "".join(["%s " % k for k in keys])
        headers_val = "".join([headers[k] for k in keys])
        created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
        cnonce = _cnonce()
        request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
        request_digest  = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
        # Lower-case key, like BasicAuthentication, so a normalized
        # 'authorization' entry is replaced rather than duplicated.
        headers['authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
                self.credentials[0],
                self.challenge['realm'],
                self.challenge['snonce'],
                cnonce,
                request_uri,
                created,
                request_digest,
                keylist,
                )

    def response(self, response, content):
        """Return True (retry) when the new hmacdigest challenge gives
        'integrity' or 'stale' as its reason; otherwise False."""
        challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {})
        if challenge.get('reason') in ['integrity', 'stale']:
            return True
        return False
575
576
class WsseAuthentication(Authentication):
    """This is thinly tested and should not be relied upon.
    At this time there isn't any third party server to test against.
    Blogger and TypePad implemented this algorithm at one point
    but Blogger has since switched to Basic over HTTPS and
    TypePad has implemented it wrong, by never issuing a 401
    challenge but instead requiring your client to telepathically know that
    their endpoint is expecting WSSE profile="UsernameToken"."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header, plus an X-WSSE header holding the
        username, password digest, nonce and creation time."""
        # Lower-case key, like BasicAuthentication, so a normalized
        # 'authorization' entry is replaced rather than duplicated.
        headers['authorization'] = 'WSSE profile="UsernameToken"'
        iso_now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        cnonce = _cnonce()
        password_digest = _wsse_username_token(cnonce, iso_now, self.credentials[1])
        headers['X-WSSE'] = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % (
                self.credentials[0],
                password_digest,
                cnonce,
                iso_now)
600
class GoogleLoginAuthentication(Authentication):
    """Authentication that obtains a token from Google's ClientLogin
    endpoint and sends it in a 'GoogleLogin' authorization header."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        """POST the credentials to ClientLogin and keep the returned
        'Auth' value; a 403 reply leaves the token empty."""
        from urllib import urlencode
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        service = challenge['googlelogin'].get('service', 'xapi')
        # Blogger actually returns the service in the challenge
        # For the rest we guess based on the URI
        if service == 'xapi' and  request_uri.find("calendar") > 0:
            service = "cl"
        # No point in guessing Base or Spreadsheet
        #elif request_uri.find("spreadsheets") > 0:
        #    service = "wise"

        form = dict(Email=credentials[0], Passwd=credentials[1], service=service, source=headers['user-agent'])
        resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(form), headers={'Content-Type': 'application/x-www-form-urlencoded'})
        # The reply body is a series of key=value lines.
        pairs = [tuple(line.split("=", 1)) for line in content.split('\n') if line]
        fields = dict(pairs)
        if resp.status != 403:
            self.Auth = fields['Auth']
        else:
            self.Auth = ""

    def request(self, method, request_uri, headers, content):
        """Set headers['authorization'] to the GoogleLogin token."""
        headers['authorization'] = 'GoogleLogin Auth=' + self.Auth
628
629
630AUTH_SCHEME_CLASSES = {
631    "basic": BasicAuthentication,
632    "wsse": WsseAuthentication,
633    "digest": DigestAuthentication,
634    "hmacdigest": HmacDigestAuthentication,
635    "googlelogin": GoogleLoginAuthentication
636}
637
638AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]
639
class FileCache(object):
    """Uses a local directory as a store for cached files.
    Not really safe to use if multiple threads or processes are going to
    be running on the same cache.
    """
    def __init__(self, cache, safe=safename): # use safe=lambda x: md5.new(x).hexdigest() for the old behavior
        """cache is the directory to use (created if missing); safe maps
        a cache key to a file name inside that directory."""
        self.cache = cache
        self.safe = safe
        if not os.path.exists(cache):
            os.makedirs(self.cache)

    def get(self, key):
        """Return the stored bytes for key, or None if they cannot be read."""
        retval = None
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        try:
            f = open(cacheFullPath, "rb")
            # Close the handle even when the read fails.
            try:
                retval = f.read()
            finally:
                f.close()
        except IOError:
            pass
        return retval

    def set(self, key, value):
        """Write value to the file for key, replacing any previous entry."""
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        f = open(cacheFullPath, "wb")
        # Close the handle even when the write fails.
        try:
            f.write(value)
        finally:
            f.close()

    def delete(self, key):
        """Remove the file for key if it exists."""
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        if os.path.exists(cacheFullPath):
            os.remove(cacheFullPath)
672
class Credentials(object):
    """A list of (domain, name, password) entries.

    An entry added with an empty domain applies to every domain.
    """
    def __init__(self):
        self.credentials = []

    def add(self, name, password, domain=""):
        """Append an entry; the domain is stored lower-cased."""
        entry = (domain.lower(), name, password)
        self.credentials.append(entry)

    def clear(self):
        """Forget every entry."""
        self.credentials = []

    def iter(self, domain):
        """Yield (name, password) for each entry whose domain is empty
        or equal to domain."""
        for entry in self.credentials:
            (cdomain, name, password) = entry
            if cdomain != "" and domain != cdomain:
                continue
            yield (name, password)
687
class KeyCerts(Credentials):
    """Same behaviour as Credentials; the name/password
    slots hold a key and a cert instead."""
692
693
class ProxyInfo(object):
    """Collect information required to use a proxy."""
    def __init__(self, proxy_type, proxy_host, proxy_port, proxy_rdns=None, proxy_user=None, proxy_pass=None):
        """Store the proxy settings.

        proxy_type must be one of the socks.PROXY_TYPE_XXX constants,
        for example:

            p = ProxyInfo(proxy_type=socks.PROXY_TYPE_HTTP, proxy_host='localhost', proxy_port=8000)
        """
        self.proxy_type = proxy_type
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port
        self.proxy_rdns = proxy_rdns
        self.proxy_user = proxy_user
        self.proxy_pass = proxy_pass

    def astuple(self):
        """Return (type, host, port, rdns, user, pass) as a tuple."""
        field_names = ('proxy_type', 'proxy_host', 'proxy_port',
                       'proxy_rdns', 'proxy_user', 'proxy_pass')
        return tuple([getattr(self, field) for field in field_names])

    def isgood(self):
        """Truthy when the socks module was imported and both the host
        and the port are set."""
        if not socks:
            return socks
        return (self.proxy_host != None) and (self.proxy_port != None)
710
711
class HTTPConnectionWithTimeout(httplib.HTTPConnection):
    """HTTPConnection subclass that supports timeouts

    Also accepts an optional proxy_info; when it is set and its isgood()
    is true, connect() creates the socket through the socks module.
    """

    def __init__(self, host, port=None, strict=None, timeout=None, proxy_info=None):
        """Initialise the base HTTPConnection, then store timeout and proxy_info."""
        httplib.HTTPConnection.__init__(self, host, port, strict)
        self.timeout = timeout
        self.proxy_info = proxy_info

    def connect(self):
        """Connect to the host and port specified in __init__.

        Each address returned by getaddrinfo is tried in turn until one
        connects. Raises socket.error when none does.
        """
        # Mostly verbatim from httplib.py.
        # msg is what gets raised if the loop below never runs or never succeeds.
        msg = "getaddrinfo returns an empty list"
        for res in socket.getaddrinfo(self.host, self.port, 0,
                socket.SOCK_STREAM):
            af, socktype, proto, canonname, sa = res
            try:
                if self.proxy_info and self.proxy_info.isgood():
                    # Route the connection through the configured proxy.
                    self.sock = socks.socksocket(af, socktype, proto)
                    self.sock.setproxy(*self.proxy_info.astuple())
                else:
                    self.sock = socket.socket(af, socktype, proto)
                # Different from httplib: support timeouts.
                if has_timeout(self.timeout):
                    self.sock.settimeout(self.timeout)
                    # End of difference from httplib.
                if self.debuglevel > 0:
                    print "connect: (%s, %s)" % (self.host, self.port)

                self.sock.connect(sa)
            except socket.error, msg:
                # This address failed: clean up and try the next one. msg now
                # holds the most recent error for the final raise.
                if self.debuglevel > 0:
                    print 'connect fail:', (self.host, self.port)
                if self.sock:
                    self.sock.close()
                self.sock = None
                continue
            # Connected: stop trying further addresses.
            break
        if not self.sock:
            raise socket.error, msg
751
752class HTTPSConnectionWithTimeout(httplib.HTTPSConnection):
753    "This class allows communication via SSL."
754
755    def __init__(self, host, port=None, key_file=None, cert_file=None,
756                 strict=None, timeout=None, proxy_info=None):
757        httplib.HTTPSConnection.__init__(self, host, port=port, key_file=key_file,
758                cert_file=cert_file, strict=strict)
759        self.timeout = timeout
760        self.proxy_info = proxy_info
761
762    def connect(self):
763        "Connect to a host on a given (SSL) port."
764
765        if self.proxy_info and self.proxy_info.isgood():
766            sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
767            sock.setproxy(*self.proxy_info.astuple())
768        else:
769            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
770
771        if has_timeout(self.timeout):
772            sock.settimeout(self.timeout)
773        sock.connect((self.host, self.port))
774        self.sock =_ssl_wrap_socket(sock, self.key_file, self.cert_file)
775
776
777
778class Http(object):
779    """An HTTP client that handles:
780- all methods
781- caching
782- ETags
783- compression,
784- HTTPS
785- Basic
786- Digest
787- WSSE
788
789and more.
790    """
791    def __init__(self, cache=None, timeout=None, proxy_info=None):
792        """The value of proxy_info is a ProxyInfo instance.
793
794If 'cache' is a string then it is used as a directory name
795for a disk cache. Otherwise it must be an object that supports
796the same interface as FileCache."""
797        self.proxy_info = proxy_info
798        # Map domain name to an httplib connection
799        self.connections = {}
800        # The location of the cache, for now a directory
801        # where cached responses are held.
802        if cache and isinstance(cache, str):
803            self.cache = FileCache(cache)
804        else:
805            self.cache = cache
806
807        # Name/password
808        self.credentials = Credentials()
809
810        # Key/cert
811        self.certificates = KeyCerts()
812
813        # authorization objects
814        self.authorizations = []
815
816        # If set to False then no redirects are followed, even safe ones.
817        self.follow_redirects = True
818
819        # Which HTTP methods do we apply optimistic concurrency to, i.e.
820        # which methods get an "if-match:" etag header added to them.
821        self.optimistic_concurrency_methods = ["PUT"]
822
823        # If 'follow_redirects' is True, and this is set to True then
824        # all redirecs are followed, including unsafe ones.
825        self.follow_all_redirects = False
826
827        self.ignore_etag = False
828
829        self.force_exception_to_status_code = False
830
831        self.timeout = timeout
832
833    def _auth_from_challenge(self, host, request_uri, headers, response, content):
834        """A generator that creates Authorization objects
835           that can be applied to requests.
836        """
837        challenges = _parse_www_authenticate(response, 'www-authenticate')
838        for cred in self.credentials.iter(host):
839            for scheme in AUTH_SCHEME_ORDER:
840                if challenges.has_key(scheme):
841                    yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self)
842
843    def add_credentials(self, name, password, domain=""):
844        """Add a name and password that will be used
845        any time a request requires authentication."""
846        self.credentials.add(name, password, domain)
847
848    def add_certificate(self, key, cert, domain):
849        """Add a key and cert that will be used
850        any time a request requires authentication."""
851        self.certificates.add(key, cert, domain)
852
853    def clear_credentials(self):
854        """Remove all the names and passwords
855        that are used for authentication"""
856        self.credentials.clear()
857        self.authorizations = []
858
859    def _conn_request(self, conn, request_uri, method, body, headers):
860        for i in range(2):
861            try:
862                conn.request(method, request_uri, body, headers)
863            except socket.gaierror:
864                conn.close()
865                raise ServerNotFoundError("Unable to find the server at %s" % conn.host)
866            except (socket.error, httplib.HTTPException):
867                # Just because the server closed the connection doesn't apparently mean
868                # that the server didn't send a response.
869                pass
870            try:
871                response = conn.getresponse()
872            except (socket.error, httplib.HTTPException):
873                if i == 0:
874                    conn.close()
875                    conn.connect()
876                    continue
877                else:
878                    raise
879            else:
880                content = ""
881                if method == "HEAD":
882                    response.close()
883                else:
884                    content = response.read()
885                response = Response(response)
886                if method != "HEAD":
887                    content = _decompressContent(response, content)
888            break
889        return (response, content)
890
891
892    def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey):
893        """Do the actual request using the connection object
894        and also follow one level of redirects if necessary"""
895
896        auths = [(auth.depth(request_uri), auth) for auth in self.authorizations if auth.inscope(host, request_uri)]
897        auth = auths and sorted(auths)[0][1] or None
898        if auth:
899            auth.request(method, request_uri, headers, body)
900
901        (response, content) = self._conn_request(conn, request_uri, method, body, headers)
902
903        if auth:
904            if auth.response(response, body):
905                auth.request(method, request_uri, headers, body)
906                (response, content) = self._conn_request(conn, request_uri, method, body, headers )
907                response._stale_digest = 1
908
909        if response.status == 401:
910            for authorization in self._auth_from_challenge(host, request_uri, headers, response, content):
911                authorization.request(method, request_uri, headers, body)
912                (response, content) = self._conn_request(conn, request_uri, method, body, headers, )
913                if response.status != 401:
914                    self.authorizations.append(authorization)
915                    authorization.response(response, body)
916                    break
917
918        if (self.follow_all_redirects or (method in ["GET", "HEAD"]) or response.status == 303):
919            if self.follow_redirects and response.status in [300, 301, 302, 303, 307]:
920                # Pick out the location header and basically start from the beginning
921                # remembering first to strip the ETag header and decrement our 'depth'
922                if redirections:
923                    if not response.has_key('location') and response.status != 300:
924                        raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."), response, content)
925                    # Fix-up relative redirects (which violate an RFC 2616 MUST)
926                    if response.has_key('location'):
927                        location = response['location']
928                        (scheme, authority, path, query, fragment) = parse_uri(location)
929                        if authority == None:
930                            response['location'] = urlparse.urljoin(absolute_uri, location)
931                    if response.status == 301 and method in ["GET", "HEAD"]:
932                        response['-x-permanent-redirect-url'] = response['location']
933                        if not response.has_key('content-location'):
934                            response['content-location'] = absolute_uri
935                        _updateCache(headers, response, content, self.cache, cachekey)
936                    if headers.has_key('if-none-match'):
937                        del headers['if-none-match']
938                    if headers.has_key('if-modified-since'):
939                        del headers['if-modified-since']
940                    if response.has_key('location'):
941                        location = response['location']
942                        old_response = copy.deepcopy(response)
943                        if not old_response.has_key('content-location'):
944                            old_response['content-location'] = absolute_uri
945                        redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
946                        (response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
947                        response.previous = old_response
948                else:
949                    raise RedirectLimit( _("Redirected more times than rediection_limit allows."), response, content)
950            elif response.status in [200, 203] and method == "GET":
951                # Don't cache 206's since we aren't going to handle byte range requests
952                if not response.has_key('content-location'):
953                    response['content-location'] = absolute_uri
954                _updateCache(headers, response, content, self.cache, cachekey)
955
956        return (response, content)
957
958    def _normalize_headers(self, headers):
959        return _normalize_headers(headers)
960
961# Need to catch and rebrand some exceptions
962# Then need to optionally turn all exceptions into status codes
963# including all socket.* and httplib.* exceptions.
964
965
966    def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None):
967        """ Performs a single HTTP request.
968The 'uri' is the URI of the HTTP resource and can begin
969with either 'http' or 'https'. The value of 'uri' must be an absolute URI.
970
971The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
972There is no restriction on the methods allowed.
973
974The 'body' is the entity body to be sent with the request. It is a string
975object.
976
977Any extra headers that are to be sent with the request should be provided in the
978'headers' dictionary.
979
980The maximum number of redirect to follow before raising an
981exception is 'redirections. The default is 5.
982
983The return value is a tuple of (response, content), the first
984being and instance of the 'Response' class, the second being
985a string that contains the response entity body.
986        """
987        try:
988            if headers is None:
989                headers = {}
990            else:
991                headers = self._normalize_headers(headers)
992
993            if not headers.has_key('user-agent'):
994                headers['user-agent'] = "Python-httplib2/%s" % __version__
995
996            uri = iri2uri(uri)
997
998            (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
999            domain_port = authority.split(":")[0:2]
1000            if len(domain_port) == 2 and domain_port[1] == '443' and scheme == 'http':
1001                scheme = 'https'
1002                authority = domain_port[0]
1003
1004            conn_key = scheme+":"+authority
1005            if conn_key in self.connections:
1006                conn = self.connections[conn_key]
1007            else:
1008                if not connection_type:
1009                    connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout
1010                certs = list(self.certificates.iter(authority))
1011                if scheme == 'https' and certs:
1012                    conn = self.connections[conn_key] = connection_type(authority, key_file=certs[0][0],
1013                        cert_file=certs[0][1], timeout=self.timeout, proxy_info=self.proxy_info)
1014                else:
1015                    conn = self.connections[conn_key] = connection_type(authority, timeout=self.timeout, proxy_info=self.proxy_info)
1016                conn.set_debuglevel(debuglevel)
1017
1018            if method in ["GET", "HEAD"] and 'range' not in headers and 'accept-encoding' not in headers:
1019                headers['accept-encoding'] = 'gzip, deflate'
1020
1021            info = email.Message.Message()
1022            cached_value = None
1023            if self.cache:
1024                cachekey = defrag_uri
1025                cached_value = self.cache.get(cachekey)
1026                if cached_value:
1027                    # info = email.message_from_string(cached_value)
1028                    #
1029                    # Need to replace the line above with the kludge below
1030                    # to fix the non-existent bug not fixed in this
1031                    # bug report: http://mail.python.org/pipermail/python-bugs-list/2005-September/030289.html
1032                    try:
1033                        info, content = cached_value.split('\r\n\r\n', 1)
1034                        feedparser = email.FeedParser.FeedParser()
1035                        feedparser.feed(info)
1036                        info = feedparser.close()
1037                        feedparser._parse = None
1038                    except IndexError:
1039                        self.cache.delete(cachekey)
1040                        cachekey = None
1041                        cached_value = None
1042            else:
1043                cachekey = None
1044
1045            if method in self.optimistic_concurrency_methods and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
1046                # http://www.w3.org/1999/04/Editing/
1047                headers['if-match'] = info['etag']
1048
1049            if method not in ["GET", "HEAD"] and self.cache and cachekey:
1050                # RFC 2616 Section 13.10
1051                self.cache.delete(cachekey)
1052
1053            # Check the vary header in the cache to see if this request
1054            # matches what varies in the cache.
1055            if method in ['GET', 'HEAD'] and 'vary' in info:
1056                vary = info['vary']
1057                vary_headers = vary.lower().replace(' ', '').split(',')
1058                for header in vary_headers:
1059                    key = '-varied-%s' % header
1060                    value = info[key]
1061                    if headers.get(header, '') != value:
1062                            cached_value = None
1063                            break
1064
1065            if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
1066                if info.has_key('-x-permanent-redirect-url'):
1067                    # Should cached permanent redirects be counted in our redirection count? For now, yes.
1068                    (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
1069                    response.previous = Response(info)
1070                    response.previous.fromcache = True
1071                else:
1072                    # Determine our course of action:
1073                    #   Is the cached entry fresh or stale?
1074                    #   Has the client requested a non-cached response?
1075                    #
1076                    # There seems to be three possible answers:
1077                    # 1. [FRESH] Return the cache entry w/o doing a GET
1078                    # 2. [STALE] Do the GET (but add in cache validators if available)
1079                    # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
1080                    entry_disposition = _entry_disposition(info, headers)
1081
1082                    if entry_disposition == "FRESH":
1083                        if not cached_value:
1084                            info['status'] = '504'
1085                            content = ""
1086                        response = Response(info)
1087                        if cached_value:
1088                            response.fromcache = True
1089                        return (response, content)
1090
1091                    if entry_disposition == "STALE":
1092                        if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
1093                            headers['if-none-match'] = info['etag']
1094                        if info.has_key('last-modified') and not 'last-modified' in headers:
1095                            headers['if-modified-since'] = info['last-modified']
1096                    elif entry_disposition == "TRANSPARENT":
1097                        pass
1098
1099                    (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
1100
1101                if response.status == 304 and method == "GET":
1102                    # Rewrite the cache entry with the new end-to-end headers
1103                    # Take all headers that are in response
1104                    # and overwrite their values in info.
1105                    # unless they are hop-by-hop, or are listed in the connection header.
1106
1107                    for key in _get_end2end_headers(response):
1108                        info[key] = response[key]
1109                    merged_response = Response(info)
1110                    if hasattr(response, "_stale_digest"):
1111                        merged_response._stale_digest = response._stale_digest
1112                    _updateCache(headers, merged_response, content, self.cache, cachekey)
1113                    response = merged_response
1114                    response.status = 200
1115                    response.fromcache = True
1116
1117                elif response.status == 200:
1118                    content = new_content
1119                else:
1120                    self.cache.delete(cachekey)
1121                    content = new_content
1122            else:
1123                cc = _parse_cache_control(headers)
1124                if cc.has_key('only-if-cached'):
1125                    info['status'] = '504'
1126                    response = Response(info)
1127                    content = ""
1128                else:
1129                    (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
1130        except Exception, e:
1131            if self.force_exception_to_status_code:
1132                if isinstance(e, HttpLib2ErrorWithResponse):
1133                    response = e.response
1134                    content = e.content
1135                    response.status = 500
1136                    response.reason = str(e)
1137                elif isinstance(e, socket.timeout):
1138                    content = "Request Timeout"
1139                    response = Response( {
1140                            "content-type": "text/plain",
1141                            "status": "408",
1142                            "content-length": len(content)
1143                            })
1144                    response.reason = "Request Timeout"
1145                else:
1146                    content = str(e)
1147                    response = Response( {
1148                            "content-type": "text/plain",
1149                            "status": "400",
1150                            "content-length": len(content)
1151                            })
1152                    response.reason = "Bad Request"
1153            else:
1154                raise
1155
1156
1157        return (response, content)
1158
1159
1160
1161class Response(dict):
1162    """An object more like email.Message than httplib.HTTPResponse."""
1163
1164    """Is this response from our local cache"""
1165    fromcache = False
1166
1167    """HTTP protocol version used by server. 10 for HTTP/1.0, 11 for HTTP/1.1. """
1168    version = 11
1169
1170    "Status code returned by server. "
1171    status = 200
1172
1173    """Reason phrase returned by server."""
1174    reason = "Ok"
1175
1176    previous = None
1177
1178    def __init__(self, info):
1179        # info is either an email.Message or
1180        # an httplib.HTTPResponse object.
1181        if isinstance(info, httplib.HTTPResponse):
1182            for key, value in info.getheaders():
1183                self[key.lower()] = value
1184            self.status = info.status
1185            self['status'] = str(self.status)
1186            self.reason = info.reason
1187            self.version = info.version
1188        elif isinstance(info, email.Message.Message):
1189            for key, value in info.items():
1190                self[key] = value
1191            self.status = int(self['status'])
1192        else:
1193            for key, value in info.iteritems():
1194                self[key] = value
1195            self.status = int(self.get('status', self.status))
1196
1197
1198    def __getattr__(self, name):
1199        if name == 'dict':
1200            return self
1201        else:
1202            raise AttributeError, name
1203