urllib.py revision 84040dbe8170864ba673321ec7568974bdabf5a4
1"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol.  All you know is that is has methods read(), readline(),
18readlines(), fileno(), close() and info().  The read*(), fileno()
19and close() methods work like those of open files.
20The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
24
25import string
26import socket
27import os
28import time
29import sys
30from urlparse import urljoin as basejoin
31import warnings
32
33__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
34           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
35           "urlencode", "url2pathname", "pathname2url", "splittag",
36           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
37           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
38           "splitnport", "splitquery", "splitattr", "splitvalue",
39           "getproxies"]
40
41__version__ = '1.17'    # XXX This version is not always updated :-(
42
43MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
44
45# Helper for non-unix systems
46if os.name == 'mac':
47    from macurl2path import url2pathname, pathname2url
48elif os.name == 'nt':
49    from nturl2path import url2pathname, pathname2url
50elif os.name == 'riscos':
51    from rourl2path import url2pathname, pathname2url
52else:
53    def url2pathname(pathname):
54        """OS-specific conversion from a relative URL of the 'file' scheme
55        to a file system path; not recommended for general use."""
56        return unquote(pathname)
57
58    def pathname2url(pathname):
59        """OS-specific conversion from a file system path to a relative URL
60        of the 'file' scheme; not recommended for general use."""
61        return quote(pathname)
62
63# This really consists of two pieces:
64# (1) a class which handles opening of all sorts of URLs
65#     (plus assorted utilities etc.)
66# (2) a set of functions for parsing URLs
67# XXX Should these be separated out into different modules?
68
69
70# Shortcut for basic usage
71_urlopener = None
72def urlopen(url, data=None, proxies=None):
73    """Create a file-like object for the specified URL to read from."""
74    from warnings import warnpy3k
75    warnings.warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
76                        "favor of urllib2.urlopen()", stacklevel=2)
77
78    global _urlopener
79    if proxies is not None:
80        opener = FancyURLopener(proxies=proxies)
81    elif not _urlopener:
82        opener = FancyURLopener()
83        _urlopener = opener
84    else:
85        opener = _urlopener
86    if data is None:
87        return opener.open(url)
88    else:
89        return opener.open(url, data)
90def urlretrieve(url, filename=None, reporthook=None, data=None):
91    global _urlopener
92    if not _urlopener:
93        _urlopener = FancyURLopener()
94    return _urlopener.retrieve(url, filename, reporthook, data)
95def urlcleanup():
96    if _urlopener:
97        _urlopener.cleanup()
98
99# check for SSL
100try:
101    import ssl
102except:
103    _have_ssl = False
104else:
105    _have_ssl = True
106
107# exception raised when downloaded size does not match content-length
108class ContentTooShortError(IOError):
109    def __init__(self, message, content):
110        IOError.__init__(self, message)
111        self.content = content
112
113ftpcache = {}
114class URLopener:
115    """Class to open URLs.
116    This is a class rather than just a subroutine because we may need
117    more than one set of global protocol-specific options.
118    Note -- this is a base class for those who don't want the
119    automatic handling of errors type 302 (relocated) and 401
120    (authorization needed)."""
121
122    __tempfiles = None
123
124    version = "Python-urllib/%s" % __version__
125
126    # Constructor
127    def __init__(self, proxies=None, **x509):
128        if proxies is None:
129            proxies = getproxies()
130        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
131        self.proxies = proxies
132        self.key_file = x509.get('key_file')
133        self.cert_file = x509.get('cert_file')
134        self.addheaders = [('User-Agent', self.version)]
135        self.__tempfiles = []
136        self.__unlink = os.unlink # See cleanup()
137        self.tempcache = None
138        # Undocumented feature: if you assign {} to tempcache,
139        # it is used to cache files retrieved with
140        # self.retrieve().  This is not enabled by default
141        # since it does not work for changing documents (and I
142        # haven't got the logic to check expiration headers
143        # yet).
144        self.ftpcache = ftpcache
145        # Undocumented feature: you can use a different
146        # ftp cache by assigning to the .ftpcache member;
147        # in case you want logically independent URL openers
148        # XXX This is not threadsafe.  Bah.
149
150    def __del__(self):
151        self.close()
152
153    def close(self):
154        self.cleanup()
155
156    def cleanup(self):
157        # This code sometimes runs when the rest of this module
158        # has already been deleted, so it can't use any globals
159        # or import anything.
160        if self.__tempfiles:
161            for file in self.__tempfiles:
162                try:
163                    self.__unlink(file)
164                except OSError:
165                    pass
166            del self.__tempfiles[:]
167        if self.tempcache:
168            self.tempcache.clear()
169
170    def addheader(self, *args):
171        """Add a header to be used by the HTTP interface only
172        e.g. u.addheader('Accept', 'sound/basic')"""
173        self.addheaders.append(args)
174
175    # External interface
176    def open(self, fullurl, data=None):
177        """Use URLopener().open(file) instead of open(file, 'r')."""
178        fullurl = unwrap(toBytes(fullurl))
179        if self.tempcache and fullurl in self.tempcache:
180            filename, headers = self.tempcache[fullurl]
181            fp = open(filename, 'rb')
182            return addinfourl(fp, headers, fullurl)
183        urltype, url = splittype(fullurl)
184        if not urltype:
185            urltype = 'file'
186        if urltype in self.proxies:
187            proxy = self.proxies[urltype]
188            urltype, proxyhost = splittype(proxy)
189            host, selector = splithost(proxyhost)
190            url = (host, fullurl) # Signal special case to open_*()
191        else:
192            proxy = None
193        name = 'open_' + urltype
194        self.type = urltype
195        name = name.replace('-', '_')
196        if not hasattr(self, name):
197            if proxy:
198                return self.open_unknown_proxy(proxy, fullurl, data)
199            else:
200                return self.open_unknown(fullurl, data)
201        try:
202            if data is None:
203                return getattr(self, name)(url)
204            else:
205                return getattr(self, name)(url, data)
206        except socket.error, msg:
207            raise IOError, ('socket error', msg), sys.exc_info()[2]
208
209    def open_unknown(self, fullurl, data=None):
210        """Overridable interface to open unknown URL type."""
211        type, url = splittype(fullurl)
212        raise IOError, ('url error', 'unknown url type', type)
213
214    def open_unknown_proxy(self, proxy, fullurl, data=None):
215        """Overridable interface to open unknown URL type."""
216        type, url = splittype(fullurl)
217        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
218
219    # External interface
220    def retrieve(self, url, filename=None, reporthook=None, data=None):
221        """retrieve(url) returns (filename, headers) for a local object
222        or (tempfilename, headers) for a remote object."""
223        url = unwrap(toBytes(url))
224        if self.tempcache and url in self.tempcache:
225            return self.tempcache[url]
226        type, url1 = splittype(url)
227        if filename is None and (not type or type == 'file'):
228            try:
229                fp = self.open_local_file(url1)
230                hdrs = fp.info()
231                del fp
232                return url2pathname(splithost(url1)[1]), hdrs
233            except IOError, msg:
234                pass
235        fp = self.open(url, data)
236        headers = fp.info()
237        if filename:
238            tfp = open(filename, 'wb')
239        else:
240            import tempfile
241            garbage, path = splittype(url)
242            garbage, path = splithost(path or "")
243            path, garbage = splitquery(path or "")
244            path, garbage = splitattr(path or "")
245            suffix = os.path.splitext(path)[1]
246            (fd, filename) = tempfile.mkstemp(suffix)
247            self.__tempfiles.append(filename)
248            tfp = os.fdopen(fd, 'wb')
249        result = filename, headers
250        if self.tempcache is not None:
251            self.tempcache[url] = result
252        bs = 1024*8
253        size = -1
254        read = 0
255        blocknum = 0
256        if reporthook:
257            if "content-length" in headers:
258                size = int(headers["Content-Length"])
259            reporthook(blocknum, bs, size)
260        while 1:
261            block = fp.read(bs)
262            if block == "":
263                break
264            read += len(block)
265            tfp.write(block)
266            blocknum += 1
267            if reporthook:
268                reporthook(blocknum, bs, size)
269        fp.close()
270        tfp.close()
271        del fp
272        del tfp
273
274        # raise exception if actual size does not match content-length header
275        if size >= 0 and read < size:
276            raise ContentTooShortError("retrieval incomplete: got only %i out "
277                                       "of %i bytes" % (read, size), result)
278
279        return result
280
281    # Each method named open_<type> knows how to open that type of URL
282
283    def open_http(self, url, data=None):
284        """Use HTTP protocol."""
285        import httplib
286        user_passwd = None
287        proxy_passwd= None
288        if isinstance(url, str):
289            host, selector = splithost(url)
290            if host:
291                user_passwd, host = splituser(host)
292                host = unquote(host)
293            realhost = host
294        else:
295            host, selector = url
296            # check whether the proxy contains authorization information
297            proxy_passwd, host = splituser(host)
298            # now we proceed with the url we want to obtain
299            urltype, rest = splittype(selector)
300            url = rest
301            user_passwd = None
302            if urltype.lower() != 'http':
303                realhost = None
304            else:
305                realhost, rest = splithost(rest)
306                if realhost:
307                    user_passwd, realhost = splituser(realhost)
308                if user_passwd:
309                    selector = "%s://%s%s" % (urltype, realhost, rest)
310                if proxy_bypass(realhost):
311                    host = realhost
312
313            #print "proxy via http:", host, selector
314        if not host: raise IOError, ('http error', 'no host given')
315
316        if proxy_passwd:
317            import base64
318            proxy_auth = base64.b64encode(proxy_passwd).strip()
319        else:
320            proxy_auth = None
321
322        if user_passwd:
323            import base64
324            auth = base64.b64encode(user_passwd).strip()
325        else:
326            auth = None
327        h = httplib.HTTP(host)
328        if data is not None:
329            h.putrequest('POST', selector)
330            h.putheader('Content-Type', 'application/x-www-form-urlencoded')
331            h.putheader('Content-Length', '%d' % len(data))
332        else:
333            h.putrequest('GET', selector)
334        if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
335        if auth: h.putheader('Authorization', 'Basic %s' % auth)
336        if realhost: h.putheader('Host', realhost)
337        for args in self.addheaders: h.putheader(*args)
338        h.endheaders(data)
339        errcode, errmsg, headers = h.getreply()
340        fp = h.getfile()
341        if errcode == -1:
342            if fp: fp.close()
343            # something went wrong with the HTTP status line
344            raise IOError, ('http protocol error', 0,
345                            'got a bad status line', None)
346        # According to RFC 2616, "2xx" code indicates that the client's
347        # request was successfully received, understood, and accepted.
348        if (200 <= errcode < 300):
349            return addinfourl(fp, headers, "http:" + url, errcode)
350        else:
351            if data is None:
352                return self.http_error(url, fp, errcode, errmsg, headers)
353            else:
354                return self.http_error(url, fp, errcode, errmsg, headers, data)
355
356    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
357        """Handle http errors.
358        Derived class can override this, or provide specific handlers
359        named http_error_DDD where DDD is the 3-digit error code."""
360        # First check if there's a specific handler for this error
361        name = 'http_error_%d' % errcode
362        if hasattr(self, name):
363            method = getattr(self, name)
364            if data is None:
365                result = method(url, fp, errcode, errmsg, headers)
366            else:
367                result = method(url, fp, errcode, errmsg, headers, data)
368            if result: return result
369        return self.http_error_default(url, fp, errcode, errmsg, headers)
370
371    def http_error_default(self, url, fp, errcode, errmsg, headers):
372        """Default error handler: close the connection and raise IOError."""
373        void = fp.read()
374        fp.close()
375        raise IOError, ('http error', errcode, errmsg, headers)
376
377    if _have_ssl:
378        def open_https(self, url, data=None):
379            """Use HTTPS protocol."""
380
381            import httplib
382            user_passwd = None
383            proxy_passwd = None
384            if isinstance(url, str):
385                host, selector = splithost(url)
386                if host:
387                    user_passwd, host = splituser(host)
388                    host = unquote(host)
389                realhost = host
390            else:
391                host, selector = url
392                # here, we determine, whether the proxy contains authorization information
393                proxy_passwd, host = splituser(host)
394                urltype, rest = splittype(selector)
395                url = rest
396                user_passwd = None
397                if urltype.lower() != 'https':
398                    realhost = None
399                else:
400                    realhost, rest = splithost(rest)
401                    if realhost:
402                        user_passwd, realhost = splituser(realhost)
403                    if user_passwd:
404                        selector = "%s://%s%s" % (urltype, realhost, rest)
405                #print "proxy via https:", host, selector
406            if not host: raise IOError, ('https error', 'no host given')
407            if proxy_passwd:
408                import base64
409                proxy_auth = base64.b64encode(proxy_passwd).strip()
410            else:
411                proxy_auth = None
412            if user_passwd:
413                import base64
414                auth = base64.b64encode(user_passwd).strip()
415            else:
416                auth = None
417            h = httplib.HTTPS(host, 0,
418                              key_file=self.key_file,
419                              cert_file=self.cert_file)
420            if data is not None:
421                h.putrequest('POST', selector)
422                h.putheader('Content-Type',
423                            'application/x-www-form-urlencoded')
424                h.putheader('Content-Length', '%d' % len(data))
425            else:
426                h.putrequest('GET', selector)
427            if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
428            if auth: h.putheader('Authorization', 'Basic %s' % auth)
429            if realhost: h.putheader('Host', realhost)
430            for args in self.addheaders: h.putheader(*args)
431            h.endheaders(data)
432            errcode, errmsg, headers = h.getreply()
433            fp = h.getfile()
434            if errcode == -1:
435                if fp: fp.close()
436                # something went wrong with the HTTP status line
437                raise IOError, ('http protocol error', 0,
438                                'got a bad status line', None)
439            # According to RFC 2616, "2xx" code indicates that the client's
440            # request was successfully received, understood, and accepted.
441            if (200 <= errcode < 300):
442                return addinfourl(fp, headers, "https:" + url, errcode)
443            else:
444                if data is None:
445                    return self.http_error(url, fp, errcode, errmsg, headers)
446                else:
447                    return self.http_error(url, fp, errcode, errmsg, headers,
448                                           data)
449
450    def open_file(self, url):
451        """Use local file or FTP depending on form of URL."""
452        if not isinstance(url, str):
453            raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
454        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
455            return self.open_ftp(url)
456        else:
457            return self.open_local_file(url)
458
459    def open_local_file(self, url):
460        """Use local file."""
461        import mimetypes, mimetools, email.utils
462        try:
463            from cStringIO import StringIO
464        except ImportError:
465            from StringIO import StringIO
466        host, file = splithost(url)
467        localname = url2pathname(file)
468        try:
469            stats = os.stat(localname)
470        except OSError, e:
471            raise IOError(e.errno, e.strerror, e.filename)
472        size = stats.st_size
473        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
474        mtype = mimetypes.guess_type(url)[0]
475        headers = mimetools.Message(StringIO(
476            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
477            (mtype or 'text/plain', size, modified)))
478        if not host:
479            urlfile = file
480            if file[:1] == '/':
481                urlfile = 'file://' + file
482            return addinfourl(open(localname, 'rb'),
483                              headers, urlfile)
484        host, port = splitport(host)
485        if not port \
486           and socket.gethostbyname(host) in (localhost(), thishost()):
487            urlfile = file
488            if file[:1] == '/':
489                urlfile = 'file://' + file
490            return addinfourl(open(localname, 'rb'),
491                              headers, urlfile)
492        raise IOError, ('local file error', 'not on local host')
493
494    def open_ftp(self, url):
495        """Use FTP protocol."""
496        if not isinstance(url, str):
497            raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
498        import mimetypes, mimetools
499        try:
500            from cStringIO import StringIO
501        except ImportError:
502            from StringIO import StringIO
503        host, path = splithost(url)
504        if not host: raise IOError, ('ftp error', 'no host given')
505        host, port = splitport(host)
506        user, host = splituser(host)
507        if user: user, passwd = splitpasswd(user)
508        else: passwd = None
509        host = unquote(host)
510        user = unquote(user or '')
511        passwd = unquote(passwd or '')
512        host = socket.gethostbyname(host)
513        if not port:
514            import ftplib
515            port = ftplib.FTP_PORT
516        else:
517            port = int(port)
518        path, attrs = splitattr(path)
519        path = unquote(path)
520        dirs = path.split('/')
521        dirs, file = dirs[:-1], dirs[-1]
522        if dirs and not dirs[0]: dirs = dirs[1:]
523        if dirs and not dirs[0]: dirs[0] = '/'
524        key = user, host, port, '/'.join(dirs)
525        # XXX thread unsafe!
526        if len(self.ftpcache) > MAXFTPCACHE:
527            # Prune the cache, rather arbitrarily
528            for k in self.ftpcache.keys():
529                if k != key:
530                    v = self.ftpcache[k]
531                    del self.ftpcache[k]
532                    v.close()
533        try:
534            if not key in self.ftpcache:
535                self.ftpcache[key] = \
536                    ftpwrapper(user, passwd, host, port, dirs)
537            if not file: type = 'D'
538            else: type = 'I'
539            for attr in attrs:
540                attr, value = splitvalue(attr)
541                if attr.lower() == 'type' and \
542                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
543                    type = value.upper()
544            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
545            mtype = mimetypes.guess_type("ftp:" + url)[0]
546            headers = ""
547            if mtype:
548                headers += "Content-Type: %s\n" % mtype
549            if retrlen is not None and retrlen >= 0:
550                headers += "Content-Length: %d\n" % retrlen
551            headers = mimetools.Message(StringIO(headers))
552            return addinfourl(fp, headers, "ftp:" + url)
553        except ftperrors(), msg:
554            raise IOError, ('ftp error', msg), sys.exc_info()[2]
555
556    def open_data(self, url, data=None):
557        """Use "data" URL."""
558        if not isinstance(url, str):
559            raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
560        # ignore POSTed data
561        #
562        # syntax of data URLs:
563        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
564        # mediatype := [ type "/" subtype ] *( ";" parameter )
565        # data      := *urlchar
566        # parameter := attribute "=" value
567        import mimetools
568        try:
569            from cStringIO import StringIO
570        except ImportError:
571            from StringIO import StringIO
572        try:
573            [type, data] = url.split(',', 1)
574        except ValueError:
575            raise IOError, ('data error', 'bad data URL')
576        if not type:
577            type = 'text/plain;charset=US-ASCII'
578        semi = type.rfind(';')
579        if semi >= 0 and '=' not in type[semi:]:
580            encoding = type[semi+1:]
581            type = type[:semi]
582        else:
583            encoding = ''
584        msg = []
585        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
586                                            time.gmtime(time.time())))
587        msg.append('Content-type: %s' % type)
588        if encoding == 'base64':
589            import base64
590            data = base64.decodestring(data)
591        else:
592            data = unquote(data)
593        msg.append('Content-Length: %d' % len(data))
594        msg.append('')
595        msg.append(data)
596        msg = '\n'.join(msg)
597        f = StringIO(msg)
598        headers = mimetools.Message(f, 0)
599        #f.fileno = None     # needed for addinfourl
600        return addinfourl(f, headers, url)
601
602
603class FancyURLopener(URLopener):
604    """Derived class with handlers for errors we can handle (perhaps)."""
605
606    def __init__(self, *args, **kwargs):
607        URLopener.__init__(self, *args, **kwargs)
608        self.auth_cache = {}
609        self.tries = 0
610        self.maxtries = 10
611
612    def http_error_default(self, url, fp, errcode, errmsg, headers):
613        """Default error handling -- don't raise an exception."""
614        return addinfourl(fp, headers, "http:" + url, errcode)
615
616    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
617        """Error 302 -- relocated (temporarily)."""
618        self.tries += 1
619        if self.maxtries and self.tries >= self.maxtries:
620            if hasattr(self, "http_error_500"):
621                meth = self.http_error_500
622            else:
623                meth = self.http_error_default
624            self.tries = 0
625            return meth(url, fp, 500,
626                        "Internal Server Error: Redirect Recursion", headers)
627        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
628                                        data)
629        self.tries = 0
630        return result
631
632    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
633        if 'location' in headers:
634            newurl = headers['location']
635        elif 'uri' in headers:
636            newurl = headers['uri']
637        else:
638            return
639        void = fp.read()
640        fp.close()
641        # In case the server sent a relative URL, join with original:
642        newurl = basejoin(self.type + ":" + url, newurl)
643        return self.open(newurl)
644
645    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
646        """Error 301 -- also relocated (permanently)."""
647        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
648
649    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
650        """Error 303 -- also relocated (essentially identical to 302)."""
651        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
652
653    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
654        """Error 307 -- relocated, but turn POST into error."""
655        if data is None:
656            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
657        else:
658            return self.http_error_default(url, fp, errcode, errmsg, headers)
659
660    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
661        """Error 401 -- authentication required.
662        This function supports Basic authentication only."""
663        if not 'www-authenticate' in headers:
664            URLopener.http_error_default(self, url, fp,
665                                         errcode, errmsg, headers)
666        stuff = headers['www-authenticate']
667        import re
668        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
669        if not match:
670            URLopener.http_error_default(self, url, fp,
671                                         errcode, errmsg, headers)
672        scheme, realm = match.groups()
673        if scheme.lower() != 'basic':
674            URLopener.http_error_default(self, url, fp,
675                                         errcode, errmsg, headers)
676        name = 'retry_' + self.type + '_basic_auth'
677        if data is None:
678            return getattr(self,name)(url, realm)
679        else:
680            return getattr(self,name)(url, realm, data)
681
682    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
683        """Error 407 -- proxy authentication required.
684        This function supports Basic authentication only."""
685        if not 'proxy-authenticate' in headers:
686            URLopener.http_error_default(self, url, fp,
687                                         errcode, errmsg, headers)
688        stuff = headers['proxy-authenticate']
689        import re
690        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
691        if not match:
692            URLopener.http_error_default(self, url, fp,
693                                         errcode, errmsg, headers)
694        scheme, realm = match.groups()
695        if scheme.lower() != 'basic':
696            URLopener.http_error_default(self, url, fp,
697                                         errcode, errmsg, headers)
698        name = 'retry_proxy_' + self.type + '_basic_auth'
699        if data is None:
700            return getattr(self,name)(url, realm)
701        else:
702            return getattr(self,name)(url, realm, data)
703
704    def retry_proxy_http_basic_auth(self, url, realm, data=None):
705        host, selector = splithost(url)
706        newurl = 'http://' + host + selector
707        proxy = self.proxies['http']
708        urltype, proxyhost = splittype(proxy)
709        proxyhost, proxyselector = splithost(proxyhost)
710        i = proxyhost.find('@') + 1
711        proxyhost = proxyhost[i:]
712        user, passwd = self.get_user_passwd(proxyhost, realm, i)
713        if not (user or passwd): return None
714        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
715        self.proxies['http'] = 'http://' + proxyhost + proxyselector
716        if data is None:
717            return self.open(newurl)
718        else:
719            return self.open(newurl, data)
720
721    def retry_proxy_https_basic_auth(self, url, realm, data=None):
722        host, selector = splithost(url)
723        newurl = 'https://' + host + selector
724        proxy = self.proxies['https']
725        urltype, proxyhost = splittype(proxy)
726        proxyhost, proxyselector = splithost(proxyhost)
727        i = proxyhost.find('@') + 1
728        proxyhost = proxyhost[i:]
729        user, passwd = self.get_user_passwd(proxyhost, realm, i)
730        if not (user or passwd): return None
731        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
732        self.proxies['https'] = 'https://' + proxyhost + proxyselector
733        if data is None:
734            return self.open(newurl)
735        else:
736            return self.open(newurl, data)
737
738    def retry_http_basic_auth(self, url, realm, data=None):
739        host, selector = splithost(url)
740        i = host.find('@') + 1
741        host = host[i:]
742        user, passwd = self.get_user_passwd(host, realm, i)
743        if not (user or passwd): return None
744        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
745        newurl = 'http://' + host + selector
746        if data is None:
747            return self.open(newurl)
748        else:
749            return self.open(newurl, data)
750
751    def retry_https_basic_auth(self, url, realm, data=None):
752        host, selector = splithost(url)
753        i = host.find('@') + 1
754        host = host[i:]
755        user, passwd = self.get_user_passwd(host, realm, i)
756        if not (user or passwd): return None
757        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
758        newurl = 'https://' + host + selector
759        if data is None:
760            return self.open(newurl)
761        else:
762            return self.open(newurl, data)
763
764    def get_user_passwd(self, host, realm, clear_cache = 0):
765        key = realm + '@' + host.lower()
766        if key in self.auth_cache:
767            if clear_cache:
768                del self.auth_cache[key]
769            else:
770                return self.auth_cache[key]
771        user, passwd = self.prompt_user_passwd(host, realm)
772        if user or passwd: self.auth_cache[key] = (user, passwd)
773        return user, passwd
774
775    def prompt_user_passwd(self, host, realm):
776        """Override this in a GUI environment!"""
777        import getpass
778        try:
779            user = raw_input("Enter username for %s at %s: " % (realm,
780                                                                host))
781            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
782                (user, realm, host))
783            return user, passwd
784        except KeyboardInterrupt:
785            print
786            return None, None
787
788
789# Utility functions
790
791_localhost = None
792def localhost():
793    """Return the IP address of the magic hostname 'localhost'."""
794    global _localhost
795    if _localhost is None:
796        _localhost = socket.gethostbyname('localhost')
797    return _localhost
798
799_thishost = None
800def thishost():
801    """Return the IP address of the current host."""
802    global _thishost
803    if _thishost is None:
804        _thishost = socket.gethostbyname(socket.gethostname())
805    return _thishost
806
807_ftperrors = None
808def ftperrors():
809    """Return the set of errors raised by the FTP class."""
810    global _ftperrors
811    if _ftperrors is None:
812        import ftplib
813        _ftperrors = ftplib.all_errors
814    return _ftperrors
815
816_noheaders = None
817def noheaders():
818    """Return an empty mimetools.Message object."""
819    global _noheaders
820    if _noheaders is None:
821        import mimetools
822        try:
823            from cStringIO import StringIO
824        except ImportError:
825            from StringIO import StringIO
826        _noheaders = mimetools.Message(StringIO(), 0)
827        _noheaders.fp.close()   # Recycle file descriptor
828    return _noheaders
829
830
831# Utility classes
832
833class ftpwrapper:
834    """Class used by open_ftp() for cache of open FTP connections."""
835
836    def __init__(self, user, passwd, host, port, dirs,
837                 timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
838        self.user = user
839        self.passwd = passwd
840        self.host = host
841        self.port = port
842        self.dirs = dirs
843        self.timeout = timeout
844        self.init()
845
846    def init(self):
847        import ftplib
848        self.busy = 0
849        self.ftp = ftplib.FTP()
850        self.ftp.connect(self.host, self.port, self.timeout)
851        self.ftp.login(self.user, self.passwd)
852        for dir in self.dirs:
853            self.ftp.cwd(dir)
854
855    def retrfile(self, file, type):
856        import ftplib
857        self.endtransfer()
858        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
859        else: cmd = 'TYPE ' + type; isdir = 0
860        try:
861            self.ftp.voidcmd(cmd)
862        except ftplib.all_errors:
863            self.init()
864            self.ftp.voidcmd(cmd)
865        conn = None
866        if file and not isdir:
867            # Try to retrieve as a file
868            try:
869                cmd = 'RETR ' + file
870                conn = self.ftp.ntransfercmd(cmd)
871            except ftplib.error_perm, reason:
872                if str(reason)[:3] != '550':
873                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
874        if not conn:
875            # Set transfer mode to ASCII!
876            self.ftp.voidcmd('TYPE A')
877            # Try a directory listing. Verify that directory exists.
878            if file:
879                pwd = self.ftp.pwd()
880                try:
881                    try:
882                        self.ftp.cwd(file)
883                    except ftplib.error_perm, reason:
884                        raise IOError, ('ftp error', reason), sys.exc_info()[2]
885                finally:
886                    self.ftp.cwd(pwd)
887                cmd = 'LIST ' + file
888            else:
889                cmd = 'LIST'
890            conn = self.ftp.ntransfercmd(cmd)
891        self.busy = 1
892        # Pass back both a suitably decorated object and a retrieval length
893        return (addclosehook(conn[0].makefile('rb'),
894                             self.endtransfer), conn[1])
895    def endtransfer(self):
896        if not self.busy:
897            return
898        self.busy = 0
899        try:
900            self.ftp.voidresp()
901        except ftperrors():
902            pass
903
904    def close(self):
905        self.endtransfer()
906        try:
907            self.ftp.close()
908        except ftperrors():
909            pass
910
911class addbase:
912    """Base class for addinfo and addclosehook."""
913
914    def __init__(self, fp):
915        self.fp = fp
916        self.read = self.fp.read
917        self.readline = self.fp.readline
918        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
919        if hasattr(self.fp, "fileno"):
920            self.fileno = self.fp.fileno
921        else:
922            self.fileno = lambda: None
923        if hasattr(self.fp, "__iter__"):
924            self.__iter__ = self.fp.__iter__
925            if hasattr(self.fp, "next"):
926                self.next = self.fp.next
927
928    def __repr__(self):
929        return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
930                                             id(self), self.fp)
931
932    def close(self):
933        self.read = None
934        self.readline = None
935        self.readlines = None
936        self.fileno = None
937        if self.fp: self.fp.close()
938        self.fp = None
939
940class addclosehook(addbase):
941    """Class to add a close hook to an open file."""
942
943    def __init__(self, fp, closehook, *hookargs):
944        addbase.__init__(self, fp)
945        self.closehook = closehook
946        self.hookargs = hookargs
947
948    def close(self):
949        addbase.close(self)
950        if self.closehook:
951            self.closehook(*self.hookargs)
952            self.closehook = None
953            self.hookargs = None
954
955class addinfo(addbase):
956    """class to add an info() method to an open file."""
957
958    def __init__(self, fp, headers):
959        addbase.__init__(self, fp)
960        self.headers = headers
961
962    def info(self):
963        return self.headers
964
965class addinfourl(addbase):
966    """class to add info() and geturl() methods to an open file."""
967
968    def __init__(self, fp, headers, url, code=None):
969        addbase.__init__(self, fp)
970        self.headers = headers
971        self.url = url
972        self.code = code
973
974    def info(self):
975        return self.headers
976
977    def getcode(self):
978        return self.code
979
980    def geturl(self):
981        return self.url
982
983
984# Utilities to parse URLs (most of these return None for missing parts):
985# unwrap('<URL:type://host/path>') --> 'type://host/path'
986# splittype('type:opaquestring') --> 'type', 'opaquestring'
987# splithost('//host[:port]/path') --> 'host[:port]', '/path'
988# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
989# splitpasswd('user:passwd') -> 'user', 'passwd'
990# splitport('host:port') --> 'host', 'port'
991# splitquery('/path?query') --> '/path', 'query'
992# splittag('/path#tag') --> '/path', 'tag'
993# splitattr('/path;attr1=value1;attr2=value2;...') ->
994#   '/path', ['attr1=value1', 'attr2=value2', ...]
995# splitvalue('attr=value') --> 'attr', 'value'
996# unquote('abc%20def') -> 'abc def'
997# quote('abc def') -> 'abc%20def')
998
999try:
1000    unicode
1001except NameError:
1002    def _is_unicode(x):
1003        return 0
1004else:
1005    def _is_unicode(x):
1006        return isinstance(x, unicode)
1007
1008def toBytes(url):
1009    """toBytes(u"URL") --> 'URL'."""
1010    # Most URL schemes require ASCII. If that changes, the conversion
1011    # can be relaxed
1012    if _is_unicode(url):
1013        try:
1014            url = url.encode("ASCII")
1015        except UnicodeError:
1016            raise UnicodeError("URL " + repr(url) +
1017                               " contains non-ASCII characters")
1018    return url
1019
1020def unwrap(url):
1021    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
1022    url = url.strip()
1023    if url[:1] == '<' and url[-1:] == '>':
1024        url = url[1:-1].strip()
1025    if url[:4] == 'URL:': url = url[4:].strip()
1026    return url
1027
1028_typeprog = None
1029def splittype(url):
1030    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1031    global _typeprog
1032    if _typeprog is None:
1033        import re
1034        _typeprog = re.compile('^([^/:]+):')
1035
1036    match = _typeprog.match(url)
1037    if match:
1038        scheme = match.group(1)
1039        return scheme.lower(), url[len(scheme) + 1:]
1040    return None, url
1041
1042_hostprog = None
1043def splithost(url):
1044    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1045    global _hostprog
1046    if _hostprog is None:
1047        import re
1048        _hostprog = re.compile('^//([^/?]*)(.*)$')
1049
1050    match = _hostprog.match(url)
1051    if match: return match.group(1, 2)
1052    return None, url
1053
1054_userprog = None
1055def splituser(host):
1056    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1057    global _userprog
1058    if _userprog is None:
1059        import re
1060        _userprog = re.compile('^(.*)@(.*)$')
1061
1062    match = _userprog.match(host)
1063    if match: return map(unquote, match.group(1, 2))
1064    return None, host
1065
1066_passwdprog = None
1067def splitpasswd(user):
1068    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1069    global _passwdprog
1070    if _passwdprog is None:
1071        import re
1072        _passwdprog = re.compile('^([^:]*):(.*)$')
1073
1074    match = _passwdprog.match(user)
1075    if match: return match.group(1, 2)
1076    return user, None
1077
1078# splittag('/path#tag') --> '/path', 'tag'
1079_portprog = None
1080def splitport(host):
1081    """splitport('host:port') --> 'host', 'port'."""
1082    global _portprog
1083    if _portprog is None:
1084        import re
1085        _portprog = re.compile('^(.*):([0-9]+)$')
1086
1087    match = _portprog.match(host)
1088    if match: return match.group(1, 2)
1089    return host, None
1090
1091_nportprog = None
1092def splitnport(host, defport=-1):
1093    """Split host and port, returning numeric port.
1094    Return given default port if no ':' found; defaults to -1.
1095    Return numerical port if a valid number are found after ':'.
1096    Return None if ':' but not a valid number."""
1097    global _nportprog
1098    if _nportprog is None:
1099        import re
1100        _nportprog = re.compile('^(.*):(.*)$')
1101
1102    match = _nportprog.match(host)
1103    if match:
1104        host, port = match.group(1, 2)
1105        try:
1106            if not port: raise ValueError, "no digits"
1107            nport = int(port)
1108        except ValueError:
1109            nport = None
1110        return host, nport
1111    return host, defport
1112
1113_queryprog = None
1114def splitquery(url):
1115    """splitquery('/path?query') --> '/path', 'query'."""
1116    global _queryprog
1117    if _queryprog is None:
1118        import re
1119        _queryprog = re.compile('^(.*)\?([^?]*)$')
1120
1121    match = _queryprog.match(url)
1122    if match: return match.group(1, 2)
1123    return url, None
1124
1125_tagprog = None
1126def splittag(url):
1127    """splittag('/path#tag') --> '/path', 'tag'."""
1128    global _tagprog
1129    if _tagprog is None:
1130        import re
1131        _tagprog = re.compile('^(.*)#([^#]*)$')
1132
1133    match = _tagprog.match(url)
1134    if match: return match.group(1, 2)
1135    return url, None
1136
1137def splitattr(url):
1138    """splitattr('/path;attr1=value1;attr2=value2;...') ->
1139        '/path', ['attr1=value1', 'attr2=value2', ...]."""
1140    words = url.split(';')
1141    return words[0], words[1:]
1142
1143_valueprog = None
1144def splitvalue(attr):
1145    """splitvalue('attr=value') --> 'attr', 'value'."""
1146    global _valueprog
1147    if _valueprog is None:
1148        import re
1149        _valueprog = re.compile('^([^=]*)=(.*)$')
1150
1151    match = _valueprog.match(attr)
1152    if match: return match.group(1, 2)
1153    return attr, None
1154
1155_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
1156_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
1157
1158def unquote(s):
1159    """unquote('abc%20def') -> 'abc def'."""
1160    res = s.split('%')
1161    for i in xrange(1, len(res)):
1162        item = res[i]
1163        try:
1164            res[i] = _hextochr[item[:2]] + item[2:]
1165        except KeyError:
1166            res[i] = '%' + item
1167        except UnicodeDecodeError:
1168            res[i] = unichr(int(item[:2], 16)) + item[2:]
1169    return "".join(res)
1170
1171def unquote_plus(s):
1172    """unquote('%7e/abc+def') -> '~/abc def'"""
1173    s = s.replace('+', ' ')
1174    return unquote(s)
1175
1176always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1177               'abcdefghijklmnopqrstuvwxyz'
1178               '0123456789' '_.-')
1179_safemaps = {}
1180
1181def quote(s, safe = '/'):
1182    """quote('abc def') -> 'abc%20def'
1183
1184    Each part of a URL, e.g. the path info, the query, etc., has a
1185    different set of reserved characters that must be quoted.
1186
1187    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1188    the following reserved characters.
1189
1190    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1191                  "$" | ","
1192
1193    Each of these characters is reserved in some component of a URL,
1194    but not necessarily in all of them.
1195
1196    By default, the quote function is intended for quoting the path
1197    section of a URL.  Thus, it will not encode '/'.  This character
1198    is reserved, but in typical usage the quote function is being
1199    called on a path where the existing slash characters are used as
1200    reserved characters.
1201    """
1202    cachekey = (safe, always_safe)
1203    try:
1204        safe_map = _safemaps[cachekey]
1205    except KeyError:
1206        safe += always_safe
1207        safe_map = {}
1208        for i in range(256):
1209            c = chr(i)
1210            safe_map[c] = (c in safe) and c or ('%%%02X' % i)
1211        _safemaps[cachekey] = safe_map
1212    res = map(safe_map.__getitem__, s)
1213    return ''.join(res)
1214
1215def quote_plus(s, safe = ''):
1216    """Quote the query fragment of a URL; replacing ' ' with '+'"""
1217    if ' ' in s:
1218        s = quote(s, safe + ' ')
1219        return s.replace(' ', '+')
1220    return quote(s, safe)
1221
1222def urlencode(query,doseq=0):
1223    """Encode a sequence of two-element tuples or dictionary into a URL query string.
1224
1225    If any values in the query arg are sequences and doseq is true, each
1226    sequence element is converted to a separate parameter.
1227
1228    If the query arg is a sequence of two-element tuples, the order of the
1229    parameters in the output will match the order of parameters in the
1230    input.
1231    """
1232
1233    if hasattr(query,"items"):
1234        # mapping objects
1235        query = query.items()
1236    else:
1237        # it's a bother at times that strings and string-like objects are
1238        # sequences...
1239        try:
1240            # non-sequence items should not work with len()
1241            # non-empty strings will fail this
1242            if len(query) and not isinstance(query[0], tuple):
1243                raise TypeError
1244            # zero-length sequences of all types will get here and succeed,
1245            # but that's a minor nit - since the original implementation
1246            # allowed empty dicts that type of behavior probably should be
1247            # preserved for consistency
1248        except TypeError:
1249            ty,va,tb = sys.exc_info()
1250            raise TypeError, "not a valid non-string sequence or mapping object", tb
1251
1252    l = []
1253    if not doseq:
1254        # preserve old behavior
1255        for k, v in query:
1256            k = quote_plus(str(k))
1257            v = quote_plus(str(v))
1258            l.append(k + '=' + v)
1259    else:
1260        for k, v in query:
1261            k = quote_plus(str(k))
1262            if isinstance(v, str):
1263                v = quote_plus(v)
1264                l.append(k + '=' + v)
1265            elif _is_unicode(v):
1266                # is there a reasonable way to convert to ASCII?
1267                # encode generates a string, but "replace" or "ignore"
1268                # lose information and "strict" can raise UnicodeError
1269                v = quote_plus(v.encode("ASCII","replace"))
1270                l.append(k + '=' + v)
1271            else:
1272                try:
1273                    # is this a sufficient test for sequence-ness?
1274                    x = len(v)
1275                except TypeError:
1276                    # not a sequence
1277                    v = quote_plus(str(v))
1278                    l.append(k + '=' + v)
1279                else:
1280                    # loop over the sequence
1281                    for elt in v:
1282                        l.append(k + '=' + quote_plus(str(elt)))
1283    return '&'.join(l)
1284
1285# Proxy handling
1286def getproxies_environment():
1287    """Return a dictionary of scheme -> proxy server URL mappings.
1288
1289    Scan the environment for variables named <scheme>_proxy;
1290    this seems to be the standard convention.  If you need a
1291    different way, you can pass a proxies dictionary to the
1292    [Fancy]URLopener constructor.
1293
1294    """
1295    proxies = {}
1296    for name, value in os.environ.items():
1297        name = name.lower()
1298        if value and name[-6:] == '_proxy':
1299            proxies[name[:-6]] = value
1300    return proxies
1301
1302def proxy_bypass_environment(host):
1303    """Test if proxies should not be used for a particular host.
1304
1305    Checks the environment for a variable named no_proxy, which should
1306    be a list of DNS suffixes separated by commas, or '*' for all hosts.
1307    """
1308    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
1309    # '*' is special case for always bypass
1310    if no_proxy == '*':
1311        return 1
1312    # strip port off host
1313    hostonly, port = splitport(host)
1314    # check if the host ends with any of the DNS suffixes
1315    for name in no_proxy.split(','):
1316        if name and (hostonly.endswith(name) or host.endswith(name)):
1317            return 1
1318    # otherwise, don't bypass
1319    return 0
1320
1321
1322if sys.platform == 'darwin':
1323
1324    def _CFSetup(sc):
1325        from ctypes import c_int32, c_void_p, c_char_p, c_int
1326        sc.CFStringCreateWithCString.argtypes = [ c_void_p, c_char_p, c_int32 ]
1327        sc.CFStringCreateWithCString.restype = c_void_p
1328        sc.SCDynamicStoreCopyProxies.argtypes = [ c_void_p ]
1329        sc.SCDynamicStoreCopyProxies.restype = c_void_p
1330        sc.CFDictionaryGetValue.argtypes = [ c_void_p, c_void_p ]
1331        sc.CFDictionaryGetValue.restype = c_void_p
1332        sc.CFStringGetLength.argtypes = [ c_void_p ]
1333        sc.CFStringGetLength.restype = c_int32
1334        sc.CFStringGetCString.argtypes = [ c_void_p, c_char_p, c_int32, c_int32 ]
1335        sc.CFStringGetCString.restype = c_int32
1336        sc.CFNumberGetValue.argtypes = [ c_void_p, c_int, c_void_p ]
1337        sc.CFNumberGetValue.restype = c_int32
1338        sc.CFRelease.argtypes = [ c_void_p ]
1339        sc.CFRelease.restype = None
1340
1341    def _CStringFromCFString(sc, value):
1342        from ctypes import create_string_buffer
1343        length = sc.CFStringGetLength(value) + 1
1344        buff = create_string_buffer(length)
1345        sc.CFStringGetCString(value, buff, length, 0)
1346        return buff.value
1347
1348    def _CFNumberToInt32(sc, cfnum):
1349        from ctypes import byref, c_int
1350        val = c_int()
1351        kCFNumberSInt32Type = 3
1352        sc.CFNumberGetValue(cfnum, kCFNumberSInt32Type, byref(val))
1353        return val.value
1354
1355
1356    def proxy_bypass_macosx_sysconf(host):
1357        """
1358        Return True iff this host shouldn't be accessed using a proxy
1359
1360        This function uses the MacOSX framework SystemConfiguration
1361        to fetch the proxy information.
1362        """
1363        from ctypes import cdll
1364        from ctypes.util import find_library
1365        import re
1366        import socket
1367        from fnmatch import fnmatch
1368
1369        def ip2num(ipAddr):
1370            parts = ipAddr.split('.')
1371            parts = map(int, parts)
1372            if len(parts) != 4:
1373                parts = (parts + [0, 0, 0, 0])[:4]
1374            return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
1375
1376        sc = cdll.LoadLibrary(find_library("SystemConfiguration"))
1377        _CFSetup(sc)
1378
1379        hostIP = None
1380
1381        if not sc:
1382            return False
1383
1384        kSCPropNetProxiesExceptionsList = sc.CFStringCreateWithCString(0, "ExceptionsList", 0)
1385        kSCPropNetProxiesExcludeSimpleHostnames = sc.CFStringCreateWithCString(0,
1386                "ExcludeSimpleHostnames", 0)
1387
1388
1389        proxyDict = sc.SCDynamicStoreCopyProxies(None)
1390        if proxyDict is None:
1391            return False
1392
1393        try:
1394            # Check for simple host names:
1395            if '.' not in host:
1396                exclude_simple = sc.CFDictionaryGetValue(proxyDict,
1397                        kSCPropNetProxiesExcludeSimpleHostnames)
1398                if exclude_simple and _CFNumberToInt32(sc, exclude_simple):
1399                    return True
1400
1401
1402            # Check the exceptions list:
1403            exceptions = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesExceptionsList)
1404            if exceptions:
1405                # Items in the list are strings like these: *.local, 169.254/16
1406                for index in xrange(sc.CFArrayGetCount(exceptions)):
1407                    value = sc.CFArrayGetValueAtIndex(exceptions, index)
1408                    if not value: continue
1409                    value = _CStringFromCFString(sc, value)
1410
1411                    m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
1412                    if m is not None:
1413                        if hostIP is None:
1414                            hostIP = socket.gethostbyname(host)
1415                            hostIP = ip2num(hostIP)
1416
1417                        base = ip2num(m.group(1))
1418                        mask = int(m.group(2)[1:])
1419                        mask = 32 - mask
1420
1421                        if (hostIP >> mask) == (base >> mask):
1422                            return True
1423
1424                    elif fnmatch(host, value):
1425                        return True
1426
1427            return False
1428
1429        finally:
1430            sc.CFRelease(kSCPropNetProxiesExceptionsList)
1431            sc.CFRelease(kSCPropNetProxiesExcludeSimpleHostnames)
1432
1433
1434
1435    def getproxies_macosx_sysconf():
1436        """Return a dictionary of scheme -> proxy server URL mappings.
1437
1438        This function uses the MacOSX framework SystemConfiguration
1439        to fetch the proxy information.
1440        """
1441        from ctypes import cdll
1442        from ctypes.util import find_library
1443
1444        sc = cdll.LoadLibrary(find_library("SystemConfiguration"))
1445        _CFSetup(sc)
1446
1447        if not sc:
1448            return {}
1449
1450        kSCPropNetProxiesHTTPEnable = sc.CFStringCreateWithCString(0, "HTTPEnable", 0)
1451        kSCPropNetProxiesHTTPProxy = sc.CFStringCreateWithCString(0, "HTTPProxy", 0)
1452        kSCPropNetProxiesHTTPPort = sc.CFStringCreateWithCString(0, "HTTPPort", 0)
1453
1454        kSCPropNetProxiesHTTPSEnable = sc.CFStringCreateWithCString(0, "HTTPSEnable", 0)
1455        kSCPropNetProxiesHTTPSProxy = sc.CFStringCreateWithCString(0, "HTTPSProxy", 0)
1456        kSCPropNetProxiesHTTPSPort = sc.CFStringCreateWithCString(0, "HTTPSPort", 0)
1457
1458        kSCPropNetProxiesFTPEnable = sc.CFStringCreateWithCString(0, "FTPEnable", 0)
1459        kSCPropNetProxiesFTPPassive = sc.CFStringCreateWithCString(0, "FTPPassive", 0)
1460        kSCPropNetProxiesFTPPort = sc.CFStringCreateWithCString(0, "FTPPort", 0)
1461        kSCPropNetProxiesFTPProxy = sc.CFStringCreateWithCString(0, "FTPProxy", 0)
1462
1463        kSCPropNetProxiesGopherEnable = sc.CFStringCreateWithCString(0, "GopherEnable", 0)
1464        kSCPropNetProxiesGopherPort = sc.CFStringCreateWithCString(0, "GopherPort", 0)
1465        kSCPropNetProxiesGopherProxy = sc.CFStringCreateWithCString(0, "GopherProxy", 0)
1466
1467        proxies = {}
        proxyDict = sc.SCDynamicStoreCopyProxies(None)
        if proxyDict is None:
            return proxies

        try:
1471            # HTTP:
1472            enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPEnable)
1473            if enabled and _CFNumberToInt32(sc, enabled):
1474                proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPProxy)
1475                port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPPort)
1476
1477                if proxy:
1478                    proxy = _CStringFromCFString(sc, proxy)
1479                    if port:
1480                        port = _CFNumberToInt32(sc, port)
1481                        proxies["http"] = "http://%s:%i" % (proxy, port)
1482                    else:
1483                        proxies["http"] = "http://%s" % (proxy, )
1484
1485            # HTTPS:
1486            enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSEnable)
1487            if enabled and _CFNumberToInt32(sc, enabled):
1488                proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSProxy)
1489                port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSPort)
1490
1491                if proxy:
1492                    proxy = _CStringFromCFString(sc, proxy)
1493                    if port:
1494                        port = _CFNumberToInt32(sc, port)
1495                        proxies["https"] = "http://%s:%i" % (proxy, port)
1496                    else:
1497                        proxies["https"] = "http://%s" % (proxy, )
1498
1499            # FTP:
1500            enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPEnable)
1501            if enabled and _CFNumberToInt32(sc, enabled):
1502                proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPProxy)
1503                port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPPort)
1504
1505                if proxy:
1506                    proxy = _CStringFromCFString(sc, proxy)
1507                    if port:
1508                        port = _CFNumberToInt32(sc, port)
1509                        proxies["ftp"] = "http://%s:%i" % (proxy, port)
1510                    else:
1511                        proxies["ftp"] = "http://%s" % (proxy, )
1512
1513            # Gopher:
1514            enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherEnable)
1515            if enabled and _CFNumberToInt32(sc, enabled):
1516                proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherProxy)
1517                port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherPort)
1518
1519                if proxy:
1520                    proxy = _CStringFromCFString(sc, proxy)
1521                    if port:
1522                        port = _CFNumberToInt32(sc, port)
1523                        proxies["gopher"] = "http://%s:%i" % (proxy, port)
1524                    else:
1525                        proxies["gopher"] = "http://%s" % (proxy, )
1526        finally:
1527            sc.CFRelease(proxyDict)
1528
        sc.CFRelease(kSCPropNetProxiesHTTPEnable)
        sc.CFRelease(kSCPropNetProxiesHTTPProxy)
        sc.CFRelease(kSCPropNetProxiesHTTPPort)
        sc.CFRelease(kSCPropNetProxiesHTTPSEnable)
        sc.CFRelease(kSCPropNetProxiesHTTPSProxy)
        sc.CFRelease(kSCPropNetProxiesHTTPSPort)
        sc.CFRelease(kSCPropNetProxiesFTPEnable)
        sc.CFRelease(kSCPropNetProxiesFTPPassive)
        sc.CFRelease(kSCPropNetProxiesFTPPort)
        sc.CFRelease(kSCPropNetProxiesFTPProxy)
        sc.CFRelease(kSCPropNetProxiesGopherEnable)
        sc.CFRelease(kSCPropNetProxiesGopherPort)
        sc.CFRelease(kSCPropNetProxiesGopherProxy)
1539
1540        return proxies
1541
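    # Example of the mapping built above, for a host whose System Preferences
    # configure an HTTP proxy "proxy.example.com" on port 8080 (host name and
    # port invented for illustration):
    #
    #     {'http': 'http://proxy.example.com:8080'}
    #
    # When the HTTPS, FTP or Gopher proxies are enabled as well, their
    # entries are likewise written with an http:// prefix, as the code above
    # shows.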
1542
1543
1544    def proxy_bypass(host):
1545        if getproxies_environment():
1546            return proxy_bypass_environment(host)
1547        else:
1548            return proxy_bypass_macosx_sysconf(host)
1549
1550    def getproxies():
1551        return getproxies_environment() or getproxies_macosx_sysconf()
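
    # The environment takes precedence over the system configuration.  For
    # example (shell session sketched for illustration; the proxy URL is
    # invented):
    #
    #     $ http_proxy=http://localhost:3128 python
    #     >>> import urllib
    #     >>> urllib.getproxies()
    #     {'http': 'http://localhost:3128'}
    #
    # Only when no *_proxy variables are set does getproxies() fall back to
    # getproxies_macosx_sysconf().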
1552
1553elif os.name == 'nt':
1554    def getproxies_registry():
1555        """Return a dictionary of scheme -> proxy server URL mappings.
1556
1557        Win32 uses the registry to store proxies.
1558
1559        """
1560        proxies = {}
1561        try:
1562            import _winreg
1563        except ImportError:
1564            # Std module, so should be around - but you never know!
1565            return proxies
1566        try:
1567            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1568                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1569            proxyEnable = _winreg.QueryValueEx(internetSettings,
1570                                               'ProxyEnable')[0]
1571            if proxyEnable:
1572                # Returned as Unicode but problems if not converted to ASCII
1573                proxyServer = str(_winreg.QueryValueEx(internetSettings,
1574                                                       'ProxyServer')[0])
1575                if '=' in proxyServer:
                    # Per-protocol settings
                    import re
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
1584                else:
1585                    # Use one setting for all protocols
1586                    if proxyServer[:5] == 'http:':
1587                        proxies['http'] = proxyServer
1588                    else:
1589                        proxies['http'] = 'http://%s' % proxyServer
1590                        proxies['ftp'] = 'ftp://%s' % proxyServer
1591            internetSettings.Close()
1592        except (WindowsError, ValueError, TypeError):
1593            # Either registry key not found etc, or the value in an
1594            # unexpected format.
1595            # proxies already set up to be empty so nothing to do
1596            pass
1597        return proxies
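
    # Illustrative registry values and the mappings they produce (server
    # names invented for illustration):
    #
    #   ProxyServer = 'http=proxy:80;ftp=ftpproxy:21'
    #       -> {'http': 'http://proxy:80', 'ftp': 'ftp://ftpproxy:21'}
    #   ProxyServer = 'proxy:80'
    #       -> {'http': 'http://proxy:80', 'ftp': 'ftp://proxy:80'}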
1598
1599    def getproxies():
1600        """Return a dictionary of scheme -> proxy server URL mappings.
1601
1602        Returns settings gathered from the environment, if specified,
1603        or the registry.
1604
1605        """
1606        return getproxies_environment() or getproxies_registry()
1607
1608    def proxy_bypass_registry(host):
1609        try:
1610            import _winreg
1611            import re
1612        except ImportError:
1613            # Std modules, so should be around - but you never know!
1614            return 0
1615        try:
1616            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1617                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1618            proxyEnable = _winreg.QueryValueEx(internetSettings,
1619                                               'ProxyEnable')[0]
1620            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1621                                                     'ProxyOverride')[0])
1622            # ^^^^ Returned as Unicode but problems if not converted to ASCII
1623        except WindowsError:
1624            return 0
1625        if not proxyEnable or not proxyOverride:
1626            return 0
1627        # try to make a host list from name and IP address.
1628        rawHost, port = splitport(host)
1629        host = [rawHost]
1630        try:
1631            addr = socket.gethostbyname(rawHost)
1632            if addr != rawHost:
1633                host.append(addr)
1634        except socket.error:
1635            pass
1636        try:
1637            fqdn = socket.getfqdn(rawHost)
1638            if fqdn != rawHost:
1639                host.append(fqdn)
1640        except socket.error:
1641            pass
1642        # make a check value list from the registry entry: replace the
1643        # '<local>' string by the localhost entry and the corresponding
1644        # canonical entry.
1645        proxyOverride = proxyOverride.split(';')
1646        i = 0
1647        while i < len(proxyOverride):
1648            if proxyOverride[i] == '<local>':
1649                proxyOverride[i:i+1] = ['localhost',
1650                                        '127.0.0.1',
1651                                        socket.gethostname(),
1652                                        socket.gethostbyname(
1653                                            socket.gethostname())]
1654            i += 1
1655        # print proxyOverride
1656        # now check if we match one of the registry values.
1657        for test in proxyOverride:
1658            test = test.replace(".", r"\.")     # mask dots
1659            test = test.replace("*", r".*")     # change glob sequence
1660            test = test.replace("?", r".")      # change glob char
1661            for val in host:
1662                # print "%s <--> %s" %( test, val )
1663                if re.match(test, val, re.I):
1664                    return 1
1665        return 0
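
    # Example of the glob -> regular expression translation performed above
    # (override entries invented for illustration):
    #
    #   '*.example.com'  becomes  r'.*\.example\.com'  and matches 'www.example.com'
    #   '192.168.1.?'    becomes  r'192\.168\.1\..'    and matches '192.168.1.5'
    #
    # Since re.match() only anchors at the start of the string, the patterns
    # behave as prefix matches.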
1666
1667    def proxy_bypass(host):
        """Return True if the host should be bypassed (not proxied).

        Checks the environment first, if proxy settings are specified
        there, and otherwise the registry.

        """
1674        if getproxies_environment():
1675            return proxy_bypass_environment(host)
1676        else:
1677            return proxy_bypass_registry(host)
1678
1679else:
1680    # By default use environment variables
1681    getproxies = getproxies_environment
1682    proxy_bypass = proxy_bypass_environment
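
    # On these platforms the *_proxy and no_proxy environment variables are
    # the only configuration consulted, e.g. (values invented for
    # illustration):
    #
    #     http_proxy=http://proxy.example.com:3128
    #     no_proxy=localhost,.example.com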
1683
1684# Test and time quote() and unquote()
1685def test1():
1686    s = ''
1687    for i in range(256): s = s + chr(i)
1688    s = s*4
1689    t0 = time.time()
1690    qs = quote(s)
1691    uqs = unquote(qs)
1692    t1 = time.time()
1693    if uqs != s:
1694        print 'Wrong!'
1695    print repr(s)
1696    print repr(qs)
1697    print repr(uqs)
1698    print round(t1 - t0, 3), 'sec'
1699
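# For reference, the kind of transformation test1() times (values chosen for
# illustration; see quote() and unquote() above):
#
#     quote('/~user/file name')        -> '/%7Euser/file%20name'
#     quote_plus('file name')          -> 'file+name'
#     unquote('/%7Euser/file%20name')  -> '/~user/file name'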
1700
1701def reporthook(blocknum, blocksize, totalsize):
1702    # Report during remote transfers
1703    print "Block number: %d, Block size: %d, Total size: %d" % (
1704        blocknum, blocksize, totalsize)
1705
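# urlretrieve() calls a reporthook roughly as reporthook(blocknum, blocksize,
# totalsize), where totalsize may be -1 when the server does not report a
# length.  A sketch of a percentage-printing variant (not used by test()
# below; the name is invented):
def _percent_reporthook(blocknum, blocksize, totalsize):
    if totalsize > 0:
        percent = min(100, blocknum * blocksize * 100 // totalsize)
        print "Downloaded %d%%" % percent
    else:
        print "Downloaded %d bytes" % (blocknum * blocksize)
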
1706# Test program
1707def test(args=[]):
1708    if not args:
1709        args = [
1710            '/etc/passwd',
1711            'file:/etc/passwd',
1712            'file://localhost/etc/passwd',
1713            'ftp://ftp.gnu.org/pub/README',
1714            'http://www.python.org/index.html',
1715            ]
1716        if hasattr(URLopener, "open_https"):
1717            args.append('https://synergy.as.cmu.edu/~geek/')
1718    try:
1719        for url in args:
1720            print '-'*10, url, '-'*10
1721            fn, h = urlretrieve(url, None, reporthook)
1722            print fn
1723            if h:
1724                print '======'
1725                for k in h.keys(): print k + ':', h[k]
1726                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            fp.close()
1730            if '\r' in data:
1731                table = string.maketrans("", "")
1732                data = data.translate(table, "\r")
1733            print data
1734            fn, h = None, None
1735        print '-'*40
1736    finally:
1737        urlcleanup()
1738
1739def main():
1740    import getopt, sys
1741    try:
1742        opts, args = getopt.getopt(sys.argv[1:], "th")
1743    except getopt.error, msg:
1744        print msg
1745        print "Use -h for help"
1746        return
1747    t = 0
1748    for o, a in opts:
1749        if o == '-t':
1750            t = t + 1
1751        if o == '-h':
1752            print "Usage: python urllib.py [-t] [url ...]"
1753            print "-t runs self-test;",
1754            print "otherwise, contents of urls are printed"
1755            return
1756    if t:
1757        if t > 1:
1758            test1()
1759        test(args)
1760    else:
1761        if not args:
1762            print "Use -h for help"
1763        for url in args:
1764            print urlopen(url).read(),
1765
1766# Run test program when run as a script
1767if __name__ == '__main__':
1768    main()
1769