urllib.py revision 1c24592b925ba4716f2c0cec10bfe59cef2eed30
1"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol.  All you know is that it has methods read(), readline(),
18readlines(), fileno(), close() and info().  The read*(), fileno()
19and close() methods work like those of open files.
20The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
24
25import string
26import socket
27import os
28import time
29import sys
30from urlparse import urljoin as basejoin
31
# Names exported by "from urllib import *", in their historical order.
__all__ = [
    # opening and retrieving
    "urlopen", "URLopener", "FancyURLopener", "urlretrieve", "urlcleanup",
    # quoting
    "quote", "quote_plus", "unquote", "unquote_plus", "urlencode",
    # path <-> URL conversion and miscellaneous helpers
    "url2pathname", "pathname2url", "splittag",
    "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
    # URL splitting
    "splittype", "splithost", "splituser", "splitpasswd", "splitport",
    "splitnport", "splitquery", "splitattr", "splitvalue",
    # proxy discovery
    "getproxies",
]

# XXX This version is not always updated :-(
__version__ = '1.17'

# Trim the ftp cache beyond this size
MAXFTPCACHE = 10
43
# Pick the path <-> URL converters: platform modules where one exists,
# otherwise plain quoting/unquoting.
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
elif os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Turn the path part of a 'file' URL into a file system path.

        OS-specific; this generic version only undoes percent-quoting.
        Not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Turn a file system path into the path part of a 'file' URL.

        OS-specific; this generic version only applies percent-quoting.
        Not recommended for general use."""
        return quote(pathname)
61
62# This really consists of two pieces:
63# (1) a class which handles opening of all sorts of URLs
64#     (plus assorted utilities etc.)
65# (2) a set of functions for parsing URLs
66# XXX Should these be separated out into different modules?
67
68
# Convenience entry point; the default opener is created lazily and shared
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """Open *url* and return a file-like object to read from.

    If *data* is given it is passed on to the opener's open() call.
    If *proxies* is given, a fresh FancyURLopener using those proxies is
    created for this call only; otherwise the shared module-level opener
    is used (and created on first use)."""
    from warnings import warnpy3k
    warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
             "favor of urllib2.urlopen()", stacklevel=2)

    global _urlopener
    if proxies is not None:
        # One-off opener; deliberately not stored as the shared default.
        opener = FancyURLopener(proxies=proxies)
    else:
        if not _urlopener:
            _urlopener = FancyURLopener()
        opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* via the shared opener's retrieve() method.

    The shared module-level FancyURLopener is created on first use.
    All arguments are handed to retrieve() unchanged and its result is
    returned."""
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Clean up after the shared opener and empty the module-level caches.

    Calls cleanup() on the shared opener if one was created, then clears
    the quoting cache and the FTP connection cache."""
    opener = _urlopener
    if opener:
        opener.cleanup()
    _safe_quoters.clear()
    ftpcache.clear()
99
# Check for SSL support.  Only a failed import means "no SSL"; anything
# else (KeyboardInterrupt, SystemExit, ...) must not be silently swallowed.
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True
107
# Raised when a download delivers fewer bytes than Content-Length promised
class ContentTooShortError(IOError):
    """IOError carrying the partial result of an incomplete download.

    The extra *content* argument is stored unchanged on the instance as
    the ``content`` attribute."""

    def __init__(self, message, content):
        super(ContentTooShortError, self).__init__(message)
        self.content = content
113
114ftpcache = {}
115class URLopener:
116    """Class to open URLs.
117    This is a class rather than just a subroutine because we may need
118    more than one set of global protocol-specific options.
119    Note -- this is a base class for those who don't want the
120    automatic handling of errors type 302 (relocated) and 401
121    (authorization needed)."""
122
123    __tempfiles = None
124
125    version = "Python-urllib/%s" % __version__
126
127    # Constructor
128    def __init__(self, proxies=None, **x509):
129        if proxies is None:
130            proxies = getproxies()
131        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
132        self.proxies = proxies
133        self.key_file = x509.get('key_file')
134        self.cert_file = x509.get('cert_file')
135        self.addheaders = [('User-Agent', self.version)]
136        self.__tempfiles = []
137        self.__unlink = os.unlink # See cleanup()
138        self.tempcache = None
139        # Undocumented feature: if you assign {} to tempcache,
140        # it is used to cache files retrieved with
141        # self.retrieve().  This is not enabled by default
142        # since it does not work for changing documents (and I
143        # haven't got the logic to check expiration headers
144        # yet).
145        self.ftpcache = ftpcache
146        # Undocumented feature: you can use a different
147        # ftp cache by assigning to the .ftpcache member;
148        # in case you want logically independent URL openers
149        # XXX This is not threadsafe.  Bah.
150
151    def __del__(self):
152        self.close()
153
154    def close(self):
155        self.cleanup()
156
157    def cleanup(self):
158        # This code sometimes runs when the rest of this module
159        # has already been deleted, so it can't use any globals
160        # or import anything.
161        if self.__tempfiles:
162            for file in self.__tempfiles:
163                try:
164                    self.__unlink(file)
165                except OSError:
166                    pass
167            del self.__tempfiles[:]
168        if self.tempcache:
169            self.tempcache.clear()
170
171    def addheader(self, *args):
172        """Add a header to be used by the HTTP interface only
173        e.g. u.addheader('Accept', 'sound/basic')"""
174        self.addheaders.append(args)
175
176    # External interface
177    def open(self, fullurl, data=None):
178        """Use URLopener().open(file) instead of open(file, 'r')."""
179        fullurl = unwrap(toBytes(fullurl))
180        # percent encode url. fixing lame server errors like space within url
181        # parts
182        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
183        if self.tempcache and fullurl in self.tempcache:
184            filename, headers = self.tempcache[fullurl]
185            fp = open(filename, 'rb')
186            return addinfourl(fp, headers, fullurl)
187        urltype, url = splittype(fullurl)
188        if not urltype:
189            urltype = 'file'
190        if urltype in self.proxies:
191            proxy = self.proxies[urltype]
192            urltype, proxyhost = splittype(proxy)
193            host, selector = splithost(proxyhost)
194            url = (host, fullurl) # Signal special case to open_*()
195        else:
196            proxy = None
197        name = 'open_' + urltype
198        self.type = urltype
199        name = name.replace('-', '_')
200        if not hasattr(self, name):
201            if proxy:
202                return self.open_unknown_proxy(proxy, fullurl, data)
203            else:
204                return self.open_unknown(fullurl, data)
205        try:
206            if data is None:
207                return getattr(self, name)(url)
208            else:
209                return getattr(self, name)(url, data)
210        except socket.error, msg:
211            raise IOError, ('socket error', msg), sys.exc_info()[2]
212
213    def open_unknown(self, fullurl, data=None):
214        """Overridable interface to open unknown URL type."""
215        type, url = splittype(fullurl)
216        raise IOError, ('url error', 'unknown url type', type)
217
218    def open_unknown_proxy(self, proxy, fullurl, data=None):
219        """Overridable interface to open unknown URL type."""
220        type, url = splittype(fullurl)
221        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
222
223    # External interface
224    def retrieve(self, url, filename=None, reporthook=None, data=None):
225        """retrieve(url) returns (filename, headers) for a local object
226        or (tempfilename, headers) for a remote object."""
227        url = unwrap(toBytes(url))
228        if self.tempcache and url in self.tempcache:
229            return self.tempcache[url]
230        type, url1 = splittype(url)
231        if filename is None and (not type or type == 'file'):
232            try:
233                fp = self.open_local_file(url1)
234                hdrs = fp.info()
235                del fp
236                return url2pathname(splithost(url1)[1]), hdrs
237            except IOError, msg:
238                pass
239        fp = self.open(url, data)
240        try:
241            headers = fp.info()
242            if filename:
243                tfp = open(filename, 'wb')
244            else:
245                import tempfile
246                garbage, path = splittype(url)
247                garbage, path = splithost(path or "")
248                path, garbage = splitquery(path or "")
249                path, garbage = splitattr(path or "")
250                suffix = os.path.splitext(path)[1]
251                (fd, filename) = tempfile.mkstemp(suffix)
252                self.__tempfiles.append(filename)
253                tfp = os.fdopen(fd, 'wb')
254            try:
255                result = filename, headers
256                if self.tempcache is not None:
257                    self.tempcache[url] = result
258                bs = 1024*8
259                size = -1
260                read = 0
261                blocknum = 0
262                if reporthook:
263                    if "content-length" in headers:
264                        size = int(headers["Content-Length"])
265                    reporthook(blocknum, bs, size)
266                while 1:
267                    block = fp.read(bs)
268                    if block == "":
269                        break
270                    read += len(block)
271                    tfp.write(block)
272                    blocknum += 1
273                    if reporthook:
274                        reporthook(blocknum, bs, size)
275            finally:
276                tfp.close()
277        finally:
278            fp.close()
279        del fp
280        del tfp
281
282        # raise exception if actual size does not match content-length header
283        if size >= 0 and read < size:
284            raise ContentTooShortError("retrieval incomplete: got only %i out "
285                                       "of %i bytes" % (read, size), result)
286
287        return result
288
289    # Each method named open_<type> knows how to open that type of URL
290
291    def open_http(self, url, data=None):
292        """Use HTTP protocol."""
293        import httplib
294        user_passwd = None
295        proxy_passwd= None
296        if isinstance(url, str):
297            host, selector = splithost(url)
298            if host:
299                user_passwd, host = splituser(host)
300                host = unquote(host)
301            realhost = host
302        else:
303            host, selector = url
304            # check whether the proxy contains authorization information
305            proxy_passwd, host = splituser(host)
306            # now we proceed with the url we want to obtain
307            urltype, rest = splittype(selector)
308            url = rest
309            user_passwd = None
310            if urltype.lower() != 'http':
311                realhost = None
312            else:
313                realhost, rest = splithost(rest)
314                if realhost:
315                    user_passwd, realhost = splituser(realhost)
316                if user_passwd:
317                    selector = "%s://%s%s" % (urltype, realhost, rest)
318                if proxy_bypass(realhost):
319                    host = realhost
320
321            #print "proxy via http:", host, selector
322        if not host: raise IOError, ('http error', 'no host given')
323
324        if proxy_passwd:
325            import base64
326            proxy_auth = base64.b64encode(proxy_passwd).strip()
327        else:
328            proxy_auth = None
329
330        if user_passwd:
331            import base64
332            auth = base64.b64encode(user_passwd).strip()
333        else:
334            auth = None
335        h = httplib.HTTP(host)
336        if data is not None:
337            h.putrequest('POST', selector)
338            h.putheader('Content-Type', 'application/x-www-form-urlencoded')
339            h.putheader('Content-Length', '%d' % len(data))
340        else:
341            h.putrequest('GET', selector)
342        if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
343        if auth: h.putheader('Authorization', 'Basic %s' % auth)
344        if realhost: h.putheader('Host', realhost)
345        for args in self.addheaders: h.putheader(*args)
346        h.endheaders()
347        if data is not None:
348            h.send(data)
349        errcode, errmsg, headers = h.getreply()
350        fp = h.getfile()
351        if errcode == -1:
352            if fp: fp.close()
353            # something went wrong with the HTTP status line
354            raise IOError, ('http protocol error', 0,
355                            'got a bad status line', None)
356        # According to RFC 2616, "2xx" code indicates that the client's
357        # request was successfully received, understood, and accepted.
358        if (200 <= errcode < 300):
359            return addinfourl(fp, headers, "http:" + url, errcode)
360        else:
361            if data is None:
362                return self.http_error(url, fp, errcode, errmsg, headers)
363            else:
364                return self.http_error(url, fp, errcode, errmsg, headers, data)
365
366    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
367        """Handle http errors.
368        Derived class can override this, or provide specific handlers
369        named http_error_DDD where DDD is the 3-digit error code."""
370        # First check if there's a specific handler for this error
371        name = 'http_error_%d' % errcode
372        if hasattr(self, name):
373            method = getattr(self, name)
374            if data is None:
375                result = method(url, fp, errcode, errmsg, headers)
376            else:
377                result = method(url, fp, errcode, errmsg, headers, data)
378            if result: return result
379        return self.http_error_default(url, fp, errcode, errmsg, headers)
380
381    def http_error_default(self, url, fp, errcode, errmsg, headers):
382        """Default error handler: close the connection and raise IOError."""
383        void = fp.read()
384        fp.close()
385        raise IOError, ('http error', errcode, errmsg, headers)
386
387    if _have_ssl:
388        def open_https(self, url, data=None):
389            """Use HTTPS protocol."""
390
391            import httplib
392            user_passwd = None
393            proxy_passwd = None
394            if isinstance(url, str):
395                host, selector = splithost(url)
396                if host:
397                    user_passwd, host = splituser(host)
398                    host = unquote(host)
399                realhost = host
400            else:
401                host, selector = url
402                # here, we determine, whether the proxy contains authorization information
403                proxy_passwd, host = splituser(host)
404                urltype, rest = splittype(selector)
405                url = rest
406                user_passwd = None
407                if urltype.lower() != 'https':
408                    realhost = None
409                else:
410                    realhost, rest = splithost(rest)
411                    if realhost:
412                        user_passwd, realhost = splituser(realhost)
413                    if user_passwd:
414                        selector = "%s://%s%s" % (urltype, realhost, rest)
415                #print "proxy via https:", host, selector
416            if not host: raise IOError, ('https error', 'no host given')
417            if proxy_passwd:
418                import base64
419                proxy_auth = base64.b64encode(proxy_passwd).strip()
420            else:
421                proxy_auth = None
422            if user_passwd:
423                import base64
424                auth = base64.b64encode(user_passwd).strip()
425            else:
426                auth = None
427            h = httplib.HTTPS(host, 0,
428                              key_file=self.key_file,
429                              cert_file=self.cert_file)
430            if data is not None:
431                h.putrequest('POST', selector)
432                h.putheader('Content-Type',
433                            'application/x-www-form-urlencoded')
434                h.putheader('Content-Length', '%d' % len(data))
435            else:
436                h.putrequest('GET', selector)
437            if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
438            if auth: h.putheader('Authorization', 'Basic %s' % auth)
439            if realhost: h.putheader('Host', realhost)
440            for args in self.addheaders: h.putheader(*args)
441            h.endheaders()
442            if data is not None:
443                h.send(data)
444            errcode, errmsg, headers = h.getreply()
445            fp = h.getfile()
446            if errcode == -1:
447                if fp: fp.close()
448                # something went wrong with the HTTP status line
449                raise IOError, ('http protocol error', 0,
450                                'got a bad status line', None)
451            # According to RFC 2616, "2xx" code indicates that the client's
452            # request was successfully received, understood, and accepted.
453            if (200 <= errcode < 300):
454                return addinfourl(fp, headers, "https:" + url, errcode)
455            else:
456                if data is None:
457                    return self.http_error(url, fp, errcode, errmsg, headers)
458                else:
459                    return self.http_error(url, fp, errcode, errmsg, headers,
460                                           data)
461
462    def open_file(self, url):
463        """Use local file or FTP depending on form of URL."""
464        if not isinstance(url, str):
465            raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
466        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
467            return self.open_ftp(url)
468        else:
469            return self.open_local_file(url)
470
471    def open_local_file(self, url):
472        """Use local file."""
473        import mimetypes, mimetools, email.utils
474        try:
475            from cStringIO import StringIO
476        except ImportError:
477            from StringIO import StringIO
478        host, file = splithost(url)
479        localname = url2pathname(file)
480        try:
481            stats = os.stat(localname)
482        except OSError, e:
483            raise IOError(e.errno, e.strerror, e.filename)
484        size = stats.st_size
485        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
486        mtype = mimetypes.guess_type(url)[0]
487        headers = mimetools.Message(StringIO(
488            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
489            (mtype or 'text/plain', size, modified)))
490        if not host:
491            urlfile = file
492            if file[:1] == '/':
493                urlfile = 'file://' + file
494            return addinfourl(open(localname, 'rb'),
495                              headers, urlfile)
496        host, port = splitport(host)
497        if not port \
498           and socket.gethostbyname(host) in (localhost(), thishost()):
499            urlfile = file
500            if file[:1] == '/':
501                urlfile = 'file://' + file
502            return addinfourl(open(localname, 'rb'),
503                              headers, urlfile)
504        raise IOError, ('local file error', 'not on local host')
505
506    def open_ftp(self, url):
507        """Use FTP protocol."""
508        if not isinstance(url, str):
509            raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
510        import mimetypes, mimetools
511        try:
512            from cStringIO import StringIO
513        except ImportError:
514            from StringIO import StringIO
515        host, path = splithost(url)
516        if not host: raise IOError, ('ftp error', 'no host given')
517        host, port = splitport(host)
518        user, host = splituser(host)
519        if user: user, passwd = splitpasswd(user)
520        else: passwd = None
521        host = unquote(host)
522        user = unquote(user or '')
523        passwd = unquote(passwd or '')
524        host = socket.gethostbyname(host)
525        if not port:
526            import ftplib
527            port = ftplib.FTP_PORT
528        else:
529            port = int(port)
530        path, attrs = splitattr(path)
531        path = unquote(path)
532        dirs = path.split('/')
533        dirs, file = dirs[:-1], dirs[-1]
534        if dirs and not dirs[0]: dirs = dirs[1:]
535        if dirs and not dirs[0]: dirs[0] = '/'
536        key = user, host, port, '/'.join(dirs)
537        # XXX thread unsafe!
538        if len(self.ftpcache) > MAXFTPCACHE:
539            # Prune the cache, rather arbitrarily
540            for k in self.ftpcache.keys():
541                if k != key:
542                    v = self.ftpcache[k]
543                    del self.ftpcache[k]
544                    v.close()
545        try:
546            if not key in self.ftpcache:
547                self.ftpcache[key] = \
548                    ftpwrapper(user, passwd, host, port, dirs)
549            if not file: type = 'D'
550            else: type = 'I'
551            for attr in attrs:
552                attr, value = splitvalue(attr)
553                if attr.lower() == 'type' and \
554                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
555                    type = value.upper()
556            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
557            mtype = mimetypes.guess_type("ftp:" + url)[0]
558            headers = ""
559            if mtype:
560                headers += "Content-Type: %s\n" % mtype
561            if retrlen is not None and retrlen >= 0:
562                headers += "Content-Length: %d\n" % retrlen
563            headers = mimetools.Message(StringIO(headers))
564            return addinfourl(fp, headers, "ftp:" + url)
565        except ftperrors(), msg:
566            raise IOError, ('ftp error', msg), sys.exc_info()[2]
567
568    def open_data(self, url, data=None):
569        """Use "data" URL."""
570        if not isinstance(url, str):
571            raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
572        # ignore POSTed data
573        #
574        # syntax of data URLs:
575        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
576        # mediatype := [ type "/" subtype ] *( ";" parameter )
577        # data      := *urlchar
578        # parameter := attribute "=" value
579        import mimetools
580        try:
581            from cStringIO import StringIO
582        except ImportError:
583            from StringIO import StringIO
584        try:
585            [type, data] = url.split(',', 1)
586        except ValueError:
587            raise IOError, ('data error', 'bad data URL')
588        if not type:
589            type = 'text/plain;charset=US-ASCII'
590        semi = type.rfind(';')
591        if semi >= 0 and '=' not in type[semi:]:
592            encoding = type[semi+1:]
593            type = type[:semi]
594        else:
595            encoding = ''
596        msg = []
597        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
598                                            time.gmtime(time.time())))
599        msg.append('Content-type: %s' % type)
600        if encoding == 'base64':
601            import base64
602            data = base64.decodestring(data)
603        else:
604            data = unquote(data)
605        msg.append('Content-Length: %d' % len(data))
606        msg.append('')
607        msg.append(data)
608        msg = '\n'.join(msg)
609        f = StringIO(msg)
610        headers = mimetools.Message(f, 0)
611        #f.fileno = None     # needed for addinfourl
612        return addinfourl(f, headers, url)
613
614
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        """Initialise the base opener plus auth cache and redirect counter."""
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the redirect unless self.maxtries nested redirects have
        been reached, in which case the response is reported as a 500
        "Redirect Recursion" error instead."""
        self.tries += 1
        try:
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            return self.redirect_internal(url, fp, errcode, errmsg, headers,
                                          data)
        finally:
            # Reset even when following the redirect raises, so a failed
            # request cannot leave a stale count behind for the next one.
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the URL named by the 'location' (or 'uri') header.

        Returns None when neither header is present.  The new URL is
        opened without data.  Raises IOError if the target is not an
        http, https or ftp URL."""
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Read the old response body before closing; content is discarded.
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)

        # The target comes from the remote server.  For security reasons
        # do not follow redirects to protocols other than HTTP, HTTPS or
        # FTP (e.g. "file:", which would expose local files).
        if not newurl.lower().startswith(('http://', 'https://', 'ftp://')):
            raise IOError('redirect error', errcode,
                          errmsg + " - Redirection to url '%s' is not allowed" %
                          newurl,
                          headers)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # URLopener.http_error_default() always raises IOError, so each
        # call below ends this method.
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch to retry_http_basic_auth / retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # URLopener.http_error_default() always raises IOError, so each
        # call below ends this method.
        if not 'proxy-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch to retry_proxy_http_basic_auth / retry_proxy_https_basic_auth.
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an HTTP request with user:password added to the http proxy.

        Rewrites self.proxies['http'] with the credentials.  Returns None
        if no user name or password was obtained."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # i is non-zero when the proxy URL already carried credentials;
        # they evidently failed, so the cached entry is discarded.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an HTTPS request with user:password added to the https proxy.

        Rewrites self.proxies['https'] with the credentials.  Returns None
        if no user name or password was obtained."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # i is non-zero when the proxy URL already carried credentials;
        # they evidently failed, so the cached entry is discarded.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an HTTP request with user:password embedded in the URL.

        Returns None if no user name or password was obtained."""
        host, selector = splithost(url)
        # i is non-zero when the URL already carried credentials; they
        # evidently failed, so the cached entry is discarded.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an HTTPS request with user:password embedded in the URL.

        Returns None if no user name or password was obtained."""
        host, selector = splithost(url)
        # i is non-zero when the URL already carried credentials; they
        # evidently failed, so the cached entry is discarded.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm at host, prompting if needed.

        Answers are cached in self.auth_cache under 'realm@host' (host
        lower-cased).  A true clear_cache drops any cached entry first
        and prompts again.  Empty answers are not cached."""
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Asks for user name and password on the terminal; returns
        (None, None) if interrupted with Ctrl-C."""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
799
800
801# Utility functions
802
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once; later calls return the remembered value.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
810
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The address of socket.gethostname() is resolved once; later calls
    return the remembered value.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
818
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported only on the first call; the value of
    ftplib.all_errors is then remembered.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
827
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The message is built once from an empty StringIO and shared by all
    later calls.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    import mimetools
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    empty = StringIO()
    _noheaders = mimetools.Message(empty, 0)
    _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
841
842
843# Utility classes
844
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    An instance remembers the login data and directory list it was
    created with, so that init() can rebuild the connection later.
    """

    def __init__(self, user, passwd, host, port, dirs,
                 timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Store the connection parameters and connect at once via init()."""
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """(Re)create self.ftp: connect, log in and cwd into each of dirs."""
        import ftplib
        # No transfer is pending on a fresh connection.
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving file, falling back to a directory listing.

        type 'd' or 'D' asks for a directory listing (sent as 'TYPE A');
        any other value is sent verbatim as 'TYPE <type>' and the file is
        fetched with RETR.  If RETR fails with a permanent error whose
        text starts with '550', or if file is empty, a LIST is issued
        instead.  Other permanent errors, and a failing cwd into file
        before LIST, are re-raised as IOError('ftp error', reason) with
        the original traceback.

        Returns a tuple (fileobj, length): fileobj is an addclosehook
        wrapper whose close() calls self.endtransfer(), length is the
        second item returned by ftplib's ntransfercmd().
        """
        import ftplib
        # Finish a transfer that may still be pending on this connection.
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The command failed: reconnect once and try it again.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                # A reply starting with '550' is tolerated so that the
                # directory listing below gets its chance.
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm, reason:
                        raise IOError, ('ftp error', reason), sys.exc_info()[2]
                finally:
                    # Always return to the directory we started from.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])
    def endtransfer(self):
        """Read the pending server response if a transfer is marked busy.

        Errors listed by ftperrors() are ignored.
        """
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """Finish any pending transfer and close the FTP connection.

        Errors listed by ftperrors() while closing are ignored.
        """
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
922
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object and re-exports its reading methods as
    attributes of the wrapper.
    """

    def __init__(self, fp):
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        if hasattr(fp, "readlines"):
            self.readlines = fp.readlines
        if hasattr(fp, "fileno"):
            self.fileno = fp.fileno
        else:
            # The wrapped object has no descriptor: answer None instead.
            self.fileno = lambda: None
        if hasattr(fp, "__iter__"):
            self.__iter__ = fp.__iter__
            if hasattr(fp, "next"):
                self.next = fp.next

    def __repr__(self):
        details = (self.__class__.__name__, id(self), self.fp)
        return '<%s at %r whose fp = %r>' % details

    def close(self):
        """Drop the re-exported methods and close the wrapped object."""
        self.read = self.readline = self.readlines = self.fileno = None
        if self.fp:
            self.fp.close()
        self.fp = None
951
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    closehook(*hookargs) is called after the wrapped file was closed;
    the hook is then forgotten, so it runs at most once.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook, self.hookargs = closehook, hookargs

    def close(self):
        addbase.close(self)
        if not self.closehook:
            return
        self.closehook(*self.hookargs)
        self.closehook = None
        self.hookargs = None
966
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        # headers is kept untouched and handed back by info().
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
976
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file.

    getcode() additionally reports the optional code given at
    construction time (None if it was omitted).
    """

    def __init__(self, fp, headers, url, code=None):
        addbase.__init__(self, fp)
        self.headers, self.url, self.code = headers, url, code

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers

    def getcode(self):
        """Return the code given to the constructor."""
        return self.code

    def geturl(self):
        """Return the url given to the constructor."""
        return self.url
994
995
996# Utilities to parse URLs (most of these return None for missing parts):
997# unwrap('<URL:type://host/path>') --> 'type://host/path'
998# splittype('type:opaquestring') --> 'type', 'opaquestring'
999# splithost('//host[:port]/path') --> 'host[:port]', '/path'
1000# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
1001# splitpasswd('user:passwd') -> 'user', 'passwd'
1002# splitport('host:port') --> 'host', 'port'
1003# splitquery('/path?query') --> '/path', 'query'
1004# splittag('/path#tag') --> '/path', 'tag'
1005# splitattr('/path;attr1=value1;attr2=value2;...') ->
1006#   '/path', ['attr1=value1', 'attr2=value2', ...]
1007# splitvalue('attr=value') --> 'attr', 'value'
1008# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
1010
# _is_unicode(x) tells whether x is a unicode string.  On interpreters
# without the unicode builtin nothing can be one, so the answer is 0.
try:
    unicode
except NameError:
    def _is_unicode(x):
        """Always false: this interpreter has no unicode type."""
        return 0
else:
    def _is_unicode(x):
        """Return true if x is an instance of unicode."""
        return isinstance(x, unicode)
1019
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    Anything that is not a unicode string is returned unchanged.  A
    unicode string is encoded as ASCII; UnicodeError is raised if it
    contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if not _is_unicode(url):
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
1031
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of angle brackets and a leading
    'URL:' marker are removed, in that order.
    """
    url = url.strip()
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
1039
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  If url does not start with a
    'type:' prefix, (None, url) is returned.
    """
    global _typeprog
    if _typeprog is None:
        # Compile the pattern on first use only.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match covers the scheme plus the ':' that follows it.
    return found.group(1).lower(), url[found.end():]
1053
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#', the three
    characters that terminate the authority component of a URL.  The
    rest of url, including any newline characters, is returned as the
    second item.  If url does not start with '//', (None, url) is
    returned.
    """
    global _hostprog
    if _hostprog is None:
        import re
        # '#' must end the host as well: otherwise a fragment such as
        # '#@other.host' becomes part of the host and is later mistaken
        # for user info.  DOTALL lets the remainder contain newlines
        # instead of making the whole match fail.
        _hostprog = re.compile('^//([^/?#]*)(.*)$', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        return match.group(1, 2)
    return None, url
1065
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'; both parts are passed through
    unquote() and returned as a two-item list.  Without an '@' the
    result is the tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # Compile the pattern on first use only.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
1077
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without a ':' the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile the pattern on first use only.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
1089
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  If host does not end
    in ':' followed by at least one digit, (host, None) is returned.
    """
    global _portprog
    if _portprog is None:
        # Compile the pattern on first use only.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
1102
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.
    """
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if not match:
        return host, defport
    host, port = match.group(1, 2)
    try:
        # int('') raises ValueError as well, so an empty port needs no
        # separate check.
        nport = int(port)
    except ValueError:
        nport = None
    return host, nport
1124
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' is a regular expression escape, not a string
        # escape.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
1136
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # Compile the pattern on first use only.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
1148
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'."""
    pieces = url.split(';')
    path = pieces[0]
    attrs = pieces[1:]
    return path, attrs
1154
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without a '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # Compile the pattern on first use only.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
1166
_hexdig = '0123456789ABCDEFabcdef'
# Maps every two-digit hex string, in any mix of letter case, to the
# character with that code.
_hextochr = dict((hi + lo, chr(int(hi + lo, 16)))
                 for hi in _hexdig for lo in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hex digits is left in place.
    """
    pieces = s.split('%')
    # fastpath
    if len(pieces) == 1:
        return s
    s = pieces[0]
    for item in pieces[1:]:
        code, rest = item[:2], item[2:]
        try:
            s += _hextochr[code] + rest
        except KeyError:
            # Not a valid escape: put the '%' back untouched.
            s += '%' + item
        except UnicodeDecodeError:
            # The concatenation above could not be decoded; append the
            # character as a unicode character instead.
            s += unichr(int(code, 16)) + rest
    return s
1186
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Every '+' is turned into a space before unquote() is applied."""
    return unquote(s.replace('+', ' '))
1191
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
# Maps each of the 256 single-byte characters to itself if it is always
# safe, otherwise to its '%XX' escape.
_safe_map = {}
for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
    if i < 128 and c in always_safe:
        _safe_map[c] = c
    else:
        _safe_map[c] = '%{0:02X}'.format(i)
# Cache of (quoter, safe characters) pairs, keyed by (safe, always_safe).
_safe_quoters = {}

def quote(s, safe='/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    Characters listed in safe are left unquoted in addition to the
    letters, digits and '_.-'.  An empty s is returned unchanged.
    """
    # fastpath
    if not s:
        return s
    cachekey = (safe, always_safe)
    entry = _safe_quoters.get(cachekey)
    if entry is None:
        # First use of this safe set: derive its lookup table.
        table = _safe_map.copy()
        for ch in safe:
            table[ch] = ch
        entry = (table.__getitem__, always_safe + safe)
        _safe_quoters[cachekey] = entry
    quoter, safe = entry
    # Nothing is left after stripping safe characters: no quoting needed.
    if not s.rstrip(safe):
        return s
    return ''.join(map(quoter, s))
1236
def quote_plus(s, safe=''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' not in s:
        return quote(s, safe)
    # Keep spaces out of the quoting, then turn them into '+'.
    quoted = quote(s, safe + ' ')
    return quoted.replace(' ', '+')
1243
1244def urlencode(query, doseq=0):
1245    """Encode a sequence of two-element tuples or dictionary into a URL query string.
1246
1247    If any values in the query arg are sequences and doseq is true, each
1248    sequence element is converted to a separate parameter.
1249
1250    If the query arg is a sequence of two-element tuples, the order of the
1251    parameters in the output will match the order of parameters in the
1252    input.
1253    """
1254
1255    if hasattr(query,"items"):
1256        # mapping objects
1257        query = query.items()
1258    else:
1259        # it's a bother at times that strings and string-like objects are
1260        # sequences...
1261        try:
1262            # non-sequence items should not work with len()
1263            # non-empty strings will fail this
1264            if len(query) and not isinstance(query[0], tuple):
1265                raise TypeError
1266            # zero-length sequences of all types will get here and succeed,
1267            # but that's a minor nit - since the original implementation
1268            # allowed empty dicts that type of behavior probably should be
1269            # preserved for consistency
1270        except TypeError:
1271            ty,va,tb = sys.exc_info()
1272            raise TypeError, "not a valid non-string sequence or mapping object", tb
1273
1274    l = []
1275    if not doseq:
1276        # preserve old behavior
1277        for k, v in query:
1278            k = quote_plus(str(k))
1279            v = quote_plus(str(v))
1280            l.append(k + '=' + v)
1281    else:
1282        for k, v in query:
1283            k = quote_plus(str(k))
1284            if isinstance(v, str):
1285                v = quote_plus(v)
1286                l.append(k + '=' + v)
1287            elif _is_unicode(v):
1288                # is there a reasonable way to convert to ASCII?
1289                # encode generates a string, but "replace" or "ignore"
1290                # lose information and "strict" can raise UnicodeError
1291                v = quote_plus(v.encode("ASCII","replace"))
1292                l.append(k + '=' + v)
1293            else:
1294                try:
1295                    # is this a sufficient test for sequence-ness?
1296                    x = len(v)
1297                except TypeError:
1298                    # not a sequence
1299                    v = quote_plus(str(v))
1300                    l.append(k + '=' + v)
1301                else:
1302                    # loop over the sequence
1303                    for elt in v:
1304                        l.append(k + '=' + quote_plus(str(elt)))
1305    return '&'.join(l)
1306
1307# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy, in any
    letter case; this seems to be the standard convention.  Variables
    with an empty value are ignored.  If the same variable is set in
    several letter cases, the all-lower-case spelling wins.  When
    REQUEST_METHOD is set (CGI), HTTP_PROXY is not trusted and only an
    all-lower-case http_proxy can define the 'http' entry.

    If you need a different way, you can pass a proxies dictionary to
    the [Fancy]URLopener constructor.

    """
    proxies = {}
    # First pass: accept the variables whatever their letter case.
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    # A CGI server exports the client's "Proxy:" request header as
    # HTTP_PROXY, so in a CGI environment that value may be chosen by
    # the remote client.  Forget it here; a genuine lower-case
    # http_proxy is picked up again by the second pass.
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    # Second pass: all-lower-case names override other spellings, so the
    # result does not depend on the order the environment is listed in.
    for name, value in os.environ.items():
        if value and name == name.lower() and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
1323
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy (or NO_PROXY
    if that one is unset or empty), which should be a list of DNS
    suffixes separated by commas, or '*' for all hosts.  Returns 1 if
    the proxy should be bypassed, else 0.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for suffix in no_proxy.split(','):
        if not suffix:
            continue
        if hostonly.endswith(suffix) or host.endswith(suffix):
            return 1
    # otherwise, don't bypass
    return 0
1342
1343
1344if sys.platform == 'darwin':
1345    from _scproxy import _get_proxy_settings, _get_proxies
1346
1347    def proxy_bypass_macosx_sysconf(host):
1348        """
1349        Return True iff this host shouldn't be accessed using a proxy
1350
1351        This function uses the MacOSX framework SystemConfiguration
1352        to fetch the proxy information.
1353        """
1354        import re
1355        import socket
1356        from fnmatch import fnmatch
1357
1358        hostonly, port = splitport(host)
1359
1360        def ip2num(ipAddr):
1361            parts = ipAddr.split('.')
1362            parts = map(int, parts)
1363            if len(parts) != 4:
1364                parts = (parts + [0, 0, 0, 0])[:4]
1365            return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
1366
1367        proxy_settings = _get_proxy_settings()
1368
1369        # Check for simple host names:
1370        if '.' not in host:
1371            if proxy_settings['exclude_simple']:
1372                return True
1373
1374        hostIP = None
1375
1376        for value in proxy_settings.get('exceptions', ()):
1377            # Items in the list are strings like these: *.local, 169.254/16
1378            if not value: continue
1379
1380            m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
1381            if m is not None:
1382                if hostIP is None:
1383                    try:
1384                        hostIP = socket.gethostbyname(hostonly)
1385                        hostIP = ip2num(hostIP)
1386                    except socket.error:
1387                        continue
1388
1389                base = ip2num(m.group(1))
1390                mask = m.group(2)
1391                if mask is None:
1392                    mask = 8 * (m.group(1).count('.') + 1)
1393
1394                else:
1395                    mask = int(mask[1:])
1396                    mask = 32 - mask
1397
1398                if (hostIP >> mask) == (base >> mask):
1399                    return True
1400
1401            elif fnmatch(host, value):
1402                return True
1403
1404        return False
1405
1406    def getproxies_macosx_sysconf():
1407        """Return a dictionary of scheme -> proxy server URL mappings.
1408
1409        This function uses the MacOSX framework SystemConfiguration
1410        to fetch the proxy information.
1411        """
1412        return _get_proxies()
1413
1414    def proxy_bypass(host):
1415        if getproxies_environment():
1416            return proxy_bypass_environment(host)
1417        else:
1418            return proxy_bypass_macosx_sysconf(host)
1419
1420    def getproxies():
1421        return getproxies_environment() or getproxies_macosx_sysconf()
1422
1423elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.  The 'ProxyEnable' and
        'ProxyServer' values are read from the current user's
        'Internet Settings' key.  An empty dictionary is returned if
        _winreg cannot be imported or proxying is disabled.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    # (a ';'-separated list of protocol=address entries)
                    for p in proxyServer.split(';'):
                        # An entry without '=' raises ValueError here,
                        # which ends the scan in the handler below.
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            # NOTE(review): not reached when one of the exceptions below
            # is raised, so the key is then not closed explicitly.
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # Nothing more to do: proxies keeps whatever entries were
            # stored before the error (none at all if the key itself
            # could not be read).
            pass
        return proxies
1469
1470    def getproxies():
1471        """Return a dictionary of scheme -> proxy server URL mappings.
1472
1473        Returns settings gathered from the environment, if specified,
1474        or the registry.
1475
1476        """
1477        return getproxies_environment() or getproxies_registry()
1478
    def proxy_bypass_registry(host):
        """Return 1 if the registry says to bypass the proxy for host, else 0.

        Reads 'ProxyEnable' and 'ProxyOverride' from the current user's
        'Internet Settings' registry key.  The host name (without port),
        its resolved address and its fully qualified name are matched
        case-insensitively against the ';'-separated override patterns,
        in which '*' and '?' act as wildcards.
        """
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        # From here on `host` is the list of names and addresses to test.
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                # NOTE(review): these lookups are not guarded, so a
                # resolver failure for the local host name propagates.
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            # Turn the glob pattern into a regular expression.
            # NOTE(review): re.match() anchors at the start only and other
            # regex metacharacters are not escaped, so a pattern also
            # matches any longer value that begins with it - confirm this
            # is intended.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0
1537
1538    def proxy_bypass(host):
1539        """Return a dictionary of scheme -> proxy server URL mappings.
1540
1541        Returns settings gathered from the environment, if specified,
1542        or the registry.
1543
1544        """
1545        if getproxies_environment():
1546            return proxy_bypass_environment(host)
1547        else:
1548            return proxy_bypass_registry(host)
1549
1550else:
    # By default use environment variables: on every platform not handled
    # by the branches above, getproxies and proxy_bypass are plain aliases
    # of the environment-based functions.
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
1554
1555# Test and time quote() and unquote()
1556def test1():
1557    s = ''
1558    for i in range(256): s = s + chr(i)
1559    s = s*4
1560    t0 = time.time()
1561    qs = quote(s)
1562    uqs = unquote(qs)
1563    t1 = time.time()
1564    if uqs != s:
1565        print 'Wrong!'
1566    print repr(s)
1567    print repr(qs)
1568    print repr(uqs)
1569    print round(t1 - t0, 3), 'sec'
1570
1571
1572def reporthook(blocknum, blocksize, totalsize):
1573    # Report during remote transfers
1574    print "Block number: %d, Block size: %d, Total size: %d" % (
1575        blocknum, blocksize, totalsize)
1576
1577# Test program
1578def test(args=[]):
1579    if not args:
1580        args = [
1581            '/etc/passwd',
1582            'file:/etc/passwd',
1583            'file://localhost/etc/passwd',
1584            'ftp://ftp.gnu.org/pub/README',
1585            'http://www.python.org/index.html',
1586            ]
1587        if hasattr(URLopener, "open_https"):
1588            args.append('https://synergy.as.cmu.edu/~geek/')
1589    try:
1590        for url in args:
1591            print '-'*10, url, '-'*10
1592            fn, h = urlretrieve(url, None, reporthook)
1593            print fn
1594            if h:
1595                print '======'
1596                for k in h.keys(): print k + ':', h[k]
1597                print '======'
1598            fp = open(fn, 'rb')
1599            data = fp.read()
1600            del fp
1601            if '\r' in data:
1602                table = string.maketrans("", "")
1603                data = data.translate(table, "\r")
1604            print data
1605            fn, h = None, None
1606        print '-'*40
1607    finally:
1608        urlcleanup()
1609
1610def main():
1611    import getopt, sys
1612    try:
1613        opts, args = getopt.getopt(sys.argv[1:], "th")
1614    except getopt.error, msg:
1615        print msg
1616        print "Use -h for help"
1617        return
1618    t = 0
1619    for o, a in opts:
1620        if o == '-t':
1621            t = t + 1
1622        if o == '-h':
1623            print "Usage: python urllib.py [-t] [url ...]"
1624            print "-t runs self-test;",
1625            print "otherwise, contents of urls are printed"
1626            return
1627    if t:
1628        if t > 1:
1629            test1()
1630        test(args)
1631    else:
1632        if not args:
1633            print "Use -h for help"
1634        for url in args:
1635            print urlopen(url).read(),
1636
# Run test program when run as a script: main() is called only when this
# file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()
1640