urllib.py revision 7a4e8379437017ada5d0e8e74c752cc6c4b5030f
1"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
18readlines(), fileno(), close() and info().  The read*(), fileno()
19and close() methods work like those of open files.
20The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
24
25import string
26import socket
27import os
28import time
29import sys
30from urlparse import urljoin as basejoin
31
32__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
33           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
34           "urlencode", "url2pathname", "pathname2url", "splittag",
35           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
36           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
37           "splitnport", "splitquery", "splitattr", "splitvalue",
38           "getproxies"]
39
40__version__ = '1.17'    # XXX This version is not always updated :-(
41
42MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
43
44# Helper for non-unix systems
# Pick the url2pathname()/pathname2url() pair for the running platform.
# Three platforms ship their own converter module; everything else uses
# the plain percent-(de)coding versions defined in the final branch.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Convert the path of a relative 'file' URL to a file system path.

        OS-specific helper (here: simply percent-decodes the argument);
        not recommended for general use.
        """
        local_path = unquote(pathname)
        return local_path

    def pathname2url(pathname):
        """Convert a file system path to the path of a relative 'file' URL.

        OS-specific helper (here: simply percent-encodes the argument);
        not recommended for general use.
        """
        url_path = quote(pathname)
        return url_path
61
62# This really consists of two pieces:
63# (1) a class which handles opening of all sorts of URLs
64#     (plus assorted utilities etc.)
65# (2) a set of functions for parsing URLs
66# XXX Should these be separated out into different modules?
67
68
69# Shortcut for basic usage
# Module-wide opener shared by urlopen(), urlretrieve() and urlcleanup();
# created lazily on first use.
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """Create a file-like object for the specified URL to read from.

    When proxies is given, a fresh FancyURLopener using those proxies
    serves this one call; otherwise the shared module-level opener is
    used (and created if it does not exist yet).  data, when not None,
    is forwarded to the opener's open() method.
    """
    global _urlopener
    from warnings import warnpy3k
    warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
             "favor of urllib2.urlopen()", stacklevel=2)

    if proxies is not None:
        # A custom proxy mapping never touches the shared opener.
        opener = FancyURLopener(proxies=proxies)
    else:
        if not _urlopener:
            _urlopener = FancyURLopener()
        opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url through the shared module-level opener.

    Creates the shared FancyURLopener on first use and returns whatever
    its retrieve() method returns.
    """
    global _urlopener
    if _urlopener:
        opener = _urlopener
    else:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the shared module-level opener, if one exists."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
97
98# check for SSL
# _have_ssl records whether the ssl module could be imported; the HTTPS
# opener method is only defined when it is true.  Any failure while
# importing (not just ImportError) counts as "no SSL".
try:
    import ssl
    _have_ssl = True
except:
    _have_ssl = False
105
106# exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    """IOError raised when fewer bytes arrive than Content-Length announced.

    The extra ``content`` attribute carries whatever the raiser passed as
    the second constructor argument.
    """

    def __init__(self, message, content):
        self.content = content
        IOError.__init__(self, message)
111
# Default FTP connection cache, shared by every URLopener instance.
ftpcache = {}
class URLopener:
    """Class to open URLs.

    A class rather than a plain function because more than one set of
    protocol-specific options may be needed at the same time.

    Note -- this is the base class for callers who do NOT want errors
    302 (relocated) and 401 (authorization needed) handled
    automatically."""

    # Class-level fallback so the attribute lookup in cleanup() succeeds
    # even on an instance whose __init__ did not finish
    # (presumably the reason for this default -- confirm).
    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        """Set up proxies, optional key/cert files and the caches.

        proxies   -- mapping of URL scheme to proxy URL; when None the
                     result of getproxies() is used.
        key_file, cert_file -- optional keyword arguments, stored for
                     the HTTPS opener.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.cert_file = x509.get('cert_file')
        self.key_file = x509.get('key_file')
        self.addheaders = [('User-Agent', self.version)]
        # Temporary files created by retrieve(); removed by cleanup().
        self.__tempfiles = []
        # Keep a private reference to os.unlink -- see cleanup().
        self.__unlink = os.unlink
        # Undocumented feature: assign {} to tempcache and it is used to
        # cache files fetched with self.retrieve().  Off by default
        # because it does not work for changing documents (there is no
        # logic to check expiration headers yet).
        self.tempcache = None
        # Undocumented feature: assign a different mapping to .ftpcache
        # to get logically independent URL openers.
        # XXX This is not threadsafe.  Bah.
        self.ftpcache = ftpcache
148
149    def __del__(self):
150        self.close()
151
152    def close(self):
153        self.cleanup()
154
155    def cleanup(self):
156        # This code sometimes runs when the rest of this module
157        # has already been deleted, so it can't use any globals
158        # or import anything.
159        if self.__tempfiles:
160            for file in self.__tempfiles:
161                try:
162                    self.__unlink(file)
163                except OSError:
164                    pass
165            del self.__tempfiles[:]
166        if self.tempcache:
167            self.tempcache.clear()
168
169    def addheader(self, *args):
170        """Add a header to be used by the HTTP interface only
171        e.g. u.addheader('Accept', 'sound/basic')"""
172        self.addheaders.append(args)
173
174    # External interface
175    def open(self, fullurl, data=None):
176        """Use URLopener().open(file) instead of open(file, 'r')."""
177        fullurl = unwrap(toBytes(fullurl))
178        # percent encode url. fixing lame server errors like space within url
179        # parts
180        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
181        if self.tempcache and fullurl in self.tempcache:
182            filename, headers = self.tempcache[fullurl]
183            fp = open(filename, 'rb')
184            return addinfourl(fp, headers, fullurl)
185        urltype, url = splittype(fullurl)
186        if not urltype:
187            urltype = 'file'
188        if urltype in self.proxies:
189            proxy = self.proxies[urltype]
190            urltype, proxyhost = splittype(proxy)
191            host, selector = splithost(proxyhost)
192            url = (host, fullurl) # Signal special case to open_*()
193        else:
194            proxy = None
195        name = 'open_' + urltype
196        self.type = urltype
197        name = name.replace('-', '_')
198        if not hasattr(self, name):
199            if proxy:
200                return self.open_unknown_proxy(proxy, fullurl, data)
201            else:
202                return self.open_unknown(fullurl, data)
203        try:
204            if data is None:
205                return getattr(self, name)(url)
206            else:
207                return getattr(self, name)(url, data)
208        except socket.error, msg:
209            raise IOError, ('socket error', msg), sys.exc_info()[2]
210
211    def open_unknown(self, fullurl, data=None):
212        """Overridable interface to open unknown URL type."""
213        type, url = splittype(fullurl)
214        raise IOError, ('url error', 'unknown url type', type)
215
216    def open_unknown_proxy(self, proxy, fullurl, data=None):
217        """Overridable interface to open unknown URL type."""
218        type, url = splittype(fullurl)
219        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
220
221    # External interface
222    def retrieve(self, url, filename=None, reporthook=None, data=None):
223        """retrieve(url) returns (filename, headers) for a local object
224        or (tempfilename, headers) for a remote object."""
225        url = unwrap(toBytes(url))
226        if self.tempcache and url in self.tempcache:
227            return self.tempcache[url]
228        type, url1 = splittype(url)
229        if filename is None and (not type or type == 'file'):
230            try:
231                fp = self.open_local_file(url1)
232                hdrs = fp.info()
233                del fp
234                return url2pathname(splithost(url1)[1]), hdrs
235            except IOError, msg:
236                pass
237        fp = self.open(url, data)
238        try:
239            headers = fp.info()
240            if filename:
241                tfp = open(filename, 'wb')
242            else:
243                import tempfile
244                garbage, path = splittype(url)
245                garbage, path = splithost(path or "")
246                path, garbage = splitquery(path or "")
247                path, garbage = splitattr(path or "")
248                suffix = os.path.splitext(path)[1]
249                (fd, filename) = tempfile.mkstemp(suffix)
250                self.__tempfiles.append(filename)
251                tfp = os.fdopen(fd, 'wb')
252            try:
253                result = filename, headers
254                if self.tempcache is not None:
255                    self.tempcache[url] = result
256                bs = 1024*8
257                size = -1
258                read = 0
259                blocknum = 0
260                if reporthook:
261                    if "content-length" in headers:
262                        size = int(headers["Content-Length"])
263                    reporthook(blocknum, bs, size)
264                while 1:
265                    block = fp.read(bs)
266                    if block == "":
267                        break
268                    read += len(block)
269                    tfp.write(block)
270                    blocknum += 1
271                    if reporthook:
272                        reporthook(blocknum, bs, size)
273            finally:
274                tfp.close()
275        finally:
276            fp.close()
277        del fp
278        del tfp
279
280        # raise exception if actual size does not match content-length header
281        if size >= 0 and read < size:
282            raise ContentTooShortError("retrieval incomplete: got only %i out "
283                                       "of %i bytes" % (read, size), result)
284
285        return result
286
287    # Each method named open_<type> knows how to open that type of URL
288
289    def open_http(self, url, data=None):
290        """Use HTTP protocol."""
291        import httplib
292        user_passwd = None
293        proxy_passwd= None
294        if isinstance(url, str):
295            host, selector = splithost(url)
296            if host:
297                user_passwd, host = splituser(host)
298                host = unquote(host)
299            realhost = host
300        else:
301            host, selector = url
302            # check whether the proxy contains authorization information
303            proxy_passwd, host = splituser(host)
304            # now we proceed with the url we want to obtain
305            urltype, rest = splittype(selector)
306            url = rest
307            user_passwd = None
308            if urltype.lower() != 'http':
309                realhost = None
310            else:
311                realhost, rest = splithost(rest)
312                if realhost:
313                    user_passwd, realhost = splituser(realhost)
314                if user_passwd:
315                    selector = "%s://%s%s" % (urltype, realhost, rest)
316                if proxy_bypass(realhost):
317                    host = realhost
318
319            #print "proxy via http:", host, selector
320        if not host: raise IOError, ('http error', 'no host given')
321
322        if proxy_passwd:
323            import base64
324            proxy_auth = base64.b64encode(proxy_passwd).strip()
325        else:
326            proxy_auth = None
327
328        if user_passwd:
329            import base64
330            auth = base64.b64encode(user_passwd).strip()
331        else:
332            auth = None
333        h = httplib.HTTP(host)
334        if data is not None:
335            h.putrequest('POST', selector)
336            h.putheader('Content-Type', 'application/x-www-form-urlencoded')
337            h.putheader('Content-Length', '%d' % len(data))
338        else:
339            h.putrequest('GET', selector)
340        if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
341        if auth: h.putheader('Authorization', 'Basic %s' % auth)
342        if realhost: h.putheader('Host', realhost)
343        for args in self.addheaders: h.putheader(*args)
344        h.endheaders()
345        if data is not None:
346            h.send(data)
347        errcode, errmsg, headers = h.getreply()
348        fp = h.getfile()
349        if errcode == -1:
350            if fp: fp.close()
351            # something went wrong with the HTTP status line
352            raise IOError, ('http protocol error', 0,
353                            'got a bad status line', None)
354        # According to RFC 2616, "2xx" code indicates that the client's
355        # request was successfully received, understood, and accepted.
356        if (200 <= errcode < 300):
357            return addinfourl(fp, headers, "http:" + url, errcode)
358        else:
359            if data is None:
360                return self.http_error(url, fp, errcode, errmsg, headers)
361            else:
362                return self.http_error(url, fp, errcode, errmsg, headers, data)
363
364    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
365        """Handle http errors.
366        Derived class can override this, or provide specific handlers
367        named http_error_DDD where DDD is the 3-digit error code."""
368        # First check if there's a specific handler for this error
369        name = 'http_error_%d' % errcode
370        if hasattr(self, name):
371            method = getattr(self, name)
372            if data is None:
373                result = method(url, fp, errcode, errmsg, headers)
374            else:
375                result = method(url, fp, errcode, errmsg, headers, data)
376            if result: return result
377        return self.http_error_default(url, fp, errcode, errmsg, headers)
378
379    def http_error_default(self, url, fp, errcode, errmsg, headers):
380        """Default error handler: close the connection and raise IOError."""
381        void = fp.read()
382        fp.close()
383        raise IOError, ('http error', errcode, errmsg, headers)
384
385    if _have_ssl:
386        def open_https(self, url, data=None):
387            """Use HTTPS protocol."""
388
389            import httplib
390            user_passwd = None
391            proxy_passwd = None
392            if isinstance(url, str):
393                host, selector = splithost(url)
394                if host:
395                    user_passwd, host = splituser(host)
396                    host = unquote(host)
397                realhost = host
398            else:
399                host, selector = url
400                # here, we determine, whether the proxy contains authorization information
401                proxy_passwd, host = splituser(host)
402                urltype, rest = splittype(selector)
403                url = rest
404                user_passwd = None
405                if urltype.lower() != 'https':
406                    realhost = None
407                else:
408                    realhost, rest = splithost(rest)
409                    if realhost:
410                        user_passwd, realhost = splituser(realhost)
411                    if user_passwd:
412                        selector = "%s://%s%s" % (urltype, realhost, rest)
413                #print "proxy via https:", host, selector
414            if not host: raise IOError, ('https error', 'no host given')
415            if proxy_passwd:
416                import base64
417                proxy_auth = base64.b64encode(proxy_passwd).strip()
418            else:
419                proxy_auth = None
420            if user_passwd:
421                import base64
422                auth = base64.b64encode(user_passwd).strip()
423            else:
424                auth = None
425            h = httplib.HTTPS(host, 0,
426                              key_file=self.key_file,
427                              cert_file=self.cert_file)
428            if data is not None:
429                h.putrequest('POST', selector)
430                h.putheader('Content-Type',
431                            'application/x-www-form-urlencoded')
432                h.putheader('Content-Length', '%d' % len(data))
433            else:
434                h.putrequest('GET', selector)
435            if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
436            if auth: h.putheader('Authorization', 'Basic %s' % auth)
437            if realhost: h.putheader('Host', realhost)
438            for args in self.addheaders: h.putheader(*args)
439            h.endheaders()
440            if data is not None:
441                h.send(data)
442            errcode, errmsg, headers = h.getreply()
443            fp = h.getfile()
444            if errcode == -1:
445                if fp: fp.close()
446                # something went wrong with the HTTP status line
447                raise IOError, ('http protocol error', 0,
448                                'got a bad status line', None)
449            # According to RFC 2616, "2xx" code indicates that the client's
450            # request was successfully received, understood, and accepted.
451            if (200 <= errcode < 300):
452                return addinfourl(fp, headers, "https:" + url, errcode)
453            else:
454                if data is None:
455                    return self.http_error(url, fp, errcode, errmsg, headers)
456                else:
457                    return self.http_error(url, fp, errcode, errmsg, headers,
458                                           data)
459
460    def open_file(self, url):
461        """Use local file or FTP depending on form of URL."""
462        if not isinstance(url, str):
463            raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
464        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
465            return self.open_ftp(url)
466        else:
467            return self.open_local_file(url)
468
469    def open_local_file(self, url):
470        """Use local file."""
471        import mimetypes, mimetools, email.utils
472        try:
473            from cStringIO import StringIO
474        except ImportError:
475            from StringIO import StringIO
476        host, file = splithost(url)
477        localname = url2pathname(file)
478        try:
479            stats = os.stat(localname)
480        except OSError, e:
481            raise IOError(e.errno, e.strerror, e.filename)
482        size = stats.st_size
483        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
484        mtype = mimetypes.guess_type(url)[0]
485        headers = mimetools.Message(StringIO(
486            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
487            (mtype or 'text/plain', size, modified)))
488        if not host:
489            urlfile = file
490            if file[:1] == '/':
491                urlfile = 'file://' + file
492            return addinfourl(open(localname, 'rb'),
493                              headers, urlfile)
494        host, port = splitport(host)
495        if not port \
496           and socket.gethostbyname(host) in (localhost(), thishost()):
497            urlfile = file
498            if file[:1] == '/':
499                urlfile = 'file://' + file
500            return addinfourl(open(localname, 'rb'),
501                              headers, urlfile)
502        raise IOError, ('local file error', 'not on local host')
503
504    def open_ftp(self, url):
505        """Use FTP protocol."""
506        if not isinstance(url, str):
507            raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
508        import mimetypes, mimetools
509        try:
510            from cStringIO import StringIO
511        except ImportError:
512            from StringIO import StringIO
513        host, path = splithost(url)
514        if not host: raise IOError, ('ftp error', 'no host given')
515        host, port = splitport(host)
516        user, host = splituser(host)
517        if user: user, passwd = splitpasswd(user)
518        else: passwd = None
519        host = unquote(host)
520        user = unquote(user or '')
521        passwd = unquote(passwd or '')
522        host = socket.gethostbyname(host)
523        if not port:
524            import ftplib
525            port = ftplib.FTP_PORT
526        else:
527            port = int(port)
528        path, attrs = splitattr(path)
529        path = unquote(path)
530        dirs = path.split('/')
531        dirs, file = dirs[:-1], dirs[-1]
532        if dirs and not dirs[0]: dirs = dirs[1:]
533        if dirs and not dirs[0]: dirs[0] = '/'
534        key = user, host, port, '/'.join(dirs)
535        # XXX thread unsafe!
536        if len(self.ftpcache) > MAXFTPCACHE:
537            # Prune the cache, rather arbitrarily
538            for k in self.ftpcache.keys():
539                if k != key:
540                    v = self.ftpcache[k]
541                    del self.ftpcache[k]
542                    v.close()
543        try:
544            if not key in self.ftpcache:
545                self.ftpcache[key] = \
546                    ftpwrapper(user, passwd, host, port, dirs)
547            if not file: type = 'D'
548            else: type = 'I'
549            for attr in attrs:
550                attr, value = splitvalue(attr)
551                if attr.lower() == 'type' and \
552                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
553                    type = value.upper()
554            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
555            mtype = mimetypes.guess_type("ftp:" + url)[0]
556            headers = ""
557            if mtype:
558                headers += "Content-Type: %s\n" % mtype
559            if retrlen is not None and retrlen >= 0:
560                headers += "Content-Length: %d\n" % retrlen
561            headers = mimetools.Message(StringIO(headers))
562            return addinfourl(fp, headers, "ftp:" + url)
563        except ftperrors(), msg:
564            raise IOError, ('ftp error', msg), sys.exc_info()[2]
565
566    def open_data(self, url, data=None):
567        """Use "data" URL."""
568        if not isinstance(url, str):
569            raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
570        # ignore POSTed data
571        #
572        # syntax of data URLs:
573        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
574        # mediatype := [ type "/" subtype ] *( ";" parameter )
575        # data      := *urlchar
576        # parameter := attribute "=" value
577        import mimetools
578        try:
579            from cStringIO import StringIO
580        except ImportError:
581            from StringIO import StringIO
582        try:
583            [type, data] = url.split(',', 1)
584        except ValueError:
585            raise IOError, ('data error', 'bad data URL')
586        if not type:
587            type = 'text/plain;charset=US-ASCII'
588        semi = type.rfind(';')
589        if semi >= 0 and '=' not in type[semi:]:
590            encoding = type[semi+1:]
591            type = type[:semi]
592        else:
593            encoding = ''
594        msg = []
595        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
596                                            time.gmtime(time.time())))
597        msg.append('Content-type: %s' % type)
598        if encoding == 'base64':
599            import base64
600            data = base64.decodestring(data)
601        else:
602            data = unquote(data)
603        msg.append('Content-Length: %d' % len(data))
604        msg.append('')
605        msg.append(data)
606        msg = '\n'.join(msg)
607        f = StringIO(msg)
608        headers = mimetools.Message(f, 0)
609        #f.fileno = None     # needed for addinfourl
610        return addinfourl(f, headers, url)
611
612
class FancyURLopener(URLopener):
    """URLopener subclass with handlers for the errors it can (perhaps)
    deal with itself, such as redirects and basic authentication."""

    def __init__(self, *args, **kwargs):
        """Initialize the base opener plus redirect/auth bookkeeping."""
        URLopener.__init__(self, *args, **kwargs)
        # Redirect counter and its limit; see http_error_302().
        self.maxtries = 10
        self.tries = 0
        # Credentials remembered by get_user_passwd().
        self.auth_cache = {}
620        self.maxtries = 10
621
622    def http_error_default(self, url, fp, errcode, errmsg, headers):
623        """Default error handling -- don't raise an exception."""
624        return addinfourl(fp, headers, "http:" + url, errcode)
625
626    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
627        """Error 302 -- relocated (temporarily)."""
628        self.tries += 1
629        if self.maxtries and self.tries >= self.maxtries:
630            if hasattr(self, "http_error_500"):
631                meth = self.http_error_500
632            else:
633                meth = self.http_error_default
634            self.tries = 0
635            return meth(url, fp, 500,
636                        "Internal Server Error: Redirect Recursion", headers)
637        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
638                                        data)
639        self.tries = 0
640        return result
641
642    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
643        if 'location' in headers:
644            newurl = headers['location']
645        elif 'uri' in headers:
646            newurl = headers['uri']
647        else:
648            return
649        void = fp.read()
650        fp.close()
651        # In case the server sent a relative URL, join with original:
652        newurl = basejoin(self.type + ":" + url, newurl)
653        return self.open(newurl)
654
655    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
656        """Error 301 -- also relocated (permanently)."""
657        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
658
659    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
660        """Error 303 -- also relocated (essentially identical to 302)."""
661        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
662
663    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
664        """Error 307 -- relocated, but turn POST into error."""
665        if data is None:
666            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
667        else:
668            return self.http_error_default(url, fp, errcode, errmsg, headers)
669
670    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
671        """Error 401 -- authentication required.
672        This function supports Basic authentication only."""
673        if not 'www-authenticate' in headers:
674            URLopener.http_error_default(self, url, fp,
675                                         errcode, errmsg, headers)
676        stuff = headers['www-authenticate']
677        import re
678        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
679        if not match:
680            URLopener.http_error_default(self, url, fp,
681                                         errcode, errmsg, headers)
682        scheme, realm = match.groups()
683        if scheme.lower() != 'basic':
684            URLopener.http_error_default(self, url, fp,
685                                         errcode, errmsg, headers)
686        name = 'retry_' + self.type + '_basic_auth'
687        if data is None:
688            return getattr(self,name)(url, realm)
689        else:
690            return getattr(self,name)(url, realm, data)
691
692    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
693        """Error 407 -- proxy authentication required.
694        This function supports Basic authentication only."""
695        if not 'proxy-authenticate' in headers:
696            URLopener.http_error_default(self, url, fp,
697                                         errcode, errmsg, headers)
698        stuff = headers['proxy-authenticate']
699        import re
700        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
701        if not match:
702            URLopener.http_error_default(self, url, fp,
703                                         errcode, errmsg, headers)
704        scheme, realm = match.groups()
705        if scheme.lower() != 'basic':
706            URLopener.http_error_default(self, url, fp,
707                                         errcode, errmsg, headers)
708        name = 'retry_proxy_' + self.type + '_basic_auth'
709        if data is None:
710            return getattr(self,name)(url, realm)
711        else:
712            return getattr(self,name)(url, realm, data)
713
714    def retry_proxy_http_basic_auth(self, url, realm, data=None):
715        host, selector = splithost(url)
716        newurl = 'http://' + host + selector
717        proxy = self.proxies['http']
718        urltype, proxyhost = splittype(proxy)
719        proxyhost, proxyselector = splithost(proxyhost)
720        i = proxyhost.find('@') + 1
721        proxyhost = proxyhost[i:]
722        user, passwd = self.get_user_passwd(proxyhost, realm, i)
723        if not (user or passwd): return None
724        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
725        self.proxies['http'] = 'http://' + proxyhost + proxyselector
726        if data is None:
727            return self.open(newurl)
728        else:
729            return self.open(newurl, data)
730
731    def retry_proxy_https_basic_auth(self, url, realm, data=None):
732        host, selector = splithost(url)
733        newurl = 'https://' + host + selector
734        proxy = self.proxies['https']
735        urltype, proxyhost = splittype(proxy)
736        proxyhost, proxyselector = splithost(proxyhost)
737        i = proxyhost.find('@') + 1
738        proxyhost = proxyhost[i:]
739        user, passwd = self.get_user_passwd(proxyhost, realm, i)
740        if not (user or passwd): return None
741        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
742        self.proxies['https'] = 'https://' + proxyhost + proxyselector
743        if data is None:
744            return self.open(newurl)
745        else:
746            return self.open(newurl, data)
747
748    def retry_http_basic_auth(self, url, realm, data=None):
749        host, selector = splithost(url)
750        i = host.find('@') + 1
751        host = host[i:]
752        user, passwd = self.get_user_passwd(host, realm, i)
753        if not (user or passwd): return None
754        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
755        newurl = 'http://' + host + selector
756        if data is None:
757            return self.open(newurl)
758        else:
759            return self.open(newurl, data)
760
761    def retry_https_basic_auth(self, url, realm, data=None):
762        host, selector = splithost(url)
763        i = host.find('@') + 1
764        host = host[i:]
765        user, passwd = self.get_user_passwd(host, realm, i)
766        if not (user or passwd): return None
767        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
768        newurl = 'https://' + host + selector
769        if data is None:
770            return self.open(newurl)
771        else:
772            return self.open(newurl, data)
773
774    def get_user_passwd(self, host, realm, clear_cache = 0):
775        key = realm + '@' + host.lower()
776        if key in self.auth_cache:
777            if clear_cache:
778                del self.auth_cache[key]
779            else:
780                return self.auth_cache[key]
781        user, passwd = self.prompt_user_passwd(host, realm)
782        if user or passwd: self.auth_cache[key] = (user, passwd)
783        return user, passwd
784
785    def prompt_user_passwd(self, host, realm):
786        """Override this in a GUI environment!"""
787        import getpass
788        try:
789            user = raw_input("Enter username for %s at %s: " % (realm,
790                                                                host))
791            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
792                (user, realm, host))
793            return user, passwd
794        except KeyboardInterrupt:
795            print
796            return None, None
797
798
799# Utility functions
800
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The name is resolved on the first call only; later calls return the
    value remembered in the module-level _localhost.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
808
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The address of socket.gethostname() is resolved on the first call
    only; later calls return the value remembered in _thishost.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
816
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily on the first call and its all_errors
    tuple is remembered in _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
825
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The object is built from an empty StringIO on the first call and
    shared by all later calls.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    import mimetools
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    _noheaders = mimetools.Message(StringIO(), 0)
    _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
839
840
841# Utility classes
842
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    An instance owns one ftplib.FTP session that is connected, logged in
    and moved into the directories listed in dirs.  retrfile() starts a
    transfer on that session; endtransfer() and close() finish it.
    """

    def __init__(self, user, passwd, host, port, dirs,
                 timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Remember the connection parameters and connect via init()."""
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """Open a fresh FTP session: connect, log in, cwd through dirs."""
        import ftplib
        # busy is set while a transfer started by retrfile() is pending.
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving file and return (file object, length).

        type is appended to the FTP 'TYPE' command; 'd' or 'D' requests
        a directory listing.  If RETR is refused with a 550 reply, or no
        file name is given, a LIST is issued instead.  Other permanent
        FTP errors are re-raised as IOError('ftp error', reason).

        The returned file object is wrapped in addclosehook so that
        closing it calls endtransfer(); the second item is whatever
        ntransfercmd() reported as the second element of its result.
        """
        import ftplib
        # Finish any transfer still pending on this connection.
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The command failed (presumably a stale cached connection);
            # reconnect once and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                # A 550 reply leaves conn unset so that the directory
                # listing below is attempted; anything else is fatal.
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm, reason:
                        raise IOError, ('ftp error', reason), sys.exc_info()[2]
                finally:
                    # Always return to the directory we started from.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])
    def endtransfer(self):
        """Consume the end-of-transfer reply if a transfer is pending.

        FTP errors raised while reading the reply are ignored.
        """
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """Finish any pending transfer and close the FTP session.

        FTP errors raised while closing are ignored.
        """
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
920
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object and re-exports its reading methods as
    attributes of the wrapper instance.
    """

    def __init__(self, fp):
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # The remaining methods are optional on the wrapped object.
        if hasattr(fp, "readlines"):
            self.readlines = fp.readlines
        if hasattr(fp, "fileno"):
            self.fileno = fp.fileno
        else:
            # No descriptor available: answer None instead of failing.
            self.fileno = lambda: None
        if hasattr(fp, "__iter__"):
            self.__iter__ = fp.__iter__
            if hasattr(fp, "next"):
                self.next = fp.next

    def __repr__(self):
        details = (self.__class__.__name__, id(self), self.fp)
        return '<%s at %r whose fp = %r>' % details

    def close(self):
        """Drop the re-exported methods and close the wrapped file."""
        for attr in ('read', 'readline', 'readlines', 'fileno'):
            setattr(self, attr, None)
        wrapped = self.fp
        if wrapped:
            wrapped.close()
        self.fp = None
949
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    closehook(*hookargs) is called once, the first time close() runs,
    after the wrapped file has been closed.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.hookargs = hookargs
        self.closehook = closehook

    def close(self):
        addbase.close(self)
        hook = self.closehook
        if not hook:
            return
        hook(*self.hookargs)
        # Forget the hook so that a second close() does not call it again.
        self.closehook = None
        self.hookargs = None
964
class addinfo(addbase):
    """class to add an info() method to an open file.

    The headers object given to the constructor is kept as
    self.headers and handed back unchanged by info().
    """

    def __init__(self, fp, headers):
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object passed to the constructor."""
        headers = self.headers
        return headers
974
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file.

    Besides the wrapped file it carries the headers, the URL and an
    optional status code, each exposed through a small accessor.
    """

    def __init__(self, fp, headers, url, code=None):
        self.code = code
        self.url = url
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object passed to the constructor."""
        headers = self.headers
        return headers

    def getcode(self):
        """Return the code passed to the constructor (None by default)."""
        code = self.code
        return code

    def geturl(self):
        """Return the URL passed to the constructor."""
        url = self.url
        return url
992
993
994# Utilities to parse URLs (most of these return None for missing parts):
995# unwrap('<URL:type://host/path>') --> 'type://host/path'
996# splittype('type:opaquestring') --> 'type', 'opaquestring'
997# splithost('//host[:port]/path') --> 'host[:port]', '/path'
998# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
999# splitpasswd('user:passwd') -> 'user', 'passwd'
1000# splitport('host:port') --> 'host', 'port'
1001# splitquery('/path?query') --> '/path', 'query'
1002# splittag('/path#tag') --> '/path', 'tag'
1003# splitattr('/path;attr1=value1;attr2=value2;...') ->
1004#   '/path', ['attr1=value1', 'attr2=value2', ...]
1005# splitvalue('attr=value') --> 'attr', 'value'
1006# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
1008
1009try:
1010    unicode
1011except NameError:
1012    def _is_unicode(x):
1013        return 0
1014else:
1015    def _is_unicode(x):
1016        return isinstance(x, unicode)
1017
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    Unicode URLs are encoded as ASCII; anything else is returned
    untouched.  A unicode URL with non-ASCII characters raises
    UnicodeError.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if not _is_unicode(url):
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
1029
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of angle brackets and a leading
    'URL:' marker are removed, in that order.
    """
    stripped = url.strip()
    if stripped.startswith('<') and stripped.endswith('>'):
        stripped = stripped[1:-1].strip()
    if stripped.startswith('URL:'):
        stripped = stripped[4:].strip()
    return stripped
1037
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case; it is None when the URL does
    not start with a 'type:' prefix.
    """
    global _typeprog
    if _typeprog is None:
        # Compile lazily so that importing this module stays cheap.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme = found.group(1)
    # The match covers exactly 'scheme:', so its end is where the rest
    # of the URL begins.
    return scheme.lower(), url[found.end():]
1051
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compile lazily so that importing this module stays cheap.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
1063
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Both parts are passed through
    unquote() and returned as a list; without an '@' the result is the
    tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # Compile lazily so that importing this module stays cheap.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if not found:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
1075
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password is None when there
    is no ':' at all.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile lazily so that importing this module stays cheap.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
1087
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits; it is None when host
    does not end in ':' followed by at least one digit.
    """
    global _portprog
    if _portprog is None:
        # Compile lazily so that importing this module stays cheap.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
1100
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compile lazily so that importing this module stays cheap.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    host, port = found.group(1, 2)
    # An empty port has no digits and is treated like any other
    # non-numeric port.
    if not port:
        return host, None
    try:
        return host, int(port)
    except ValueError:
        return host, None
1122
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; the query is None when the URL
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compile lazily so that importing this module stays cheap.
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
1134
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; the tag is None when the URL
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compile lazily so that importing this module stays cheap.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
1146
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    pieces = url.split(';')
    # The first piece is the path; whatever is left are the attributes.
    path = pieces.pop(0)
    return path, pieces
1152
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; the value is None when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compile lazily so that importing this module stays cheap.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
1164
_hexdig = '0123456789ABCDEFabcdef'
# Maps every two-character hex string (either letter case) to the
# character it encodes.
_hextochr = dict((hi + lo, chr(int(hi + lo, 16)))
                 for hi in _hexdig for lo in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hex digits is left in place.
    """
    pieces = s.split('%')
    # Everything before the first '%' is literal text.
    decoded = [pieces[0]]
    for item in pieces[1:]:
        try:
            decoded.append(_hextochr[item[:2]] + item[2:])
        except KeyError:
            # Not a valid escape: put the '%' back.
            decoded.append('%' + item)
        except UnicodeDecodeError:
            # A non-ASCII byte could not be joined to unicode text;
            # use the equivalent unicode character instead.
            decoded.append(unichr(int(item[:2], 16)) + item[2:])
    return "".join(decoded)
1180
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    # Turn '+' into a space first, so that an escaped '+' ('%2B')
    # survives as a literal plus sign.
    return unquote(s.replace('+', ' '))
1185
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789'
               '_.-')
# Cache of translation tables, keyed by (safe, always_safe).
_safemaps = {}

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Replace every character of s that is neither in always_safe nor in
    safe by its '%XX' escape.

    Which characters are reserved depends on the URL component being
    quoted.  RFC 2396 (Uniform Resource Identifiers: Generic Syntax)
    lists these reserved characters:

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some component, but not in all.  The
    default value of safe suits the path component: '/' is left alone
    because in a path the existing slashes are normally meant as
    separators.
    """
    cachekey = (safe, always_safe)
    table = _safemaps.get(cachekey)
    if table is None:
        # First use of this safe set: build a table for all 256 chars.
        unreserved = safe + always_safe
        table = {}
        for code in range(256):
            ch = chr(code)
            if ch in unreserved:
                table[ch] = ch
            else:
                table[ch] = '%%%02X' % code
        _safemaps[cachekey] = table
    return ''.join([table[ch] for ch in s])
1224
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' not in s:
        return quote(s, safe)
    # Keep spaces unescaped while quoting, then swap them for '+'.
    return quote(s, safe + ' ').replace(' ', '+')
1231
1232def urlencode(query,doseq=0):
1233    """Encode a sequence of two-element tuples or dictionary into a URL query string.
1234
1235    If any values in the query arg are sequences and doseq is true, each
1236    sequence element is converted to a separate parameter.
1237
1238    If the query arg is a sequence of two-element tuples, the order of the
1239    parameters in the output will match the order of parameters in the
1240    input.
1241    """
1242
1243    if hasattr(query,"items"):
1244        # mapping objects
1245        query = query.items()
1246    else:
1247        # it's a bother at times that strings and string-like objects are
1248        # sequences...
1249        try:
1250            # non-sequence items should not work with len()
1251            # non-empty strings will fail this
1252            if len(query) and not isinstance(query[0], tuple):
1253                raise TypeError
1254            # zero-length sequences of all types will get here and succeed,
1255            # but that's a minor nit - since the original implementation
1256            # allowed empty dicts that type of behavior probably should be
1257            # preserved for consistency
1258        except TypeError:
1259            ty,va,tb = sys.exc_info()
1260            raise TypeError, "not a valid non-string sequence or mapping object", tb
1261
1262    l = []
1263    if not doseq:
1264        # preserve old behavior
1265        for k, v in query:
1266            k = quote_plus(str(k))
1267            v = quote_plus(str(v))
1268            l.append(k + '=' + v)
1269    else:
1270        for k, v in query:
1271            k = quote_plus(str(k))
1272            if isinstance(v, str):
1273                v = quote_plus(v)
1274                l.append(k + '=' + v)
1275            elif _is_unicode(v):
1276                # is there a reasonable way to convert to ASCII?
1277                # encode generates a string, but "replace" or "ignore"
1278                # lose information and "strict" can raise UnicodeError
1279                v = quote_plus(v.encode("ASCII","replace"))
1280                l.append(k + '=' + v)
1281            else:
1282                try:
1283                    # is this a sufficient test for sequence-ness?
1284                    x = len(v)
1285                except TypeError:
1286                    # not a sequence
1287                    v = quote_plus(str(v))
1288                    l.append(k + '=' + v)
1289                else:
1290                    # loop over the sequence
1291                    for elt in v:
1292                        l.append(k + '=' + quote_plus(str(elt)))
1293    return '&'.join(l)
1294
1295# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Every non-empty environment variable named <scheme>_proxy (in any
    letter case) contributes one entry, keyed by the lower-cased
    scheme.  If you need a different way, you can pass a proxies
    dictionary to the [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    found = {}
    for variable, setting in os.environ.items():
        if not setting:
            continue
        variable = variable.lower()
        if variable.endswith(suffix):
            found[variable[:-len(suffix)]] = setting
    return found
1311
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy (or NO_PROXY),
    which should be a list of DNS suffixes separated by commas, or '*'
    for all hosts.  Whitespace around the individual suffixes is
    ignored, so 'localhost, .example.com' works as expected.

    host may carry a ':port' suffix; a suffix matches if either the
    bare host name or the full 'host:port' string ends with it.

    Returns 1 if the proxy should be bypassed, 0 otherwise.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        # Lists are commonly written with a space after each comma;
        # without stripping, such entries could never match.
        name = name.strip()
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
1330
1331
1332if sys.platform == 'darwin':
1333    from _scproxy import _get_proxy_settings, _get_proxies
1334
1335    def proxy_bypass_macosx_sysconf(host):
1336        """
1337        Return True iff this host shouldn't be accessed using a proxy
1338
1339        This function uses the MacOSX framework SystemConfiguration
1340        to fetch the proxy information.
1341        """
1342        import re
1343        import socket
1344        from fnmatch import fnmatch
1345
1346        hostonly, port = splitport(host)
1347
1348        def ip2num(ipAddr):
1349            parts = ipAddr.split('.')
1350            parts = map(int, parts)
1351            if len(parts) != 4:
1352                parts = (parts + [0, 0, 0, 0])[:4]
1353            return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
1354
1355        proxy_settings = _get_proxy_settings()
1356
1357        # Check for simple host names:
1358        if '.' not in host:
1359            if proxy_settings['exclude_simple']:
1360                return True
1361
1362        hostIP = None
1363
1364        for value in proxy_settings.get('exceptions', ()):
1365            # Items in the list are strings like these: *.local, 169.254/16
1366            if not value: continue
1367
1368            m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
1369            if m is not None:
1370                if hostIP is None:
1371                    try:
1372                        hostIP = socket.gethostbyname(hostonly)
1373                        hostIP = ip2num(hostIP)
1374                    except socket.error:
1375                        continue
1376
1377                base = ip2num(m.group(1))
1378                mask = m.group(2)
1379                if mask is None:
1380                    mask = 8 * (m.group(1).count('.') + 1)
1381
1382                else:
1383                    mask = int(mask[1:])
1384                    mask = 32 - mask
1385
1386                if (hostIP >> mask) == (base >> mask):
1387                    return True
1388
1389            elif fnmatch(host, value):
1390                return True
1391
1392        return False
1393
1394
1395    def getproxies_macosx_sysconf():
1396        """Return a dictionary of scheme -> proxy server URL mappings.
1397
1398        This function uses the MacOSX framework SystemConfiguration
1399        to fetch the proxy information.
1400        """
1401        return _get_proxies()
1402
1403
1404
1405    def proxy_bypass(host):
1406        if getproxies_environment():
1407            return proxy_bypass_environment(host)
1408        else:
1409            return proxy_bypass_macosx_sysconf(host)
1410
1411    def getproxies():
1412        return getproxies_environment() or getproxies_macosx_sysconf()
1413
1414elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        The 'ProxyEnable' and 'ProxyServer' values of the current user's
        Internet Settings key are read.  An empty dictionary is returned
        when _winreg cannot be imported, when the proxy is disabled, or
        when the registry cannot be read or holds an unexpected value.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    # ('protocol=address' items separated by ';')
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            # No scheme given: use the protocol name.
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        # Already a complete http URL: only the http
                        # entry is filled in.
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies
1460
1461    def getproxies():
1462        """Return a dictionary of scheme -> proxy server URL mappings.
1463
1464        Returns settings gathered from the environment, if specified,
1465        or the registry.
1466
1467        """
1468        return getproxies_environment() or getproxies_registry()
1469
    def proxy_bypass_registry(host):
        """Return 1 if the registry says host should bypass the proxy.

        The 'ProxyOverride' value of the current user's Internet
        Settings key is a ';'-separated list of glob patterns.  host
        (with any ':port' removed), its resolved address and its fully
        qualified name are each tested against every pattern.

        Returns 0 when _winreg or re cannot be imported, when the
        registry cannot be read, when the proxy is disabled, when no
        override list is set, or when nothing matches.
        """
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                # Splice four concrete names in place of the marker;
                # the loop then continues over the inserted entries.
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            # Translate the glob pattern into a regular expression.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                # re.match anchors at the start only; matching is
                # case-insensitive.
                if re.match(test, val, re.I):
                    return 1
        return 0
1528
1529    def proxy_bypass(host):
1530        """Return a dictionary of scheme -> proxy server URL mappings.
1531
1532        Returns settings gathered from the environment, if specified,
1533        or the registry.
1534
1535        """
1536        if getproxies_environment():
1537            return proxy_bypass_environment(host)
1538        else:
1539            return proxy_bypass_registry(host)
1540
1541else:
    # By default use environment variables: on every other platform both
    # the proxy lookup and the bypass test are the environment-based
    # functions.
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
1545
1546# Test and time quote() and unquote()
1547def test1():
1548    s = ''
1549    for i in range(256): s = s + chr(i)
1550    s = s*4
1551    t0 = time.time()
1552    qs = quote(s)
1553    uqs = unquote(qs)
1554    t1 = time.time()
1555    if uqs != s:
1556        print 'Wrong!'
1557    print repr(s)
1558    print repr(qs)
1559    print repr(uqs)
1560    print round(t1 - t0, 3), 'sec'
1561
1562
1563def reporthook(blocknum, blocksize, totalsize):
1564    # Report during remote transfers
1565    print "Block number: %d, Block size: %d, Total size: %d" % (
1566        blocknum, blocksize, totalsize)
1567
1568# Test program
1569def test(args=[]):
1570    if not args:
1571        args = [
1572            '/etc/passwd',
1573            'file:/etc/passwd',
1574            'file://localhost/etc/passwd',
1575            'ftp://ftp.gnu.org/pub/README',
1576            'http://www.python.org/index.html',
1577            ]
1578        if hasattr(URLopener, "open_https"):
1579            args.append('https://synergy.as.cmu.edu/~geek/')
1580    try:
1581        for url in args:
1582            print '-'*10, url, '-'*10
1583            fn, h = urlretrieve(url, None, reporthook)
1584            print fn
1585            if h:
1586                print '======'
1587                for k in h.keys(): print k + ':', h[k]
1588                print '======'
1589            fp = open(fn, 'rb')
1590            data = fp.read()
1591            del fp
1592            if '\r' in data:
1593                table = string.maketrans("", "")
1594                data = data.translate(table, "\r")
1595            print data
1596            fn, h = None, None
1597        print '-'*40
1598    finally:
1599        urlcleanup()
1600
1601def main():
1602    import getopt, sys
1603    try:
1604        opts, args = getopt.getopt(sys.argv[1:], "th")
1605    except getopt.error, msg:
1606        print msg
1607        print "Use -h for help"
1608        return
1609    t = 0
1610    for o, a in opts:
1611        if o == '-t':
1612            t = t + 1
1613        if o == '-h':
1614            print "Usage: python urllib.py [-t] [url ...]"
1615            print "-t runs self-test;",
1616            print "otherwise, contents of urls are printed"
1617            return
1618    if t:
1619        if t > 1:
1620            test1()
1621        test(args)
1622    else:
1623        if not args:
1624            print "Use -h for help"
1625        for url in args:
1626            print urlopen(url).read(),
1627
# Run test program when run as a script; main() handles the
# command-line options (-t, -h) and URL arguments.
if __name__ == '__main__':
    main()
1631